aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/adfs/adfs.h2
-rw-r--r--fs/adfs/inode.c5
-rw-r--r--fs/affs/affs.h3
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/afs/write.c21
-rw-r--r--fs/autofs4/autofs_i.h7
-rw-r--r--fs/autofs4/dev-ioctl.c11
-rw-r--r--fs/autofs4/expire.c6
-rw-r--r--fs/autofs4/inode.c63
-rw-r--r--fs/autofs4/root.c474
-rw-r--r--fs/bfs/inode.c5
-rw-r--r--fs/bio.c9
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/file.c6
-rw-r--r--fs/btrfs/inode.c4
-rw-r--r--fs/cachefiles/namei.c12
-rw-r--r--fs/cifs/CHANGES5
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h3
-rw-r--r--fs/cifs/cifspdu.h6
-rw-r--r--fs/cifs/cifsproto.h7
-rw-r--r--fs/cifs/cifssmb.c360
-rw-r--r--fs/cifs/connect.c38
-rw-r--r--fs/cifs/file.c4
-rw-r--r--fs/cifs/inode.c15
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/readdir.c8
-rw-r--r--fs/cifs/sess.c11
-rw-r--r--fs/cifs/xattr.c8
-rw-r--r--fs/compat_ioctl.c9
-rw-r--r--fs/dcache.c70
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/dlm/ast.c74
-rw-r--r--fs/dlm/ast.h4
-rw-r--r--fs/dlm/debug_fs.c2
-rw-r--r--fs/dlm/dlm_internal.h10
-rw-r--r--fs/dlm/lock.c120
-rw-r--r--fs/dlm/lockspace.c14
-rw-r--r--fs/dlm/user.c10
-rw-r--r--fs/dlm/user.h4
-rw-r--r--fs/exec.c20
-rw-r--r--fs/exofs/common.h39
-rw-r--r--fs/exofs/exofs.h55
-rw-r--r--fs/exofs/inode.c198
-rw-r--r--fs/exofs/ios.c575
-rw-r--r--fs/exofs/super.c121
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/inode.c11
-rw-r--r--fs/ext3/inode.c4
-rw-r--r--fs/ext4/balloc.c35
-rw-r--r--fs/ext4/block_validity.c4
-rw-r--r--fs/ext4/dir.c14
-rw-r--r--fs/ext4/ext4.h110
-rw-r--r--fs/ext4/ext4_jbd2.c4
-rw-r--r--fs/ext4/ext4_jbd2.h24
-rw-r--r--fs/ext4/extents.c260
-rw-r--r--fs/ext4/file.c10
-rw-r--r--fs/ext4/fsync.c2
-rw-r--r--fs/ext4/ialloc.c32
-rw-r--r--fs/ext4/inode.c465
-rw-r--r--fs/ext4/ioctl.c12
-rw-r--r--fs/ext4/mballoc.c73
-rw-r--r--fs/ext4/mballoc.h9
-rw-r--r--fs/ext4/migrate.c35
-rw-r--r--fs/ext4/move_extent.c36
-rw-r--r--fs/ext4/namei.c63
-rw-r--r--fs/ext4/resize.c102
-rw-r--r--fs/ext4/super.c342
-rw-r--r--fs/ext4/xattr.c56
-rw-r--r--fs/fat/inode.c9
-rw-r--r--fs/file.c2
-rw-r--r--fs/fs-writeback.c22
-rw-r--r--fs/fuse/dev.c30
-rw-r--r--fs/gfs2/aops.c4
-rw-r--r--fs/gfs2/bmap.c2
-rw-r--r--fs/gfs2/glock.c75
-rw-r--r--fs/gfs2/glock.h7
-rw-r--r--fs/gfs2/glops.c16
-rw-r--r--fs/gfs2/incore.h5
-rw-r--r--fs/gfs2/inode.c6
-rw-r--r--fs/gfs2/lock_dlm.c5
-rw-r--r--fs/gfs2/lops.c4
-rw-r--r--fs/gfs2/main.c28
-rw-r--r--fs/gfs2/meta_io.c46
-rw-r--r--fs/gfs2/meta_io.h12
-rw-r--r--fs/gfs2/ops_fstype.c6
-rw-r--r--fs/gfs2/ops_inode.c113
-rw-r--r--fs/gfs2/super.c32
-rw-r--r--fs/gfs2/sys.c2
-rw-r--r--fs/gfs2/util.c1
-rw-r--r--fs/gfs2/util.h1
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/super.c3
-rw-r--r--fs/hpfs/anode.c2
-rw-r--r--fs/hpfs/dentry.c14
-rw-r--r--fs/hpfs/dir.c14
-rw-r--r--fs/hpfs/dnode.c21
-rw-r--r--fs/hpfs/ea.c7
-rw-r--r--fs/hpfs/hpfs_fn.h30
-rw-r--r--fs/hpfs/inode.c4
-rw-r--r--fs/hpfs/map.c6
-rw-r--r--fs/hpfs/name.c21
-rw-r--r--fs/hpfs/namei.c75
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/internal.h2
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c13
-rw-r--r--fs/jbd2/journal.c132
-rw-r--r--fs/jbd2/transaction.c43
-rw-r--r--fs/jfs/inode.c5
-rw-r--r--fs/jfs/jfs_inode.h2
-rw-r--r--fs/libfs.c77
-rw-r--r--fs/locks.c5
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/namei.c575
-rw-r--r--fs/namespace.c53
-rw-r--r--fs/nfs/Kconfig3
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/direct.c3
-rw-r--r--fs/nfs/fscache.c9
-rw-r--r--fs/nfs/inode.c95
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/iostat.h4
-rw-r--r--fs/nfs/mount_clnt.c2
-rw-r--r--fs/nfs/nfs2xdr.c2
-rw-r--r--fs/nfs/nfs4proc.c8
-rw-r--r--fs/nfs/nfs4xdr.c6
-rw-r--r--fs/nfs/symlink.c2
-rw-r--r--fs/nfs/write.c250
-rw-r--r--fs/nfsctl.c5
-rw-r--r--fs/nfsd/nfs4xdr.c12
-rw-r--r--fs/nfsd/vfs.c7
-rw-r--r--fs/nilfs2/dat.c3
-rw-r--r--fs/nilfs2/dir.c14
-rw-r--r--fs/nilfs2/ioctl.c66
-rw-r--r--fs/nilfs2/namei.c13
-rw-r--r--fs/nilfs2/nilfs.h4
-rw-r--r--fs/nilfs2/recovery.c41
-rw-r--r--fs/nilfs2/segbuf.c18
-rw-r--r--fs/nilfs2/segbuf.h5
-rw-r--r--fs/nilfs2/segment.c120
-rw-r--r--fs/nilfs2/segment.h2
-rw-r--r--fs/nilfs2/super.c15
-rw-r--r--fs/nilfs2/the_nilfs.c38
-rw-r--r--fs/nilfs2/the_nilfs.h3
-rw-r--r--fs/notify/inotify/inotify_user.c59
-rw-r--r--fs/ntfs/dir.c2
-rw-r--r--fs/ntfs/file.c2
-rw-r--r--fs/ntfs/inode.c2
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ntfs/super.c8
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/alloc.c5
-rw-r--r--fs/ocfs2/aops.c5
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h7
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/Makefile3
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/dlmfs/Makefile5
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c (renamed from fs/ocfs2/dlm/dlmfs.c)127
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.c (renamed from fs/ocfs2/dlm/dlmfsver.c)0
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.h (renamed from fs/ocfs2/dlm/dlmfsver.h)0
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c (renamed from fs/ocfs2/dlm/userdlm.c)308
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h (renamed from fs/ocfs2/dlm/userdlm.h)16
-rw-r--r--fs/ocfs2/dlmglue.c284
-rw-r--r--fs/ocfs2/file.c13
-rw-r--r--fs/ocfs2/ioctl.h6
-rw-r--r--fs/ocfs2/localalloc.c2
-rw-r--r--fs/ocfs2/ocfs2.h32
-rw-r--r--fs/ocfs2/ocfs2_fs.h57
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h79
-rw-r--r--fs/ocfs2/ocfs2_lockingver.h2
-rw-r--r--fs/ocfs2/refcounttree.c6
-rw-r--r--fs/ocfs2/stack_o2cb.c37
-rw-r--r--fs/ocfs2/stack_user.c49
-rw-r--r--fs/ocfs2/stackglue.c98
-rw-r--r--fs/ocfs2/stackglue.h95
-rw-r--r--fs/ocfs2/suballoc.c171
-rw-r--r--fs/ocfs2/suballoc.h1
-rw-r--r--fs/ocfs2/super.c10
-rw-r--r--fs/ocfs2/xattr.c2182
-rw-r--r--fs/omfs/inode.c10
-rw-r--r--fs/open.c2
-rw-r--r--fs/partitions/check.c7
-rw-r--r--fs/pnode.c28
-rw-r--r--fs/pnode.h5
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c40
-rw-r--r--fs/proc/generic.c5
-rw-r--r--fs/proc/kmsg.c14
-rw-r--r--fs/proc/proc_devtree.c7
-rw-r--r--fs/proc/root.c6
-rw-r--r--fs/reiserfs/inode.c6
-rw-r--r--fs/seq_file.c130
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/block.c76
-rw-r--r--fs/squashfs/cache.c1
-rw-r--r--fs/squashfs/decompressor.c68
-rw-r--r--fs/squashfs/decompressor.h55
-rw-r--r--fs/squashfs/dir.c1
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/squashfs/file.c1
-rw-r--r--fs/squashfs/fragment.c1
-rw-r--r--fs/squashfs/id.c1
-rw-r--r--fs/squashfs/inode.c1
-rw-r--r--fs/squashfs/namei.c1
-rw-r--r--fs/squashfs/squashfs.h8
-rw-r--r--fs/squashfs/squashfs_fs.h6
-rw-r--r--fs/squashfs/squashfs_fs_sb.h40
-rw-r--r--fs/squashfs/super.c49
-rw-r--r--fs/squashfs/symlink.c1
-rw-r--r--fs/squashfs/zlib_wrapper.c150
-rw-r--r--fs/super.c21
-rw-r--r--fs/sysfs/inode.c35
-rw-r--r--fs/sysv/inode.c10
-rw-r--r--fs/sysv/sysv.h2
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c8
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/balloc.c2
-rw-r--r--fs/udf/dir.c4
-rw-r--r--fs/udf/inode.c6
-rw-r--r--fs/udf/namei.c20
-rw-r--r--fs/udf/symlink.c10
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/ufs/dir.c10
-rw-r--r--fs/ufs/inode.c5
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/xfs/Makefile2
-rw-r--r--fs/xfs/linux-2.6/kmem.c56
-rw-r--r--fs/xfs/linux-2.6/kmem.h21
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c320
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h52
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c21
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h12
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c62
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c175
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c186
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h81
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c27
-rw-r--r--fs/xfs/quota/xfs_dquot.c47
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c99
-rw-r--r--fs/xfs/quota/xfs_dquot_item.h4
-rw-r--r--fs/xfs/quota/xfs_qm.c40
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c4
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c49
-rw-r--r--fs/xfs/xfs_acl.h4
-rw-r--r--fs/xfs/xfs_ag.h16
-rw-r--r--fs/xfs/xfs_alloc.c96
-rw-r--r--fs/xfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/xfs_attr.c52
-rw-r--r--fs/xfs/xfs_attr.h3
-rw-r--r--fs/xfs/xfs_attr_leaf.c30
-rw-r--r--fs/xfs/xfs_attr_sf.h2
-rw-r--r--fs/xfs/xfs_bmap.c17
-rw-r--r--fs/xfs/xfs_bmap_btree.c2
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c4
-rw-r--r--fs/xfs/xfs_buf_item.c72
-rw-r--r--fs/xfs/xfs_da_btree.c4
-rw-r--r--fs/xfs/xfs_da_btree.h5
-rw-r--r--fs/xfs/xfs_dfrag.c43
-rw-r--r--fs/xfs/xfs_dfrag.h3
-rw-r--r--fs/xfs/xfs_dir2.c8
-rw-r--r--fs/xfs/xfs_dir2.h4
-rw-r--r--fs/xfs/xfs_dir2_block.c9
-rw-r--r--fs/xfs/xfs_dir2_leaf.c2
-rw-r--r--fs/xfs/xfs_dir2_node.c2
-rw-r--r--fs/xfs/xfs_dir2_node.h2
-rw-r--r--fs/xfs/xfs_dir2_sf.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_filestream.c42
-rw-r--r--fs/xfs/xfs_filestream.h28
-rw-r--r--fs/xfs/xfs_fsops.c42
-rw-r--r--fs/xfs/xfs_ialloc.c62
-rw-r--r--fs/xfs/xfs_iget.c10
-rw-r--r--fs/xfs/xfs_inode.c126
-rw-r--r--fs/xfs/xfs_inode.h11
-rw-r--r--fs/xfs/xfs_inode_item.c129
-rw-r--r--fs/xfs/xfs_inode_item.h6
-rw-r--r--fs/xfs/xfs_itable.c12
-rw-r--r--fs/xfs/xfs_log.c383
-rw-r--r--fs/xfs/xfs_log.h19
-rw-r--r--fs/xfs/xfs_log_priv.h5
-rw-r--r--fs/xfs/xfs_log_recover.c222
-rw-r--r--fs/xfs/xfs_log_recover.h23
-rw-r--r--fs/xfs/xfs_mount.c181
-rw-r--r--fs/xfs/xfs_mount.h29
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_mru_cache.h1
-rw-r--r--fs/xfs/xfs_quota.h9
-rw-r--r--fs/xfs/xfs_rw.c155
-rw-r--r--fs/xfs/xfs_rw.h4
-rw-r--r--fs/xfs/xfs_trans.c7
-rw-r--r--fs/xfs/xfs_trans.h3
-rw-r--r--fs/xfs/xfs_trans_ail.c34
-rw-r--r--fs/xfs/xfs_trans_buf.c27
-rw-r--r--fs/xfs/xfs_types.h4
-rw-r--r--fs/xfs/xfs_vnodeops.c33
-rw-r--r--fs/xfs/xfs_vnodeops.h10
311 files changed, 8030 insertions, 6259 deletions
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 9cc18775b832..2ff622f6f547 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -121,7 +121,7 @@ struct adfs_discmap {
121 121
122/* Inode stuff */ 122/* Inode stuff */
123struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); 123struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
124int adfs_write_inode(struct inode *inode,int unused); 124int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
125int adfs_notify_change(struct dentry *dentry, struct iattr *attr); 125int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
126 126
127/* map.c */ 127/* map.c */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3f57ce4bee5d..0f5e30978135 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/writeback.h>
12#include "adfs.h" 13#include "adfs.h"
13 14
14/* 15/*
@@ -360,7 +361,7 @@ out:
360 * The adfs-specific inode data has already been updated by 361 * The adfs-specific inode data has already been updated by
361 * adfs_notify_change() 362 * adfs_notify_change()
362 */ 363 */
363int adfs_write_inode(struct inode *inode, int wait) 364int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
364{ 365{
365 struct super_block *sb = inode->i_sb; 366 struct super_block *sb = inode->i_sb;
366 struct object_info obj; 367 struct object_info obj;
@@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait)
375 obj.attr = ADFS_I(inode)->attr; 376 obj.attr = ADFS_I(inode)->attr;
376 obj.size = inode->i_size; 377 obj.size = inode->i_size;
377 378
378 ret = adfs_dir_update(sb, &obj, wait); 379 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
379 unlock_kernel(); 380 unlock_kernel();
380 return ret; 381 return ret;
381} 382}
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 0e40caaba456..861dae68ac12 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode);
175extern void affs_clear_inode(struct inode *inode); 175extern void affs_clear_inode(struct inode *inode);
176extern struct inode *affs_iget(struct super_block *sb, 176extern struct inode *affs_iget(struct super_block *sb,
177 unsigned long ino); 177 unsigned long ino);
178extern int affs_write_inode(struct inode *inode, int); 178extern int affs_write_inode(struct inode *inode,
179 struct writeback_control *wbc);
179extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); 180extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
180 181
181/* file.c */ 182/* file.c */
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3c4ec7d864c4..c9744d771d98 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -166,7 +166,7 @@ bad_inode:
166} 166}
167 167
168int 168int
169affs_write_inode(struct inode *inode, int unused) 169affs_write_inode(struct inode *inode, struct writeback_control *wbc)
170{ 170{
171 struct super_block *sb = inode->i_sb; 171 struct super_block *sb = inode->i_sb;
172 struct buffer_head *bh; 172 struct buffer_head *bh;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ece2a13bf71..c54dad4e6063 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -733,7 +733,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
733 struct page *page, void *fsdata); 733 struct page *page, void *fsdata);
734extern int afs_writepage(struct page *, struct writeback_control *); 734extern int afs_writepage(struct page *, struct writeback_control *);
735extern int afs_writepages(struct address_space *, struct writeback_control *); 735extern int afs_writepages(struct address_space *, struct writeback_control *);
736extern int afs_write_inode(struct inode *, int);
737extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); 736extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
738extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 737extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
739 unsigned long, loff_t); 738 unsigned long, loff_t);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e1ea1c240b6a..14f6431598ad 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = {
48static const struct super_operations afs_super_ops = { 48static const struct super_operations afs_super_ops = {
49 .statfs = afs_statfs, 49 .statfs = afs_statfs,
50 .alloc_inode = afs_alloc_inode, 50 .alloc_inode = afs_alloc_inode,
51 .write_inode = afs_write_inode,
52 .destroy_inode = afs_destroy_inode, 51 .destroy_inode = afs_destroy_inode,
53 .clear_inode = afs_clear_inode, 52 .clear_inode = afs_clear_inode,
54 .put_super = afs_put_super, 53 .put_super = afs_put_super,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5e15a21dbf9f..3bed54a294d4 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -585,27 +585,6 @@ int afs_writepages(struct address_space *mapping,
585} 585}
586 586
587/* 587/*
588 * write an inode back
589 */
590int afs_write_inode(struct inode *inode, int sync)
591{
592 struct afs_vnode *vnode = AFS_FS_I(inode);
593 int ret;
594
595 _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
596
597 ret = 0;
598 if (sync) {
599 ret = filemap_fdatawait(inode->i_mapping);
600 if (ret < 0)
601 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
602 }
603
604 _leave(" = %d", ret);
605 return ret;
606}
607
608/*
609 * completion of write to server 588 * completion of write to server
610 */ 589 */
611void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) 590void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0118d67221b2..3d283abf67d7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -60,11 +60,6 @@ do { \
60 current->pid, __func__, ##args); \ 60 current->pid, __func__, ##args); \
61} while (0) 61} while (0)
62 62
63struct rehash_entry {
64 struct task_struct *task;
65 struct list_head list;
66};
67
68/* Unified info structure. This is pointed to by both the dentry and 63/* Unified info structure. This is pointed to by both the dentry and
69 inode structures. Each file in the filesystem has an instance of this 64 inode structures. Each file in the filesystem has an instance of this
70 structure. It holds a reference to the dentry, so dentries are never 65 structure. It holds a reference to the dentry, so dentries are never
@@ -81,7 +76,6 @@ struct autofs_info {
81 76
82 struct list_head active; 77 struct list_head active;
83 int active_count; 78 int active_count;
84 struct list_head rehash_list;
85 79
86 struct list_head expiring; 80 struct list_head expiring;
87 81
@@ -104,7 +98,6 @@ struct autofs_info {
104#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 98#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
105#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ 99#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */
106#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ 100#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
107#define AUTOFS_INF_REHASH (1<<3) /* dentry in transit to ->lookup() */
108 101
109struct autofs_wait_queue { 102struct autofs_wait_queue {
110 wait_queue_head_t queue; 103 wait_queue_head_t queue;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 00bf8fcb245f..c8a80dffb455 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -544,10 +544,9 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
544 goto out; 544 goto out;
545 devid = new_encode_dev(path.mnt->mnt_sb->s_dev); 545 devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
546 err = 0; 546 err = 0;
547 if (path.dentry->d_inode && 547 if (path.mnt->mnt_root == path.dentry) {
548 path.mnt->mnt_root == path.dentry) {
549 err = 1; 548 err = 1;
550 magic = path.dentry->d_inode->i_sb->s_magic; 549 magic = path.mnt->mnt_sb->s_magic;
551 } 550 }
552 } else { 551 } else {
553 dev_t dev = sbi->sb->s_dev; 552 dev_t dev = sbi->sb->s_dev;
@@ -560,10 +559,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
560 559
561 err = have_submounts(path.dentry); 560 err = have_submounts(path.dentry);
562 561
563 if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) { 562 if (follow_down(&path))
564 if (follow_down(&path)) 563 magic = path.mnt->mnt_sb->s_magic;
565 magic = path.mnt->mnt_sb->s_magic;
566 }
567 } 564 }
568 565
569 param->ismountpoint.out.devid = devid; 566 param->ismountpoint.out.devid = devid;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 74bc9aa6df31..a796c9417fb1 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -279,7 +279,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
279 root->d_mounted--; 279 root->d_mounted--;
280 } 280 }
281 ino->flags |= AUTOFS_INF_EXPIRING; 281 ino->flags |= AUTOFS_INF_EXPIRING;
282 autofs4_add_expiring(root);
283 init_completion(&ino->expire_complete); 282 init_completion(&ino->expire_complete);
284 spin_unlock(&sbi->fs_lock); 283 spin_unlock(&sbi->fs_lock);
285 return root; 284 return root;
@@ -407,7 +406,6 @@ found:
407 expired, (int)expired->d_name.len, expired->d_name.name); 406 expired, (int)expired->d_name.len, expired->d_name.name);
408 ino = autofs4_dentry_ino(expired); 407 ino = autofs4_dentry_ino(expired);
409 ino->flags |= AUTOFS_INF_EXPIRING; 408 ino->flags |= AUTOFS_INF_EXPIRING;
410 autofs4_add_expiring(expired);
411 init_completion(&ino->expire_complete); 409 init_completion(&ino->expire_complete);
412 spin_unlock(&sbi->fs_lock); 410 spin_unlock(&sbi->fs_lock);
413 spin_lock(&dcache_lock); 411 spin_lock(&dcache_lock);
@@ -435,7 +433,7 @@ int autofs4_expire_wait(struct dentry *dentry)
435 433
436 DPRINTK("expire done status=%d", status); 434 DPRINTK("expire done status=%d", status);
437 435
438 if (d_unhashed(dentry) && IS_DEADDIR(dentry->d_inode)) 436 if (d_unhashed(dentry))
439 return -EAGAIN; 437 return -EAGAIN;
440 438
441 return status; 439 return status;
@@ -475,7 +473,6 @@ int autofs4_expire_run(struct super_block *sb,
475 spin_lock(&sbi->fs_lock); 473 spin_lock(&sbi->fs_lock);
476 ino = autofs4_dentry_ino(dentry); 474 ino = autofs4_dentry_ino(dentry);
477 ino->flags &= ~AUTOFS_INF_EXPIRING; 475 ino->flags &= ~AUTOFS_INF_EXPIRING;
478 autofs4_del_expiring(dentry);
479 complete_all(&ino->expire_complete); 476 complete_all(&ino->expire_complete);
480 spin_unlock(&sbi->fs_lock); 477 spin_unlock(&sbi->fs_lock);
481 478
@@ -506,7 +503,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
506 ino->flags &= ~AUTOFS_INF_MOUNTPOINT; 503 ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
507 } 504 }
508 ino->flags &= ~AUTOFS_INF_EXPIRING; 505 ino->flags &= ~AUTOFS_INF_EXPIRING;
509 autofs4_del_expiring(dentry);
510 complete_all(&ino->expire_complete); 506 complete_all(&ino->expire_complete);
511 spin_unlock(&sbi->fs_lock); 507 spin_unlock(&sbi->fs_lock);
512 dput(dentry); 508 dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index d0a3de247458..821b2b955dac 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -49,7 +49,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
49 ino->dentry = NULL; 49 ino->dentry = NULL;
50 ino->size = 0; 50 ino->size = 0;
51 INIT_LIST_HEAD(&ino->active); 51 INIT_LIST_HEAD(&ino->active);
52 INIT_LIST_HEAD(&ino->rehash_list);
53 ino->active_count = 0; 52 ino->active_count = 0;
54 INIT_LIST_HEAD(&ino->expiring); 53 INIT_LIST_HEAD(&ino->expiring);
55 atomic_set(&ino->count, 0); 54 atomic_set(&ino->count, 0);
@@ -97,63 +96,6 @@ void autofs4_free_ino(struct autofs_info *ino)
97 kfree(ino); 96 kfree(ino);
98} 97}
99 98
100/*
101 * Deal with the infamous "Busy inodes after umount ..." message.
102 *
103 * Clean up the dentry tree. This happens with autofs if the user
104 * space program goes away due to a SIGKILL, SIGSEGV etc.
105 */
106static void autofs4_force_release(struct autofs_sb_info *sbi)
107{
108 struct dentry *this_parent = sbi->sb->s_root;
109 struct list_head *next;
110
111 if (!sbi->sb->s_root)
112 return;
113
114 spin_lock(&dcache_lock);
115repeat:
116 next = this_parent->d_subdirs.next;
117resume:
118 while (next != &this_parent->d_subdirs) {
119 struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
120
121 /* Negative dentry - don`t care */
122 if (!simple_positive(dentry)) {
123 next = next->next;
124 continue;
125 }
126
127 if (!list_empty(&dentry->d_subdirs)) {
128 this_parent = dentry;
129 goto repeat;
130 }
131
132 next = next->next;
133 spin_unlock(&dcache_lock);
134
135 DPRINTK("dentry %p %.*s",
136 dentry, (int)dentry->d_name.len, dentry->d_name.name);
137
138 dput(dentry);
139 spin_lock(&dcache_lock);
140 }
141
142 if (this_parent != sbi->sb->s_root) {
143 struct dentry *dentry = this_parent;
144
145 next = this_parent->d_u.d_child.next;
146 this_parent = this_parent->d_parent;
147 spin_unlock(&dcache_lock);
148 DPRINTK("parent dentry %p %.*s",
149 dentry, (int)dentry->d_name.len, dentry->d_name.name);
150 dput(dentry);
151 spin_lock(&dcache_lock);
152 goto resume;
153 }
154 spin_unlock(&dcache_lock);
155}
156
157void autofs4_kill_sb(struct super_block *sb) 99void autofs4_kill_sb(struct super_block *sb)
158{ 100{
159 struct autofs_sb_info *sbi = autofs4_sbi(sb); 101 struct autofs_sb_info *sbi = autofs4_sbi(sb);
@@ -170,15 +112,12 @@ void autofs4_kill_sb(struct super_block *sb)
170 /* Free wait queues, close pipe */ 112 /* Free wait queues, close pipe */
171 autofs4_catatonic_mode(sbi); 113 autofs4_catatonic_mode(sbi);
172 114
173 /* Clean up and release dangling references */
174 autofs4_force_release(sbi);
175
176 sb->s_fs_info = NULL; 115 sb->s_fs_info = NULL;
177 kfree(sbi); 116 kfree(sbi);
178 117
179out_kill_sb: 118out_kill_sb:
180 DPRINTK("shutting down"); 119 DPRINTK("shutting down");
181 kill_anon_super(sb); 120 kill_litter_super(sb);
182} 121}
183 122
184static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) 123static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 30cc9ddf4b70..a015b49891df 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -104,99 +104,6 @@ static void autofs4_del_active(struct dentry *dentry)
104 return; 104 return;
105} 105}
106 106
107static void autofs4_add_rehash_entry(struct autofs_info *ino,
108 struct rehash_entry *entry)
109{
110 entry->task = current;
111 INIT_LIST_HEAD(&entry->list);
112 list_add(&entry->list, &ino->rehash_list);
113 return;
114}
115
116static void autofs4_remove_rehash_entry(struct autofs_info *ino)
117{
118 struct list_head *head = &ino->rehash_list;
119 struct rehash_entry *entry;
120 list_for_each_entry(entry, head, list) {
121 if (entry->task == current) {
122 list_del(&entry->list);
123 kfree(entry);
124 break;
125 }
126 }
127 return;
128}
129
130static void autofs4_remove_rehash_entrys(struct autofs_info *ino)
131{
132 struct autofs_sb_info *sbi = ino->sbi;
133 struct rehash_entry *entry, *next;
134 struct list_head *head;
135
136 spin_lock(&sbi->fs_lock);
137 spin_lock(&sbi->lookup_lock);
138 if (!(ino->flags & AUTOFS_INF_REHASH)) {
139 spin_unlock(&sbi->lookup_lock);
140 spin_unlock(&sbi->fs_lock);
141 return;
142 }
143 ino->flags &= ~AUTOFS_INF_REHASH;
144 head = &ino->rehash_list;
145 list_for_each_entry_safe(entry, next, head, list) {
146 list_del(&entry->list);
147 kfree(entry);
148 }
149 spin_unlock(&sbi->lookup_lock);
150 spin_unlock(&sbi->fs_lock);
151 dput(ino->dentry);
152
153 return;
154}
155
156static void autofs4_revalidate_drop(struct dentry *dentry,
157 struct rehash_entry *entry)
158{
159 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
160 struct autofs_info *ino = autofs4_dentry_ino(dentry);
161 /*
162 * Add to the active list so we can pick this up in
163 * ->lookup(). Also add an entry to a rehash list so
164 * we know when there are no dentrys in flight so we
165 * know when we can rehash the dentry.
166 */
167 spin_lock(&sbi->lookup_lock);
168 if (list_empty(&ino->active))
169 list_add(&ino->active, &sbi->active_list);
170 autofs4_add_rehash_entry(ino, entry);
171 spin_unlock(&sbi->lookup_lock);
172 if (!(ino->flags & AUTOFS_INF_REHASH)) {
173 ino->flags |= AUTOFS_INF_REHASH;
174 dget(dentry);
175 spin_lock(&dentry->d_lock);
176 __d_drop(dentry);
177 spin_unlock(&dentry->d_lock);
178 }
179 return;
180}
181
182static void autofs4_revalidate_rehash(struct dentry *dentry)
183{
184 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
185 struct autofs_info *ino = autofs4_dentry_ino(dentry);
186 if (ino->flags & AUTOFS_INF_REHASH) {
187 spin_lock(&sbi->lookup_lock);
188 autofs4_remove_rehash_entry(ino);
189 if (list_empty(&ino->rehash_list)) {
190 spin_unlock(&sbi->lookup_lock);
191 ino->flags &= ~AUTOFS_INF_REHASH;
192 d_rehash(dentry);
193 dput(ino->dentry);
194 } else
195 spin_unlock(&sbi->lookup_lock);
196 }
197 return;
198}
199
200static unsigned int autofs4_need_mount(unsigned int flags) 107static unsigned int autofs4_need_mount(unsigned int flags)
201{ 108{
202 unsigned int res = 0; 109 unsigned int res = 0;
@@ -236,7 +143,7 @@ out:
236 return dcache_dir_open(inode, file); 143 return dcache_dir_open(inode, file);
237} 144}
238 145
239static int try_to_fill_dentry(struct dentry *dentry) 146static int try_to_fill_dentry(struct dentry *dentry, int flags)
240{ 147{
241 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 148 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
242 struct autofs_info *ino = autofs4_dentry_ino(dentry); 149 struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -249,17 +156,55 @@ static int try_to_fill_dentry(struct dentry *dentry)
249 * Wait for a pending mount, triggering one if there 156 * Wait for a pending mount, triggering one if there
250 * isn't one already 157 * isn't one already
251 */ 158 */
252 DPRINTK("waiting for mount name=%.*s", 159 if (dentry->d_inode == NULL) {
253 dentry->d_name.len, dentry->d_name.name); 160 DPRINTK("waiting for mount name=%.*s",
161 dentry->d_name.len, dentry->d_name.name);
254 162
255 status = autofs4_wait(sbi, dentry, NFY_MOUNT); 163 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
256 164
257 DPRINTK("mount done status=%d", status); 165 DPRINTK("mount done status=%d", status);
258 166
259 /* Update expiry counter */ 167 /* Turn this into a real negative dentry? */
260 ino->last_used = jiffies; 168 if (status == -ENOENT) {
169 spin_lock(&sbi->fs_lock);
170 ino->flags &= ~AUTOFS_INF_PENDING;
171 spin_unlock(&sbi->fs_lock);
172 return status;
173 } else if (status) {
174 /* Return a negative dentry, but leave it "pending" */
175 return status;
176 }
177 /* Trigger mount for path component or follow link */
178 } else if (ino->flags & AUTOFS_INF_PENDING ||
179 autofs4_need_mount(flags) ||
180 current->link_count) {
181 DPRINTK("waiting for mount name=%.*s",
182 dentry->d_name.len, dentry->d_name.name);
261 183
262 return status; 184 spin_lock(&sbi->fs_lock);
185 ino->flags |= AUTOFS_INF_PENDING;
186 spin_unlock(&sbi->fs_lock);
187 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
188
189 DPRINTK("mount done status=%d", status);
190
191 if (status) {
192 spin_lock(&sbi->fs_lock);
193 ino->flags &= ~AUTOFS_INF_PENDING;
194 spin_unlock(&sbi->fs_lock);
195 return status;
196 }
197 }
198
199 /* Initialize expiry counter after successful mount */
200 if (ino)
201 ino->last_used = jiffies;
202
203 spin_lock(&sbi->fs_lock);
204 ino->flags &= ~AUTOFS_INF_PENDING;
205 spin_unlock(&sbi->fs_lock);
206
207 return 0;
263} 208}
264 209
265/* For autofs direct mounts the follow link triggers the mount */ 210/* For autofs direct mounts the follow link triggers the mount */
@@ -313,16 +258,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
313 */ 258 */
314 if (ino->flags & AUTOFS_INF_PENDING || 259 if (ino->flags & AUTOFS_INF_PENDING ||
315 (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) { 260 (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
316 ino->flags |= AUTOFS_INF_PENDING;
317 spin_unlock(&dcache_lock); 261 spin_unlock(&dcache_lock);
318 spin_unlock(&sbi->fs_lock); 262 spin_unlock(&sbi->fs_lock);
319 263
320 status = try_to_fill_dentry(dentry); 264 status = try_to_fill_dentry(dentry, 0);
321
322 spin_lock(&sbi->fs_lock);
323 ino->flags &= ~AUTOFS_INF_PENDING;
324 spin_unlock(&sbi->fs_lock);
325
326 if (status) 265 if (status)
327 goto out_error; 266 goto out_error;
328 267
@@ -361,47 +300,18 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
361{ 300{
362 struct inode *dir = dentry->d_parent->d_inode; 301 struct inode *dir = dentry->d_parent->d_inode;
363 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 302 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
364 struct autofs_info *ino = autofs4_dentry_ino(dentry); 303 int oz_mode = autofs4_oz_mode(sbi);
365 struct rehash_entry *entry;
366 int flags = nd ? nd->flags : 0; 304 int flags = nd ? nd->flags : 0;
367 unsigned int mutex_aquired; 305 int status = 1;
368 306
369 DPRINTK("name = %.*s oz_mode = %d",
370 dentry->d_name.len, dentry->d_name.name, oz_mode);
371
372 /* Daemon never causes a mount to trigger */
373 if (autofs4_oz_mode(sbi))
374 return 1;
375
376 entry = kmalloc(sizeof(struct rehash_entry), GFP_KERNEL);
377 if (!entry)
378 return -ENOMEM;
379
380 mutex_aquired = mutex_trylock(&dir->i_mutex);
381
382 spin_lock(&sbi->fs_lock);
383 spin_lock(&dcache_lock);
384 /* Pending dentry */ 307 /* Pending dentry */
308 spin_lock(&sbi->fs_lock);
385 if (autofs4_ispending(dentry)) { 309 if (autofs4_ispending(dentry)) {
386 int status; 310 /* The daemon never causes a mount to trigger */
387
388 /*
389 * We can only unhash and send this to ->lookup() if
390 * the directory mutex is held over d_revalidate() and
391 * ->lookup(). This prevents the VFS from incorrectly
392 * seeing the dentry as non-existent.
393 */
394 ino->flags |= AUTOFS_INF_PENDING;
395 if (!mutex_aquired) {
396 autofs4_revalidate_drop(dentry, entry);
397 spin_unlock(&dcache_lock);
398 spin_unlock(&sbi->fs_lock);
399 return 0;
400 }
401 spin_unlock(&dcache_lock);
402 spin_unlock(&sbi->fs_lock); 311 spin_unlock(&sbi->fs_lock);
403 mutex_unlock(&dir->i_mutex); 312
404 kfree(entry); 313 if (oz_mode)
314 return 1;
405 315
406 /* 316 /*
407 * If the directory has gone away due to an expire 317 * If the directory has gone away due to an expire
@@ -415,82 +325,45 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
415 * A zero status is success otherwise we have a 325 * A zero status is success otherwise we have a
416 * negative error code. 326 * negative error code.
417 */ 327 */
418 status = try_to_fill_dentry(dentry); 328 status = try_to_fill_dentry(dentry, flags);
419
420 spin_lock(&sbi->fs_lock);
421 ino->flags &= ~AUTOFS_INF_PENDING;
422 spin_unlock(&sbi->fs_lock);
423
424 if (status == 0) 329 if (status == 0)
425 return 1; 330 return 1;
426 331
427 return status; 332 return status;
428 } 333 }
334 spin_unlock(&sbi->fs_lock);
335
336 /* Negative dentry.. invalidate if "old" */
337 if (dentry->d_inode == NULL)
338 return 0;
429 339
430 /* Check for a non-mountpoint directory with no contents */ 340 /* Check for a non-mountpoint directory with no contents */
341 spin_lock(&dcache_lock);
431 if (S_ISDIR(dentry->d_inode->i_mode) && 342 if (S_ISDIR(dentry->d_inode->i_mode) &&
432 !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 343 !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
433 DPRINTK("dentry=%p %.*s, emptydir", 344 DPRINTK("dentry=%p %.*s, emptydir",
434 dentry, dentry->d_name.len, dentry->d_name.name); 345 dentry, dentry->d_name.len, dentry->d_name.name);
346 spin_unlock(&dcache_lock);
435 347
436 if (autofs4_need_mount(flags) || current->link_count) { 348 /* The daemon never causes a mount to trigger */
437 int status; 349 if (oz_mode)
438 350 return 1;
439 /*
440 * We can only unhash and send this to ->lookup() if
441 * the directory mutex is held over d_revalidate() and
442 * ->lookup(). This prevents the VFS from incorrectly
443 * seeing the dentry as non-existent.
444 */
445 ino->flags |= AUTOFS_INF_PENDING;
446 if (!mutex_aquired) {
447 autofs4_revalidate_drop(dentry, entry);
448 spin_unlock(&dcache_lock);
449 spin_unlock(&sbi->fs_lock);
450 return 0;
451 }
452 spin_unlock(&dcache_lock);
453 spin_unlock(&sbi->fs_lock);
454 mutex_unlock(&dir->i_mutex);
455 kfree(entry);
456
457 /*
458 * A zero status is success otherwise we have a
459 * negative error code.
460 */
461 status = try_to_fill_dentry(dentry);
462
463 spin_lock(&sbi->fs_lock);
464 ino->flags &= ~AUTOFS_INF_PENDING;
465 spin_unlock(&sbi->fs_lock);
466 351
467 if (status == 0) 352 /*
468 return 1; 353 * A zero status is success otherwise we have a
354 * negative error code.
355 */
356 status = try_to_fill_dentry(dentry, flags);
357 if (status == 0)
358 return 1;
469 359
470 return status; 360 return status;
471 }
472 } 361 }
473 spin_unlock(&dcache_lock); 362 spin_unlock(&dcache_lock);
474 spin_unlock(&sbi->fs_lock);
475
476 if (mutex_aquired)
477 mutex_unlock(&dir->i_mutex);
478
479 kfree(entry);
480 363
481 return 1; 364 return 1;
482} 365}
483 366
484static void autofs4_free_rehash_entrys(struct autofs_info *inf)
485{
486 struct list_head *head = &inf->rehash_list;
487 struct rehash_entry *entry, *next;
488 list_for_each_entry_safe(entry, next, head, list) {
489 list_del(&entry->list);
490 kfree(entry);
491 }
492}
493
494void autofs4_dentry_release(struct dentry *de) 367void autofs4_dentry_release(struct dentry *de)
495{ 368{
496 struct autofs_info *inf; 369 struct autofs_info *inf;
@@ -509,8 +382,6 @@ void autofs4_dentry_release(struct dentry *de)
509 list_del(&inf->active); 382 list_del(&inf->active);
510 if (!list_empty(&inf->expiring)) 383 if (!list_empty(&inf->expiring))
511 list_del(&inf->expiring); 384 list_del(&inf->expiring);
512 if (!list_empty(&inf->rehash_list))
513 autofs4_free_rehash_entrys(inf);
514 spin_unlock(&sbi->lookup_lock); 385 spin_unlock(&sbi->lookup_lock);
515 } 386 }
516 387
@@ -543,7 +414,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
543 const unsigned char *str = name->name; 414 const unsigned char *str = name->name;
544 struct list_head *p, *head; 415 struct list_head *p, *head;
545 416
546restart:
547 spin_lock(&dcache_lock); 417 spin_lock(&dcache_lock);
548 spin_lock(&sbi->lookup_lock); 418 spin_lock(&sbi->lookup_lock);
549 head = &sbi->active_list; 419 head = &sbi->active_list;
@@ -561,19 +431,6 @@ restart:
561 if (atomic_read(&active->d_count) == 0) 431 if (atomic_read(&active->d_count) == 0)
562 goto next; 432 goto next;
563 433
564 if (active->d_inode && IS_DEADDIR(active->d_inode)) {
565 if (!list_empty(&ino->rehash_list)) {
566 dget(active);
567 spin_unlock(&active->d_lock);
568 spin_unlock(&sbi->lookup_lock);
569 spin_unlock(&dcache_lock);
570 autofs4_remove_rehash_entrys(ino);
571 dput(active);
572 goto restart;
573 }
574 goto next;
575 }
576
577 qstr = &active->d_name; 434 qstr = &active->d_name;
578 435
579 if (active->d_name.hash != hash) 436 if (active->d_name.hash != hash)
@@ -586,11 +443,13 @@ restart:
586 if (memcmp(qstr->name, str, len)) 443 if (memcmp(qstr->name, str, len))
587 goto next; 444 goto next;
588 445
589 dget(active); 446 if (d_unhashed(active)) {
590 spin_unlock(&active->d_lock); 447 dget(active);
591 spin_unlock(&sbi->lookup_lock); 448 spin_unlock(&active->d_lock);
592 spin_unlock(&dcache_lock); 449 spin_unlock(&sbi->lookup_lock);
593 return active; 450 spin_unlock(&dcache_lock);
451 return active;
452 }
594next: 453next:
595 spin_unlock(&active->d_lock); 454 spin_unlock(&active->d_lock);
596 } 455 }
@@ -639,11 +498,13 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
639 if (memcmp(qstr->name, str, len)) 498 if (memcmp(qstr->name, str, len))
640 goto next; 499 goto next;
641 500
642 dget(expiring); 501 if (d_unhashed(expiring)) {
643 spin_unlock(&expiring->d_lock); 502 dget(expiring);
644 spin_unlock(&sbi->lookup_lock); 503 spin_unlock(&expiring->d_lock);
645 spin_unlock(&dcache_lock); 504 spin_unlock(&sbi->lookup_lock);
646 return expiring; 505 spin_unlock(&dcache_lock);
506 return expiring;
507 }
647next: 508next:
648 spin_unlock(&expiring->d_lock); 509 spin_unlock(&expiring->d_lock);
649 } 510 }
@@ -653,48 +514,6 @@ next:
653 return NULL; 514 return NULL;
654} 515}
655 516
656static struct autofs_info *init_new_dentry(struct autofs_sb_info *sbi,
657 struct dentry *dentry, int oz_mode)
658{
659 struct autofs_info *ino;
660
661 /*
662 * Mark the dentry incomplete but don't hash it. We do this
663 * to serialize our inode creation operations (symlink and
664 * mkdir) which prevents deadlock during the callback to
665 * the daemon. Subsequent user space lookups for the same
666 * dentry are placed on the wait queue while the daemon
667 * itself is allowed passage unresticted so the create
668 * operation itself can then hash the dentry. Finally,
669 * we check for the hashed dentry and return the newly
670 * hashed dentry.
671 */
672 dentry->d_op = &autofs4_root_dentry_operations;
673
674 /*
675 * And we need to ensure that the same dentry is used for
676 * all following lookup calls until it is hashed so that
677 * the dentry flags are persistent throughout the request.
678 */
679 ino = autofs4_init_ino(NULL, sbi, 0555);
680 if (!ino)
681 return ERR_PTR(-ENOMEM);
682
683 dentry->d_fsdata = ino;
684 ino->dentry = dentry;
685
686 /*
687 * Only set the mount pending flag for new dentrys not created
688 * by the daemon.
689 */
690 if (!oz_mode)
691 ino->flags |= AUTOFS_INF_PENDING;
692
693 d_instantiate(dentry, NULL);
694
695 return ino;
696}
697
698/* Lookups in the root directory */ 517/* Lookups in the root directory */
699static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 518static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
700{ 519{
@@ -702,7 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
702 struct autofs_info *ino; 521 struct autofs_info *ino;
703 struct dentry *expiring, *active; 522 struct dentry *expiring, *active;
704 int oz_mode; 523 int oz_mode;
705 int status = 0;
706 524
707 DPRINTK("name = %.*s", 525 DPRINTK("name = %.*s",
708 dentry->d_name.len, dentry->d_name.name); 526 dentry->d_name.len, dentry->d_name.name);
@@ -717,26 +535,44 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
717 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 535 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
718 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 536 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
719 537
720 spin_lock(&sbi->fs_lock);
721 active = autofs4_lookup_active(dentry); 538 active = autofs4_lookup_active(dentry);
722 if (active) { 539 if (active) {
723 dentry = active; 540 dentry = active;
724 ino = autofs4_dentry_ino(dentry); 541 ino = autofs4_dentry_ino(dentry);
725 /* If this came from revalidate, rehash it */
726 autofs4_revalidate_rehash(dentry);
727 spin_unlock(&sbi->fs_lock);
728 } else { 542 } else {
729 spin_unlock(&sbi->fs_lock); 543 /*
730 ino = init_new_dentry(sbi, dentry, oz_mode); 544 * Mark the dentry incomplete but don't hash it. We do this
731 if (IS_ERR(ino)) 545 * to serialize our inode creation operations (symlink and
732 return (struct dentry *) ino; 546 * mkdir) which prevents deadlock during the callback to
733 } 547 * the daemon. Subsequent user space lookups for the same
548 * dentry are placed on the wait queue while the daemon
549 * itself is allowed passage unresticted so the create
550 * operation itself can then hash the dentry. Finally,
551 * we check for the hashed dentry and return the newly
552 * hashed dentry.
553 */
554 dentry->d_op = &autofs4_root_dentry_operations;
555
556 /*
557 * And we need to ensure that the same dentry is used for
558 * all following lookup calls until it is hashed so that
559 * the dentry flags are persistent throughout the request.
560 */
561 ino = autofs4_init_ino(NULL, sbi, 0555);
562 if (!ino)
563 return ERR_PTR(-ENOMEM);
734 564
735 autofs4_add_active(dentry); 565 dentry->d_fsdata = ino;
566 ino->dentry = dentry;
567
568 autofs4_add_active(dentry);
569
570 d_instantiate(dentry, NULL);
571 }
736 572
737 if (!oz_mode) { 573 if (!oz_mode) {
738 expiring = autofs4_lookup_expiring(dentry);
739 mutex_unlock(&dir->i_mutex); 574 mutex_unlock(&dir->i_mutex);
575 expiring = autofs4_lookup_expiring(dentry);
740 if (expiring) { 576 if (expiring) {
741 /* 577 /*
742 * If we are racing with expire the request might not 578 * If we are racing with expire the request might not
@@ -744,22 +580,23 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
744 * so it must have been successful, so just wait for it. 580 * so it must have been successful, so just wait for it.
745 */ 581 */
746 autofs4_expire_wait(expiring); 582 autofs4_expire_wait(expiring);
583 autofs4_del_expiring(expiring);
747 dput(expiring); 584 dput(expiring);
748 } 585 }
749 status = try_to_fill_dentry(dentry); 586
750 mutex_lock(&dir->i_mutex);
751 spin_lock(&sbi->fs_lock); 587 spin_lock(&sbi->fs_lock);
752 ino->flags &= ~AUTOFS_INF_PENDING; 588 ino->flags |= AUTOFS_INF_PENDING;
753 spin_unlock(&sbi->fs_lock); 589 spin_unlock(&sbi->fs_lock);
590 if (dentry->d_op && dentry->d_op->d_revalidate)
591 (dentry->d_op->d_revalidate)(dentry, nd);
592 mutex_lock(&dir->i_mutex);
754 } 593 }
755 594
756 autofs4_del_active(dentry);
757
758 /* 595 /*
759 * If we had a mount fail, check if we had to handle 596 * If we are still pending, check if we had to handle
760 * a signal. If so we can force a restart.. 597 * a signal. If so we can force a restart..
761 */ 598 */
762 if (status) { 599 if (ino->flags & AUTOFS_INF_PENDING) {
763 /* See if we were interrupted */ 600 /* See if we were interrupted */
764 if (signal_pending(current)) { 601 if (signal_pending(current)) {
765 sigset_t *sigset = &current->pending.signal; 602 sigset_t *sigset = &current->pending.signal;
@@ -771,46 +608,43 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
771 return ERR_PTR(-ERESTARTNOINTR); 608 return ERR_PTR(-ERESTARTNOINTR);
772 } 609 }
773 } 610 }
774 } 611 if (!oz_mode) {
775 612 spin_lock(&sbi->fs_lock);
776 /* 613 ino->flags &= ~AUTOFS_INF_PENDING;
777 * User space can (and has done in the past) remove and re-create 614 spin_unlock(&sbi->fs_lock);
778 * this directory during the callback. This can leave us with an
779 * unhashed dentry, but a successful mount! So we need to
780 * perform another cached lookup in case the dentry now exists.
781 */
782 if (!oz_mode && !have_submounts(dentry)) {
783 struct dentry *new;
784 new = d_lookup(dentry->d_parent, &dentry->d_name);
785 if (new) {
786 if (active)
787 dput(active);
788 return new;
789 } else {
790 if (!status)
791 status = -ENOENT;
792 } 615 }
793 } 616 }
794 617
795 /* 618 /*
796 * If we had a mount failure, return status to user space. 619 * If this dentry is unhashed, then we shouldn't honour this
797 * If the mount succeeded and we used a dentry from the active queue 620 * lookup. Returning ENOENT here doesn't do the right thing
798 * return it. 621 * for all system calls, but it should be OK for the operations
622 * we permit from an autofs.
799 */ 623 */
800 if (status) { 624 if (!oz_mode && d_unhashed(dentry)) {
801 dentry = ERR_PTR(status);
802 if (active)
803 dput(active);
804 return dentry;
805 } else {
806 /* 625 /*
807 * Valid successful mount, return active dentry or NULL 626 * A user space application can (and has done in the past)
808 * for a new dentry. 627 * remove and re-create this directory during the callback.
628 * This can leave us with an unhashed dentry, but a
629 * successful mount! So we need to perform another
630 * cached lookup in case the dentry now exists.
809 */ 631 */
632 struct dentry *parent = dentry->d_parent;
633 struct dentry *new = d_lookup(parent, &dentry->d_name);
634 if (new != NULL)
635 dentry = new;
636 else
637 dentry = ERR_PTR(-ENOENT);
638
810 if (active) 639 if (active)
811 return active; 640 dput(active);
641
642 return dentry;
812 } 643 }
813 644
645 if (active)
646 return active;
647
814 return NULL; 648 return NULL;
815} 649}
816 650
@@ -834,6 +668,8 @@ static int autofs4_dir_symlink(struct inode *dir,
834 if (!ino) 668 if (!ino)
835 return -ENOMEM; 669 return -ENOMEM;
836 670
671 autofs4_del_active(dentry);
672
837 ino->size = strlen(symname); 673 ino->size = strlen(symname);
838 cp = kmalloc(ino->size + 1, GFP_KERNEL); 674 cp = kmalloc(ino->size + 1, GFP_KERNEL);
839 if (!cp) { 675 if (!cp) {
@@ -910,6 +746,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
910 dir->i_mtime = CURRENT_TIME; 746 dir->i_mtime = CURRENT_TIME;
911 747
912 spin_lock(&dcache_lock); 748 spin_lock(&dcache_lock);
749 autofs4_add_expiring(dentry);
913 spin_lock(&dentry->d_lock); 750 spin_lock(&dentry->d_lock);
914 __d_drop(dentry); 751 __d_drop(dentry);
915 spin_unlock(&dentry->d_lock); 752 spin_unlock(&dentry->d_lock);
@@ -935,6 +772,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
935 spin_unlock(&dcache_lock); 772 spin_unlock(&dcache_lock);
936 return -ENOTEMPTY; 773 return -ENOTEMPTY;
937 } 774 }
775 autofs4_add_expiring(dentry);
938 spin_lock(&dentry->d_lock); 776 spin_lock(&dentry->d_lock);
939 __d_drop(dentry); 777 __d_drop(dentry);
940 spin_unlock(&dentry->d_lock); 778 spin_unlock(&dentry->d_lock);
@@ -972,6 +810,8 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
972 if (!ino) 810 if (!ino)
973 return -ENOMEM; 811 return -ENOMEM;
974 812
813 autofs4_del_active(dentry);
814
975 inode = autofs4_get_inode(dir->i_sb, ino); 815 inode = autofs4_get_inode(dir->i_sb, ino);
976 if (!inode) { 816 if (!inode) {
977 if (!dentry->d_fsdata) 817 if (!dentry->d_fsdata)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8f3d9fd89604..f22a7d3dc362 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
15#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/vfs.h> 17#include <linux/vfs.h>
18#include <linux/writeback.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include "bfs.h" 20#include "bfs.h"
20 21
@@ -98,7 +99,7 @@ error:
98 return ERR_PTR(-EIO); 99 return ERR_PTR(-EIO);
99} 100}
100 101
101static int bfs_write_inode(struct inode *inode, int wait) 102static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
102{ 103{
103 struct bfs_sb_info *info = BFS_SB(inode->i_sb); 104 struct bfs_sb_info *info = BFS_SB(inode->i_sb);
104 unsigned int ino = (u16)inode->i_ino; 105 unsigned int ino = (u16)inode->i_ino;
@@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait)
147 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); 148 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
148 149
149 mark_buffer_dirty(bh); 150 mark_buffer_dirty(bh);
150 if (wait) { 151 if (wbc->sync_mode == WB_SYNC_ALL) {
151 sync_dirty_buffer(bh); 152 sync_dirty_buffer(bh);
152 if (buffer_req(bh) && !buffer_uptodate(bh)) 153 if (buffer_req(bh) && !buffer_uptodate(bh))
153 err = -EIO; 154 err = -EIO;
diff --git a/fs/bio.c b/fs/bio.c
index 88094afc29ea..dc17afd672e3 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -507,10 +507,8 @@ int bio_get_nr_vecs(struct block_device *bdev)
507 int nr_pages; 507 int nr_pages;
508 508
509 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; 509 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
510 if (nr_pages > queue_max_phys_segments(q)) 510 if (nr_pages > queue_max_segments(q))
511 nr_pages = queue_max_phys_segments(q); 511 nr_pages = queue_max_segments(q);
512 if (nr_pages > queue_max_hw_segments(q))
513 nr_pages = queue_max_hw_segments(q);
514 512
515 return nr_pages; 513 return nr_pages;
516} 514}
@@ -575,8 +573,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
575 * make this too complex. 573 * make this too complex.
576 */ 574 */
577 575
578 while (bio->bi_phys_segments >= queue_max_phys_segments(q) 576 while (bio->bi_phys_segments >= queue_max_segments(q)) {
579 || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
580 577
581 if (retried_segments) 578 if (retried_segments)
582 return 0; 579 return 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2aa8ec6a0981..8b5cfdd4bfc1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2326,7 +2326,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2326int btrfs_readpage(struct file *file, struct page *page); 2326int btrfs_readpage(struct file *file, struct page *page);
2327void btrfs_delete_inode(struct inode *inode); 2327void btrfs_delete_inode(struct inode *inode);
2328void btrfs_put_inode(struct inode *inode); 2328void btrfs_put_inode(struct inode *inode);
2329int btrfs_write_inode(struct inode *inode, int wait); 2329int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2330void btrfs_dirty_inode(struct inode *inode); 2330void btrfs_dirty_inode(struct inode *inode);
2331struct inode *btrfs_alloc_inode(struct super_block *sb); 2331struct inode *btrfs_alloc_inode(struct super_block *sb);
2332void btrfs_destroy_inode(struct inode *inode); 2332void btrfs_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9d0809629967..6ed434ac037f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -720,13 +720,15 @@ again:
720 inode->i_ino, orig_offset); 720 inode->i_ino, orig_offset);
721 BUG_ON(ret); 721 BUG_ON(ret);
722 } 722 }
723 fi = btrfs_item_ptr(leaf, path->slots[0],
724 struct btrfs_file_extent_item);
725 if (del_nr == 0) { 723 if (del_nr == 0) {
724 fi = btrfs_item_ptr(leaf, path->slots[0],
725 struct btrfs_file_extent_item);
726 btrfs_set_file_extent_type(leaf, fi, 726 btrfs_set_file_extent_type(leaf, fi,
727 BTRFS_FILE_EXTENT_REG); 727 BTRFS_FILE_EXTENT_REG);
728 btrfs_mark_buffer_dirty(leaf); 728 btrfs_mark_buffer_dirty(leaf);
729 } else { 729 } else {
730 fi = btrfs_item_ptr(leaf, del_slot - 1,
731 struct btrfs_file_extent_item);
730 btrfs_set_file_extent_type(leaf, fi, 732 btrfs_set_file_extent_type(leaf, fi,
731 BTRFS_FILE_EXTENT_REG); 733 BTRFS_FILE_EXTENT_REG);
732 btrfs_set_file_extent_num_bytes(leaf, fi, 734 btrfs_set_file_extent_num_bytes(leaf, fi,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4deb280f8969..c41db6d45ab6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3968,7 +3968,7 @@ err:
3968 return ret; 3968 return ret;
3969} 3969}
3970 3970
3971int btrfs_write_inode(struct inode *inode, int wait) 3971int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3972{ 3972{
3973 struct btrfs_root *root = BTRFS_I(inode)->root; 3973 struct btrfs_root *root = BTRFS_I(inode)->root;
3974 struct btrfs_trans_handle *trans; 3974 struct btrfs_trans_handle *trans;
@@ -3977,7 +3977,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
3977 if (root->fs_info->btree_inode == inode) 3977 if (root->fs_info->btree_inode == inode)
3978 return 0; 3978 return 0;
3979 3979
3980 if (wait) { 3980 if (wbc->sync_mode == WB_SYNC_ALL) {
3981 trans = btrfs_join_transaction(root, 1); 3981 trans = btrfs_join_transaction(root, 1);
3982 btrfs_set_trans_block_group(trans, inode); 3982 btrfs_set_trans_block_group(trans, inode);
3983 ret = btrfs_commit_transaction(trans, root); 3983 ret = btrfs_commit_transaction(trans, root);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 14ac4806e291..eeb4986ea7db 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -348,7 +348,17 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
348 dir = dget_parent(object->dentry); 348 dir = dget_parent(object->dentry);
349 349
350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
351 ret = cachefiles_bury_object(cache, dir, object->dentry); 351
352 /* we need to check that our parent is _still_ our parent - it may have
353 * been renamed */
354 if (dir == object->dentry->d_parent) {
355 ret = cachefiles_bury_object(cache, dir, object->dentry);
356 } else {
357 /* it got moved, presumably by cachefilesd culling it, so it's
358 * no longer in the key path and we can ignore it */
359 mutex_unlock(&dir->d_inode->i_mutex);
360 ret = 0;
361 }
352 362
353 dput(dir); 363 dput(dir);
354 _leave(" = %d", ret); 364 _leave(" = %d", ret);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 7b2600b380d7..bc0025cdd1c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,8 @@
1Version 1.62
2------------
3Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
4to more strictly handle corrupt frames.
5
1Version 1.61 6Version 1.61
2------------ 7------------
3Fix append problem to Samba servers (files opened with O_APPEND could 8Fix append problem to Samba servers (files opened with O_APPEND could
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index ac2b24c192f8..78c1b86d55f6 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
113extern const struct export_operations cifs_export_ops; 113extern const struct export_operations cifs_export_ops;
114#endif /* EXPERIMENTAL */ 114#endif /* EXPERIMENTAL */
115 115
116#define CIFS_VERSION "1.61" 116#define CIFS_VERSION "1.62"
117#endif /* _CIFSFS_H */ 117#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4b35f7ec0583..a1c817eb291a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -149,6 +149,7 @@ struct TCP_Server_Info {
149 bool svlocal:1; /* local server or remote */ 149 bool svlocal:1; /* local server or remote */
150 bool noblocksnd; /* use blocking sendmsg */ 150 bool noblocksnd; /* use blocking sendmsg */
151 bool noautotune; /* do not autotune send buf sizes */ 151 bool noautotune; /* do not autotune send buf sizes */
152 bool tcp_nodelay;
152 atomic_t inFlight; /* number of requests on the wire to server */ 153 atomic_t inFlight; /* number of requests on the wire to server */
153#ifdef CONFIG_CIFS_STATS2 154#ifdef CONFIG_CIFS_STATS2
154 atomic_t inSend; /* requests trying to send */ 155 atomic_t inSend; /* requests trying to send */
@@ -204,7 +205,7 @@ struct cifsUidInfo {
204struct cifsSesInfo { 205struct cifsSesInfo {
205 struct list_head smb_ses_list; 206 struct list_head smb_ses_list;
206 struct list_head tcon_list; 207 struct list_head tcon_list;
207 struct semaphore sesSem; 208 struct mutex session_mutex;
208#if 0 209#if 0
209 struct cifsUidInfo *uidInfo; /* pointer to user info */ 210 struct cifsUidInfo *uidInfo; /* pointer to user info */
210#endif 211#endif
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 3877737f96a6..14d036d8db11 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -415,10 +415,10 @@ struct smb_hdr {
415 __u8 WordCount; 415 __u8 WordCount;
416} __attribute__((packed)); 416} __attribute__((packed));
417/* given a pointer to an smb_hdr retrieve the value of byte count */ 417/* given a pointer to an smb_hdr retrieve the value of byte count */
418#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) 418#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
419#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) 419#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
421#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2) 421#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
422 422
423/* 423/*
424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5646727e33f5..88e2bc44ac58 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -363,13 +363,10 @@ extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
363 __u32 filter, struct file *file, int multishot, 363 __u32 filter, struct file *file, int multishot,
364 const struct nls_table *nls_codepage); 364 const struct nls_table *nls_codepage);
365extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, 365extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
366 const unsigned char *searchName, char *EAData, 366 const unsigned char *searchName,
367 const unsigned char *ea_name, char *EAData,
367 size_t bufsize, const struct nls_table *nls_codepage, 368 size_t bufsize, const struct nls_table *nls_codepage,
368 int remap_special_chars); 369 int remap_special_chars);
369extern ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
370 const unsigned char *searchName, const unsigned char *ea_name,
371 unsigned char *ea_value, size_t buf_size,
372 const struct nls_table *nls_codepage, int remap_special_chars);
373extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, 370extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
374 const char *fileName, const char *ea_name, 371 const char *fileName, const char *ea_name,
375 const void *ea_value, const __u16 ea_value_len, 372 const void *ea_value, const __u16 ea_value_len,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 941441d3e386..9d17df3e0768 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -170,19 +170,19 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
170 * need to prevent multiple threads trying to simultaneously 170 * need to prevent multiple threads trying to simultaneously
171 * reconnect the same SMB session 171 * reconnect the same SMB session
172 */ 172 */
173 down(&ses->sesSem); 173 mutex_lock(&ses->session_mutex);
174 if (ses->need_reconnect) 174 if (ses->need_reconnect)
175 rc = cifs_setup_session(0, ses, nls_codepage); 175 rc = cifs_setup_session(0, ses, nls_codepage);
176 176
177 /* do we need to reconnect tcon? */ 177 /* do we need to reconnect tcon? */
178 if (rc || !tcon->need_reconnect) { 178 if (rc || !tcon->need_reconnect) {
179 up(&ses->sesSem); 179 mutex_unlock(&ses->session_mutex);
180 goto out; 180 goto out;
181 } 181 }
182 182
183 mark_open_files_invalid(tcon); 183 mark_open_files_invalid(tcon);
184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); 184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
185 up(&ses->sesSem); 185 mutex_unlock(&ses->session_mutex);
186 cFYI(1, ("reconnect tcon rc = %d", rc)); 186 cFYI(1, ("reconnect tcon rc = %d", rc));
187 187
188 if (rc) 188 if (rc)
@@ -700,13 +700,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
700 if (!ses || !ses->server) 700 if (!ses || !ses->server)
701 return -EIO; 701 return -EIO;
702 702
703 down(&ses->sesSem); 703 mutex_lock(&ses->session_mutex);
704 if (ses->need_reconnect) 704 if (ses->need_reconnect)
705 goto session_already_dead; /* no need to send SMBlogoff if uid 705 goto session_already_dead; /* no need to send SMBlogoff if uid
706 already closed due to reconnect */ 706 already closed due to reconnect */
707 rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); 707 rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
708 if (rc) { 708 if (rc) {
709 up(&ses->sesSem); 709 mutex_unlock(&ses->session_mutex);
710 return rc; 710 return rc;
711 } 711 }
712 712
@@ -721,7 +721,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
721 pSMB->AndXCommand = 0xFF; 721 pSMB->AndXCommand = 0xFF;
722 rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); 722 rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
723session_already_dead: 723session_already_dead:
724 up(&ses->sesSem); 724 mutex_unlock(&ses->session_mutex);
725 725
726 /* if session dead then we do not need to do ulogoff, 726 /* if session dead then we do not need to do ulogoff,
727 since server closed smb session, no sense reporting 727 since server closed smb session, no sense reporting
@@ -5269,22 +5269,34 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5269 cifs_buf_release(pSMB); 5269 cifs_buf_release(pSMB);
5270 return rc; 5270 return rc;
5271} 5271}
5272
5272#ifdef CONFIG_CIFS_XATTR 5273#ifdef CONFIG_CIFS_XATTR
5274/*
5275 * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
5276 * function used by listxattr and getxattr type calls. When ea_name is set,
5277 * it looks for that attribute name and stuffs that value into the EAData
5278 * buffer. When ea_name is NULL, it stuffs a list of attribute names into the
5279 * buffer. In both cases, the return value is either the length of the
5280 * resulting data or a negative error code. If EAData is a NULL pointer then
5281 * the data isn't copied to it, but the length is returned.
5282 */
5273ssize_t 5283ssize_t
5274CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, 5284CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
5275 const unsigned char *searchName, 5285 const unsigned char *searchName, const unsigned char *ea_name,
5276 char *EAData, size_t buf_size, 5286 char *EAData, size_t buf_size,
5277 const struct nls_table *nls_codepage, int remap) 5287 const struct nls_table *nls_codepage, int remap)
5278{ 5288{
5279 /* BB assumes one setup word */ 5289 /* BB assumes one setup word */
5280 TRANSACTION2_QPI_REQ *pSMB = NULL; 5290 TRANSACTION2_QPI_REQ *pSMB = NULL;
5281 TRANSACTION2_QPI_RSP *pSMBr = NULL; 5291 TRANSACTION2_QPI_RSP *pSMBr = NULL;
5282 int rc = 0; 5292 int rc = 0;
5283 int bytes_returned; 5293 int bytes_returned;
5284 int name_len; 5294 int list_len;
5295 struct fealist *ea_response_data;
5285 struct fea *temp_fea; 5296 struct fea *temp_fea;
5286 char *temp_ptr; 5297 char *temp_ptr;
5287 __u16 params, byte_count; 5298 char *end_of_smb;
5299 __u16 params, byte_count, data_offset;
5288 5300
5289 cFYI(1, ("In Query All EAs path %s", searchName)); 5301 cFYI(1, ("In Query All EAs path %s", searchName));
5290QAllEAsRetry: 5302QAllEAsRetry:
@@ -5294,22 +5306,22 @@ QAllEAsRetry:
5294 return rc; 5306 return rc;
5295 5307
5296 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5308 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5297 name_len = 5309 list_len =
5298 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 5310 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
5299 PATH_MAX, nls_codepage, remap); 5311 PATH_MAX, nls_codepage, remap);
5300 name_len++; /* trailing null */ 5312 list_len++; /* trailing null */
5301 name_len *= 2; 5313 list_len *= 2;
5302 } else { /* BB improve the check for buffer overruns BB */ 5314 } else { /* BB improve the check for buffer overruns BB */
5303 name_len = strnlen(searchName, PATH_MAX); 5315 list_len = strnlen(searchName, PATH_MAX);
5304 name_len++; /* trailing null */ 5316 list_len++; /* trailing null */
5305 strncpy(pSMB->FileName, searchName, name_len); 5317 strncpy(pSMB->FileName, searchName, list_len);
5306 } 5318 }
5307 5319
5308 params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; 5320 params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
5309 pSMB->TotalDataCount = 0; 5321 pSMB->TotalDataCount = 0;
5310 pSMB->MaxParameterCount = cpu_to_le16(2); 5322 pSMB->MaxParameterCount = cpu_to_le16(2);
5311 /* BB find exact max SMB PDU from sess structure BB */ 5323 /* BB find exact max SMB PDU from sess structure BB */
5312 pSMB->MaxDataCount = cpu_to_le16(4000); 5324 pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
5313 pSMB->MaxSetupCount = 0; 5325 pSMB->MaxSetupCount = 0;
5314 pSMB->Reserved = 0; 5326 pSMB->Reserved = 0;
5315 pSMB->Flags = 0; 5327 pSMB->Flags = 0;
@@ -5334,237 +5346,117 @@ QAllEAsRetry:
5334 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5346 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5335 if (rc) { 5347 if (rc) {
5336 cFYI(1, ("Send error in QueryAllEAs = %d", rc)); 5348 cFYI(1, ("Send error in QueryAllEAs = %d", rc));
5337 } else { /* decode response */ 5349 goto QAllEAsOut;
5338 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5350 }
5339 5351
5340 /* BB also check enough total bytes returned */ 5352
5341 /* BB we need to improve the validity checking 5353 /* BB also check enough total bytes returned */
5342 of these trans2 responses */ 5354 /* BB we need to improve the validity checking
5343 if (rc || (pSMBr->ByteCount < 4)) 5355 of these trans2 responses */
5344 rc = -EIO; /* bad smb */ 5356
5345 /* else if (pFindData){ 5357 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
5346 memcpy((char *) pFindData, 5358 if (rc || (pSMBr->ByteCount < 4)) {
5347 (char *) &pSMBr->hdr.Protocol + 5359 rc = -EIO; /* bad smb */
5348 data_offset, kl); 5360 goto QAllEAsOut;
5349 }*/ else {
5350 /* check that length of list is not more than bcc */
5351 /* check that each entry does not go beyond length
5352 of list */
5353 /* check that each element of each entry does not
5354 go beyond end of list */
5355 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5356 struct fealist *ea_response_data;
5357 rc = 0;
5358 /* validate_trans2_offsets() */
5359 /* BB check if start of smb + data_offset > &bcc+ bcc */
5360 ea_response_data = (struct fealist *)
5361 (((char *) &pSMBr->hdr.Protocol) +
5362 data_offset);
5363 name_len = le32_to_cpu(ea_response_data->list_len);
5364 cFYI(1, ("ea length %d", name_len));
5365 if (name_len <= 8) {
5366 /* returned EA size zeroed at top of function */
5367 cFYI(1, ("empty EA list returned from server"));
5368 } else {
5369 /* account for ea list len */
5370 name_len -= 4;
5371 temp_fea = ea_response_data->list;
5372 temp_ptr = (char *)temp_fea;
5373 while (name_len > 0) {
5374 __u16 value_len;
5375 name_len -= 4;
5376 temp_ptr += 4;
5377 rc += temp_fea->name_len;
5378 /* account for prefix user. and trailing null */
5379 rc = rc + 5 + 1;
5380 if (rc < (int)buf_size) {
5381 memcpy(EAData, "user.", 5);
5382 EAData += 5;
5383 memcpy(EAData, temp_ptr,
5384 temp_fea->name_len);
5385 EAData += temp_fea->name_len;
5386 /* null terminate name */
5387 *EAData = 0;
5388 EAData = EAData + 1;
5389 } else if (buf_size == 0) {
5390 /* skip copy - calc size only */
5391 } else {
5392 /* stop before overrun buffer */
5393 rc = -ERANGE;
5394 break;
5395 }
5396 name_len -= temp_fea->name_len;
5397 temp_ptr += temp_fea->name_len;
5398 /* account for trailing null */
5399 name_len--;
5400 temp_ptr++;
5401 value_len =
5402 le16_to_cpu(temp_fea->value_len);
5403 name_len -= value_len;
5404 temp_ptr += value_len;
5405 /* BB check that temp_ptr is still
5406 within the SMB BB*/
5407
5408 /* no trailing null to account for
5409 in value len */
5410 /* go on to next EA */
5411 temp_fea = (struct fea *)temp_ptr;
5412 }
5413 }
5414 }
5415 } 5361 }
5416 cifs_buf_release(pSMB);
5417 if (rc == -EAGAIN)
5418 goto QAllEAsRetry;
5419 5362
5420 return (ssize_t)rc; 5363 /* check that length of list is not more than bcc */
5421} 5364 /* check that each entry does not go beyond length
5365 of list */
5366 /* check that each element of each entry does not
5367 go beyond end of list */
5368 /* validate_trans2_offsets() */
5369 /* BB check if start of smb + data_offset > &bcc+ bcc */
5422 5370
5423ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon, 5371 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5424 const unsigned char *searchName, const unsigned char *ea_name, 5372 ea_response_data = (struct fealist *)
5425 unsigned char *ea_value, size_t buf_size, 5373 (((char *) &pSMBr->hdr.Protocol) + data_offset);
5426 const struct nls_table *nls_codepage, int remap)
5427{
5428 TRANSACTION2_QPI_REQ *pSMB = NULL;
5429 TRANSACTION2_QPI_RSP *pSMBr = NULL;
5430 int rc = 0;
5431 int bytes_returned;
5432 int name_len;
5433 struct fea *temp_fea;
5434 char *temp_ptr;
5435 __u16 params, byte_count;
5436 5374
5437 cFYI(1, ("In Query EA path %s", searchName)); 5375 list_len = le32_to_cpu(ea_response_data->list_len);
5438QEARetry: 5376 cFYI(1, ("ea length %d", list_len));
5439 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5377 if (list_len <= 8) {
5440 (void **) &pSMBr); 5378 cFYI(1, ("empty EA list returned from server"));
5441 if (rc) 5379 goto QAllEAsOut;
5442 return rc; 5380 }
5443 5381
5444 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5382 /* make sure list_len doesn't go past end of SMB */
5445 name_len = 5383 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5446 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 5384 if ((char *)ea_response_data + list_len > end_of_smb) {
5447 PATH_MAX, nls_codepage, remap); 5385 cFYI(1, ("EA list appears to go beyond SMB"));
5448 name_len++; /* trailing null */ 5386 rc = -EIO;
5449 name_len *= 2; 5387 goto QAllEAsOut;
5450 } else { /* BB improve the check for buffer overruns BB */
5451 name_len = strnlen(searchName, PATH_MAX);
5452 name_len++; /* trailing null */
5453 strncpy(pSMB->FileName, searchName, name_len);
5454 } 5388 }
5455 5389
5456 params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; 5390 /* account for ea list len */
5457 pSMB->TotalDataCount = 0; 5391 list_len -= 4;
5458 pSMB->MaxParameterCount = cpu_to_le16(2); 5392 temp_fea = ea_response_data->list;
5459 /* BB find exact max SMB PDU from sess structure BB */ 5393 temp_ptr = (char *)temp_fea;
5460 pSMB->MaxDataCount = cpu_to_le16(4000); 5394 while (list_len > 0) {
5461 pSMB->MaxSetupCount = 0; 5395 unsigned int name_len;
5462 pSMB->Reserved = 0; 5396 __u16 value_len;
5463 pSMB->Flags = 0; 5397
5464 pSMB->Timeout = 0; 5398 list_len -= 4;
5465 pSMB->Reserved2 = 0; 5399 temp_ptr += 4;
5466 pSMB->ParameterOffset = cpu_to_le16(offsetof( 5400 /* make sure we can read name_len and value_len */
5467 struct smb_com_transaction2_qpi_req, InformationLevel) - 4); 5401 if (list_len < 0) {
5468 pSMB->DataCount = 0; 5402 cFYI(1, ("EA entry goes beyond length of list"));
5469 pSMB->DataOffset = 0; 5403 rc = -EIO;
5470 pSMB->SetupCount = 1; 5404 goto QAllEAsOut;
5471 pSMB->Reserved3 = 0; 5405 }
5472 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
5473 byte_count = params + 1 /* pad */ ;
5474 pSMB->TotalParameterCount = cpu_to_le16(params);
5475 pSMB->ParameterCount = pSMB->TotalParameterCount;
5476 pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
5477 pSMB->Reserved4 = 0;
5478 pSMB->hdr.smb_buf_length += byte_count;
5479 pSMB->ByteCount = cpu_to_le16(byte_count);
5480 5406
5481 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5407 name_len = temp_fea->name_len;
5482 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5408 value_len = le16_to_cpu(temp_fea->value_len);
5483 if (rc) { 5409 list_len -= name_len + 1 + value_len;
5484 cFYI(1, ("Send error in Query EA = %d", rc)); 5410 if (list_len < 0) {
5485 } else { /* decode response */ 5411 cFYI(1, ("EA entry goes beyond length of list"));
5486 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5412 rc = -EIO;
5413 goto QAllEAsOut;
5414 }
5487 5415
5488 /* BB also check enough total bytes returned */ 5416 if (ea_name) {
5489 /* BB we need to improve the validity checking 5417 if (strncmp(ea_name, temp_ptr, name_len) == 0) {
5490 of these trans2 responses */ 5418 temp_ptr += name_len + 1;
5491 if (rc || (pSMBr->ByteCount < 4)) 5419 rc = value_len;
5492 rc = -EIO; /* bad smb */ 5420 if (buf_size == 0)
5493 /* else if (pFindData){ 5421 goto QAllEAsOut;
5494 memcpy((char *) pFindData, 5422 if ((size_t)value_len > buf_size) {
5495 (char *) &pSMBr->hdr.Protocol + 5423 rc = -ERANGE;
5496 data_offset, kl); 5424 goto QAllEAsOut;
5497 }*/ else {
5498 /* check that length of list is not more than bcc */
5499 /* check that each entry does not go beyond length
5500 of list */
5501 /* check that each element of each entry does not
5502 go beyond end of list */
5503 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5504 struct fealist *ea_response_data;
5505 rc = -ENODATA;
5506 /* validate_trans2_offsets() */
5507 /* BB check if start of smb + data_offset > &bcc+ bcc*/
5508 ea_response_data = (struct fealist *)
5509 (((char *) &pSMBr->hdr.Protocol) +
5510 data_offset);
5511 name_len = le32_to_cpu(ea_response_data->list_len);
5512 cFYI(1, ("ea length %d", name_len));
5513 if (name_len <= 8) {
5514 /* returned EA size zeroed at top of function */
5515 cFYI(1, ("empty EA list returned from server"));
5516 } else {
5517 /* account for ea list len */
5518 name_len -= 4;
5519 temp_fea = ea_response_data->list;
5520 temp_ptr = (char *)temp_fea;
5521 /* loop through checking if we have a matching
5522 name and then return the associated value */
5523 while (name_len > 0) {
5524 __u16 value_len;
5525 name_len -= 4;
5526 temp_ptr += 4;
5527 value_len =
5528 le16_to_cpu(temp_fea->value_len);
5529 /* BB validate that value_len falls within SMB,
5530 even though maximum for name_len is 255 */
5531 if (memcmp(temp_fea->name, ea_name,
5532 temp_fea->name_len) == 0) {
5533 /* found a match */
5534 rc = value_len;
5535 /* account for prefix user. and trailing null */
5536 if (rc <= (int)buf_size) {
5537 memcpy(ea_value,
5538 temp_fea->name+temp_fea->name_len+1,
5539 rc);
5540 /* ea values, unlike ea
5541 names, are not null
5542 terminated */
5543 } else if (buf_size == 0) {
5544 /* skip copy - calc size only */
5545 } else {
5546 /* stop before overrun buffer */
5547 rc = -ERANGE;
5548 }
5549 break;
5550 }
5551 name_len -= temp_fea->name_len;
5552 temp_ptr += temp_fea->name_len;
5553 /* account for trailing null */
5554 name_len--;
5555 temp_ptr++;
5556 name_len -= value_len;
5557 temp_ptr += value_len;
5558 /* No trailing null to account for in
5559 value_len. Go on to next EA */
5560 temp_fea = (struct fea *)temp_ptr;
5561 } 5425 }
5426 memcpy(EAData, temp_ptr, value_len);
5427 goto QAllEAsOut;
5428 }
5429 } else {
5430 /* account for prefix user. and trailing null */
5431 rc += (5 + 1 + name_len);
5432 if (rc < (int) buf_size) {
5433 memcpy(EAData, "user.", 5);
5434 EAData += 5;
5435 memcpy(EAData, temp_ptr, name_len);
5436 EAData += name_len;
5437 /* null terminate name */
5438 *EAData = 0;
5439 ++EAData;
5440 } else if (buf_size == 0) {
5441 /* skip copy - calc size only */
5442 } else {
5443 /* stop before overrun buffer */
5444 rc = -ERANGE;
5445 break;
5562 } 5446 }
5563 } 5447 }
5448 temp_ptr += name_len + 1 + value_len;
5449 temp_fea = (struct fea *)temp_ptr;
5564 } 5450 }
5451
5452 /* didn't find the named attribute */
5453 if (ea_name)
5454 rc = -ENODATA;
5455
5456QAllEAsOut:
5565 cifs_buf_release(pSMB); 5457 cifs_buf_release(pSMB);
5566 if (rc == -EAGAIN) 5458 if (rc == -EAGAIN)
5567 goto QEARetry; 5459 goto QAllEAsRetry;
5568 5460
5569 return (ssize_t)rc; 5461 return (ssize_t)rc;
5570} 5462}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3bbcaa716b3c..45eb6cba793f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -98,7 +98,7 @@ struct smb_vol {
98 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ 98 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
99 unsigned int rsize; 99 unsigned int rsize;
100 unsigned int wsize; 100 unsigned int wsize;
101 unsigned int sockopt; 101 bool sockopt_tcp_nodelay:1;
102 unsigned short int port; 102 unsigned short int port;
103 char *prepath; 103 char *prepath;
104}; 104};
@@ -1142,9 +1142,11 @@ cifs_parse_mount_options(char *options, const char *devname,
1142 simple_strtoul(value, &value, 0); 1142 simple_strtoul(value, &value, 0);
1143 } 1143 }
1144 } else if (strnicmp(data, "sockopt", 5) == 0) { 1144 } else if (strnicmp(data, "sockopt", 5) == 0) {
1145 if (value && *value) { 1145 if (!value || !*value) {
1146 vol->sockopt = 1146 cERROR(1, ("no socket option specified"));
1147 simple_strtoul(value, &value, 0); 1147 continue;
1148 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
1149 vol->sockopt_tcp_nodelay = 1;
1148 } 1150 }
1149 } else if (strnicmp(data, "netbiosname", 4) == 0) { 1151 } else if (strnicmp(data, "netbiosname", 4) == 0) {
1150 if (!value || !*value || (*value == ' ')) { 1152 if (!value || !*value || (*value == ' ')) {
@@ -1514,6 +1516,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1514 1516
1515 tcp_ses->noblocksnd = volume_info->noblocksnd; 1517 tcp_ses->noblocksnd = volume_info->noblocksnd;
1516 tcp_ses->noautotune = volume_info->noautotune; 1518 tcp_ses->noautotune = volume_info->noautotune;
1519 tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
1517 atomic_set(&tcp_ses->inFlight, 0); 1520 atomic_set(&tcp_ses->inFlight, 0);
1518 init_waitqueue_head(&tcp_ses->response_q); 1521 init_waitqueue_head(&tcp_ses->response_q);
1519 init_waitqueue_head(&tcp_ses->request_q); 1522 init_waitqueue_head(&tcp_ses->request_q);
@@ -1764,6 +1767,7 @@ static int
1764ipv4_connect(struct TCP_Server_Info *server) 1767ipv4_connect(struct TCP_Server_Info *server)
1765{ 1768{
1766 int rc = 0; 1769 int rc = 0;
1770 int val;
1767 bool connected = false; 1771 bool connected = false;
1768 __be16 orig_port = 0; 1772 __be16 orig_port = 0;
1769 struct socket *socket = server->ssocket; 1773 struct socket *socket = server->ssocket;
@@ -1845,6 +1849,14 @@ ipv4_connect(struct TCP_Server_Info *server)
1845 socket->sk->sk_rcvbuf = 140 * 1024; 1849 socket->sk->sk_rcvbuf = 140 * 1024;
1846 } 1850 }
1847 1851
1852 if (server->tcp_nodelay) {
1853 val = 1;
1854 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
1855 (char *)&val, sizeof(val));
1856 if (rc)
1857 cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
1858 }
1859
1848 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", 1860 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
1849 socket->sk->sk_sndbuf, 1861 socket->sk->sk_sndbuf,
1850 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo)); 1862 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
@@ -1916,6 +1928,7 @@ static int
1916ipv6_connect(struct TCP_Server_Info *server) 1928ipv6_connect(struct TCP_Server_Info *server)
1917{ 1929{
1918 int rc = 0; 1930 int rc = 0;
1931 int val;
1919 bool connected = false; 1932 bool connected = false;
1920 __be16 orig_port = 0; 1933 __be16 orig_port = 0;
1921 struct socket *socket = server->ssocket; 1934 struct socket *socket = server->ssocket;
@@ -1987,6 +2000,15 @@ ipv6_connect(struct TCP_Server_Info *server)
1987 */ 2000 */
1988 socket->sk->sk_rcvtimeo = 7 * HZ; 2001 socket->sk->sk_rcvtimeo = 7 * HZ;
1989 socket->sk->sk_sndtimeo = 5 * HZ; 2002 socket->sk->sk_sndtimeo = 5 * HZ;
2003
2004 if (server->tcp_nodelay) {
2005 val = 1;
2006 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2007 (char *)&val, sizeof(val));
2008 if (rc)
2009 cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
2010 }
2011
1990 server->ssocket = socket; 2012 server->ssocket = socket;
1991 2013
1992 return rc; 2014 return rc;
@@ -2366,13 +2388,13 @@ try_mount_again:
2366 */ 2388 */
2367 cifs_put_tcp_session(srvTcp); 2389 cifs_put_tcp_session(srvTcp);
2368 2390
2369 down(&pSesInfo->sesSem); 2391 mutex_lock(&pSesInfo->session_mutex);
2370 if (pSesInfo->need_reconnect) { 2392 if (pSesInfo->need_reconnect) {
2371 cFYI(1, ("Session needs reconnect")); 2393 cFYI(1, ("Session needs reconnect"));
2372 rc = cifs_setup_session(xid, pSesInfo, 2394 rc = cifs_setup_session(xid, pSesInfo,
2373 cifs_sb->local_nls); 2395 cifs_sb->local_nls);
2374 } 2396 }
2375 up(&pSesInfo->sesSem); 2397 mutex_unlock(&pSesInfo->session_mutex);
2376 } else if (!rc) { 2398 } else if (!rc) {
2377 cFYI(1, ("Existing smb sess not found")); 2399 cFYI(1, ("Existing smb sess not found"));
2378 pSesInfo = sesInfoAlloc(); 2400 pSesInfo = sesInfoAlloc();
@@ -2415,12 +2437,12 @@ try_mount_again:
2415 } 2437 }
2416 pSesInfo->linux_uid = volume_info->linux_uid; 2438 pSesInfo->linux_uid = volume_info->linux_uid;
2417 pSesInfo->overrideSecFlg = volume_info->secFlg; 2439 pSesInfo->overrideSecFlg = volume_info->secFlg;
2418 down(&pSesInfo->sesSem); 2440 mutex_lock(&pSesInfo->session_mutex);
2419 2441
2420 /* BB FIXME need to pass vol->secFlgs BB */ 2442 /* BB FIXME need to pass vol->secFlgs BB */
2421 rc = cifs_setup_session(xid, pSesInfo, 2443 rc = cifs_setup_session(xid, pSesInfo,
2422 cifs_sb->local_nls); 2444 cifs_sb->local_nls);
2423 up(&pSesInfo->sesSem); 2445 mutex_unlock(&pSesInfo->session_mutex);
2424 } 2446 }
2425 2447
2426 /* search for existing tcon to this server share */ 2448 /* search for existing tcon to this server share */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 057e1dae12ab..3d8f8a96f5a3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2289,9 +2289,9 @@ cifs_oplock_break(struct slow_work *work)
2289 if (inode && S_ISREG(inode->i_mode)) { 2289 if (inode && S_ISREG(inode->i_mode)) {
2290#ifdef CONFIG_CIFS_EXPERIMENTAL 2290#ifdef CONFIG_CIFS_EXPERIMENTAL
2291 if (cinode->clientCanCacheAll == 0) 2291 if (cinode->clientCanCacheAll == 0)
2292 break_lease(inode, FMODE_READ); 2292 break_lease(inode, O_RDONLY);
2293 else if (cinode->clientCanCacheRead == 0) 2293 else if (cinode->clientCanCacheRead == 0)
2294 break_lease(inode, FMODE_WRITE); 2294 break_lease(inode, O_WRONLY);
2295#endif 2295#endif
2296 rc = filemap_fdatawrite(inode->i_mapping); 2296 rc = filemap_fdatawrite(inode->i_mapping);
2297 if (cinode->clientCanCacheRead == 0) { 2297 if (cinode->clientCanCacheRead == 0) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cf18ee765590..8bdbc818164c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -111,6 +111,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
111 111
112 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; 112 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
113 113
114 cifs_i->server_eof = fattr->cf_eof;
114 /* 115 /*
115 * Can't safely change the file size here if the client is writing to 116 * Can't safely change the file size here if the client is writing to
116 * it due to potential races. 117 * it due to potential races.
@@ -366,7 +367,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
366 char ea_value[4]; 367 char ea_value[4];
367 __u32 mode; 368 __u32 mode;
368 369
369 rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS", 370 rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
370 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 371 ea_value, 4 /* size of buf */, cifs_sb->local_nls,
371 cifs_sb->mnt_cifs_flags & 372 cifs_sb->mnt_cifs_flags &
372 CIFS_MOUNT_MAP_SPECIAL_CHR); 373 CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -1762,8 +1763,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1762 CIFS_MOUNT_MAP_SPECIAL_CHR); 1763 CIFS_MOUNT_MAP_SPECIAL_CHR);
1763 } 1764 }
1764 1765
1765 if (!rc) 1766 if (!rc) {
1766 rc = inode_setattr(inode, attrs); 1767 rc = inode_setattr(inode, attrs);
1768
1769 /* force revalidate when any of these times are set since some
1770 of the fs types (eg ext3, fat) do not have fine enough
1771 time granularity to match protocol, and we do not have a
1772 a way (yet) to query the server fs's time granularity (and
1773 whether it rounds times down).
1774 */
1775 if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)))
1776 cifsInode->time = 0;
1777 }
1767out: 1778out:
1768 kfree(args); 1779 kfree(args);
1769 kfree(full_path); 1780 kfree(full_path);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d27d4ec6579b..d1474996a812 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -79,7 +79,7 @@ sesInfoAlloc(void)
79 ++ret_buf->ses_count; 79 ++ret_buf->ses_count;
80 INIT_LIST_HEAD(&ret_buf->smb_ses_list); 80 INIT_LIST_HEAD(&ret_buf->smb_ses_list);
81 INIT_LIST_HEAD(&ret_buf->tcon_list); 81 INIT_LIST_HEAD(&ret_buf->tcon_list);
82 init_MUTEX(&ret_buf->sesSem); 82 mutex_init(&ret_buf->session_mutex);
83 } 83 }
84 return ret_buf; 84 return ret_buf;
85} 85}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f84062f9a985..c343b14ba2d3 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -77,6 +77,11 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
77 77
78 cFYI(1, ("For %s", name->name)); 78 cFYI(1, ("For %s", name->name));
79 79
80 if (parent->d_op && parent->d_op->d_hash)
81 parent->d_op->d_hash(parent, name);
82 else
83 name->hash = full_name_hash(name->name, name->len);
84
80 dentry = d_lookup(parent, name); 85 dentry = d_lookup(parent, name);
81 if (dentry) { 86 if (dentry) {
82 /* FIXME: check for inode number changes? */ 87 /* FIXME: check for inode number changes? */
@@ -666,12 +671,11 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
666 min(len, max_len), nlt, 671 min(len, max_len), nlt,
667 cifs_sb->mnt_cifs_flags & 672 cifs_sb->mnt_cifs_flags &
668 CIFS_MOUNT_MAP_SPECIAL_CHR); 673 CIFS_MOUNT_MAP_SPECIAL_CHR);
674 pqst->len -= nls_nullsize(nlt);
669 } else { 675 } else {
670 pqst->name = filename; 676 pqst->name = filename;
671 pqst->len = len; 677 pqst->len = len;
672 } 678 }
673 pqst->hash = full_name_hash(pqst->name, pqst->len);
674/* cFYI(1, ("filldir on %s",pqst->name)); */
675 return rc; 679 return rc;
676} 680}
677 681
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7085a6275c4c..aaa9c1c5a5bd 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -223,9 +223,9 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
223 /* null user mount */ 223 /* null user mount */
224 *bcc_ptr = 0; 224 *bcc_ptr = 0;
225 *(bcc_ptr+1) = 0; 225 *(bcc_ptr+1) = 0;
226 } else { /* 300 should be long enough for any conceivable user name */ 226 } else {
227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, 227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName,
228 300, nls_cp); 228 MAX_USERNAME_SIZE, nls_cp);
229 } 229 }
230 bcc_ptr += 2 * bytes_ret; 230 bcc_ptr += 2 * bytes_ret;
231 bcc_ptr += 2; /* account for null termination */ 231 bcc_ptr += 2; /* account for null termination */
@@ -246,11 +246,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
246 /* copy user */ 246 /* copy user */
247 if (ses->userName == NULL) { 247 if (ses->userName == NULL) {
248 /* BB what about null user mounts - check that we do this BB */ 248 /* BB what about null user mounts - check that we do this BB */
249 } else { /* 300 should be long enough for any conceivable user name */ 249 } else {
250 strncpy(bcc_ptr, ses->userName, 300); 250 strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE);
251 } 251 }
252 /* BB improve check for overflow */ 252 bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
253 bcc_ptr += strnlen(ses->userName, 300);
254 *bcc_ptr = 0; 253 *bcc_ptr = 0;
255 bcc_ptr++; /* account for null termination */ 254 bcc_ptr++; /* account for null termination */
256 255
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a75afa3dd9e1..3e2ef0de1209 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -244,7 +244,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
244 /* revalidate/getattr then populate from inode */ 244 /* revalidate/getattr then populate from inode */
245 } /* BB add else when above is implemented */ 245 } /* BB add else when above is implemented */
246 ea_name += 5; /* skip past user. prefix */ 246 ea_name += 5; /* skip past user. prefix */
247 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 247 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
248 buf_size, cifs_sb->local_nls, 248 buf_size, cifs_sb->local_nls,
249 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 249 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
250 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { 250 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
@@ -252,7 +252,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
252 goto get_ea_exit; 252 goto get_ea_exit;
253 253
254 ea_name += 4; /* skip past os2. prefix */ 254 ea_name += 4; /* skip past os2. prefix */
255 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 255 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
256 buf_size, cifs_sb->local_nls, 256 buf_size, cifs_sb->local_nls,
257 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 257 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
258 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, 258 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -364,8 +364,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
364 /* if proc/fs/cifs/streamstoxattr is set then 364 /* if proc/fs/cifs/streamstoxattr is set then
365 search server for EAs or streams to 365 search server for EAs or streams to
366 returns as xattrs */ 366 returns as xattrs */
367 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, data, buf_size, 367 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
368 cifs_sb->local_nls, 368 buf_size, cifs_sb->local_nls,
369 cifs_sb->mnt_cifs_flags & 369 cifs_sb->mnt_cifs_flags &
370 CIFS_MOUNT_MAP_SPECIAL_CHR); 370 CIFS_MOUNT_MAP_SPECIAL_CHR);
371 371
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c5c45de1a2ee..0ca9ec4a79c3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -301,6 +301,12 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
301 u32 data; 301 u32 data;
302 void __user *dxferp; 302 void __user *dxferp;
303 int err; 303 int err;
304 int interface_id;
305
306 if (get_user(interface_id, &sgio32->interface_id))
307 return -EFAULT;
308 if (interface_id != 'S')
309 return sys_ioctl(fd, cmd, (unsigned long)sgio32);
304 310
305 if (get_user(iovec_count, &sgio32->iovec_count)) 311 if (get_user(iovec_count, &sgio32->iovec_count))
306 return -EFAULT; 312 return -EFAULT;
@@ -936,6 +942,7 @@ COMPATIBLE_IOCTL(TCSETSF)
936COMPATIBLE_IOCTL(TIOCLINUX) 942COMPATIBLE_IOCTL(TIOCLINUX)
937COMPATIBLE_IOCTL(TIOCSBRK) 943COMPATIBLE_IOCTL(TIOCSBRK)
938COMPATIBLE_IOCTL(TIOCCBRK) 944COMPATIBLE_IOCTL(TIOCCBRK)
945COMPATIBLE_IOCTL(TIOCGSID)
939COMPATIBLE_IOCTL(TIOCGICOUNT) 946COMPATIBLE_IOCTL(TIOCGICOUNT)
940/* Little t */ 947/* Little t */
941COMPATIBLE_IOCTL(TIOCGETD) 948COMPATIBLE_IOCTL(TIOCGETD)
@@ -1038,6 +1045,8 @@ COMPATIBLE_IOCTL(FIOQSIZE)
1038#ifdef CONFIG_BLOCK 1045#ifdef CONFIG_BLOCK
1039/* loop */ 1046/* loop */
1040IGNORE_IOCTL(LOOP_CLR_FD) 1047IGNORE_IOCTL(LOOP_CLR_FD)
1048/* md calls this on random blockdevs */
1049IGNORE_IOCTL(RAID_VERSION)
1041/* SG stuff */ 1050/* SG stuff */
1042COMPATIBLE_IOCTL(SG_SET_TIMEOUT) 1051COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
1043COMPATIBLE_IOCTL(SG_GET_TIMEOUT) 1052COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
diff --git a/fs/dcache.c b/fs/dcache.c
index 953173a293a9..f1358e5c3a59 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,6 +257,7 @@ kill_it:
257 if (dentry) 257 if (dentry)
258 goto repeat; 258 goto repeat;
259} 259}
260EXPORT_SYMBOL(dput);
260 261
261/** 262/**
262 * d_invalidate - invalidate a dentry 263 * d_invalidate - invalidate a dentry
@@ -314,6 +315,7 @@ int d_invalidate(struct dentry * dentry)
314 spin_unlock(&dcache_lock); 315 spin_unlock(&dcache_lock);
315 return 0; 316 return 0;
316} 317}
318EXPORT_SYMBOL(d_invalidate);
317 319
318/* This should be called _only_ with dcache_lock held */ 320/* This should be called _only_ with dcache_lock held */
319 321
@@ -328,6 +330,7 @@ struct dentry * dget_locked(struct dentry *dentry)
328{ 330{
329 return __dget_locked(dentry); 331 return __dget_locked(dentry);
330} 332}
333EXPORT_SYMBOL(dget_locked);
331 334
332/** 335/**
333 * d_find_alias - grab a hashed alias of inode 336 * d_find_alias - grab a hashed alias of inode
@@ -384,6 +387,7 @@ struct dentry * d_find_alias(struct inode *inode)
384 } 387 }
385 return de; 388 return de;
386} 389}
390EXPORT_SYMBOL(d_find_alias);
387 391
388/* 392/*
389 * Try to kill dentries associated with this inode. 393 * Try to kill dentries associated with this inode.
@@ -408,6 +412,7 @@ restart:
408 } 412 }
409 spin_unlock(&dcache_lock); 413 spin_unlock(&dcache_lock);
410} 414}
415EXPORT_SYMBOL(d_prune_aliases);
411 416
412/* 417/*
413 * Throw away a dentry - free the inode, dput the parent. This requires that 418 * Throw away a dentry - free the inode, dput the parent. This requires that
@@ -610,6 +615,7 @@ void shrink_dcache_sb(struct super_block * sb)
610{ 615{
611 __shrink_dcache_sb(sb, NULL, 0); 616 __shrink_dcache_sb(sb, NULL, 0);
612} 617}
618EXPORT_SYMBOL(shrink_dcache_sb);
613 619
614/* 620/*
615 * destroy a single subtree of dentries for unmount 621 * destroy a single subtree of dentries for unmount
@@ -792,6 +798,7 @@ positive:
792 spin_unlock(&dcache_lock); 798 spin_unlock(&dcache_lock);
793 return 1; 799 return 1;
794} 800}
801EXPORT_SYMBOL(have_submounts);
795 802
796/* 803/*
797 * Search the dentry child list for the specified parent, 804 * Search the dentry child list for the specified parent,
@@ -876,6 +883,7 @@ void shrink_dcache_parent(struct dentry * parent)
876 while ((found = select_parent(parent)) != 0) 883 while ((found = select_parent(parent)) != 0)
877 __shrink_dcache_sb(sb, &found, 0); 884 __shrink_dcache_sb(sb, &found, 0);
878} 885}
886EXPORT_SYMBOL(shrink_dcache_parent);
879 887
880/* 888/*
881 * Scan `nr' dentries and return the number which remain. 889 * Scan `nr' dentries and return the number which remain.
@@ -968,6 +976,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
968 976
969 return dentry; 977 return dentry;
970} 978}
979EXPORT_SYMBOL(d_alloc);
971 980
972struct dentry *d_alloc_name(struct dentry *parent, const char *name) 981struct dentry *d_alloc_name(struct dentry *parent, const char *name)
973{ 982{
@@ -1012,6 +1021,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
1012 spin_unlock(&dcache_lock); 1021 spin_unlock(&dcache_lock);
1013 security_d_instantiate(entry, inode); 1022 security_d_instantiate(entry, inode);
1014} 1023}
1024EXPORT_SYMBOL(d_instantiate);
1015 1025
1016/** 1026/**
1017 * d_instantiate_unique - instantiate a non-aliased dentry 1027 * d_instantiate_unique - instantiate a non-aliased dentry
@@ -1108,6 +1118,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1108 } 1118 }
1109 return res; 1119 return res;
1110} 1120}
1121EXPORT_SYMBOL(d_alloc_root);
1111 1122
1112static inline struct hlist_head *d_hash(struct dentry *parent, 1123static inline struct hlist_head *d_hash(struct dentry *parent,
1113 unsigned long hash) 1124 unsigned long hash)
@@ -1211,7 +1222,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1211 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1222 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1212 spin_unlock(&dcache_lock); 1223 spin_unlock(&dcache_lock);
1213 security_d_instantiate(new, inode); 1224 security_d_instantiate(new, inode);
1214 d_rehash(dentry);
1215 d_move(new, dentry); 1225 d_move(new, dentry);
1216 iput(inode); 1226 iput(inode);
1217 } else { 1227 } else {
@@ -1225,6 +1235,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1225 d_add(dentry, inode); 1235 d_add(dentry, inode);
1226 return new; 1236 return new;
1227} 1237}
1238EXPORT_SYMBOL(d_splice_alias);
1228 1239
1229/** 1240/**
1230 * d_add_ci - lookup or allocate new dentry with case-exact name 1241 * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -1314,6 +1325,7 @@ err_out:
1314 iput(inode); 1325 iput(inode);
1315 return ERR_PTR(error); 1326 return ERR_PTR(error);
1316} 1327}
1328EXPORT_SYMBOL(d_add_ci);
1317 1329
1318/** 1330/**
1319 * d_lookup - search for a dentry 1331 * d_lookup - search for a dentry
@@ -1357,6 +1369,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1357 } while (read_seqretry(&rename_lock, seq)); 1369 } while (read_seqretry(&rename_lock, seq));
1358 return dentry; 1370 return dentry;
1359} 1371}
1372EXPORT_SYMBOL(d_lookup);
1360 1373
1361struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1374struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1362{ 1375{
@@ -1483,6 +1496,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
1483out: 1496out:
1484 return 0; 1497 return 0;
1485} 1498}
1499EXPORT_SYMBOL(d_validate);
1486 1500
1487/* 1501/*
1488 * When a file is deleted, we have two options: 1502 * When a file is deleted, we have two options:
@@ -1528,6 +1542,7 @@ void d_delete(struct dentry * dentry)
1528 1542
1529 fsnotify_nameremove(dentry, isdir); 1543 fsnotify_nameremove(dentry, isdir);
1530} 1544}
1545EXPORT_SYMBOL(d_delete);
1531 1546
1532static void __d_rehash(struct dentry * entry, struct hlist_head *list) 1547static void __d_rehash(struct dentry * entry, struct hlist_head *list)
1533{ 1548{
@@ -1556,6 +1571,7 @@ void d_rehash(struct dentry * entry)
1556 spin_unlock(&entry->d_lock); 1571 spin_unlock(&entry->d_lock);
1557 spin_unlock(&dcache_lock); 1572 spin_unlock(&dcache_lock);
1558} 1573}
1574EXPORT_SYMBOL(d_rehash);
1559 1575
1560/* 1576/*
1561 * When switching names, the actual string doesn't strictly have to 1577 * When switching names, the actual string doesn't strictly have to
@@ -1702,6 +1718,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
1702 d_move_locked(dentry, target); 1718 d_move_locked(dentry, target);
1703 spin_unlock(&dcache_lock); 1719 spin_unlock(&dcache_lock);
1704} 1720}
1721EXPORT_SYMBOL(d_move);
1705 1722
1706/** 1723/**
1707 * d_ancestor - search for an ancestor 1724 * d_ancestor - search for an ancestor
@@ -1868,6 +1885,7 @@ shouldnt_be_hashed:
1868 spin_unlock(&dcache_lock); 1885 spin_unlock(&dcache_lock);
1869 BUG(); 1886 BUG();
1870} 1887}
1888EXPORT_SYMBOL_GPL(d_materialise_unique);
1871 1889
1872static int prepend(char **buffer, int *buflen, const char *str, int namelen) 1890static int prepend(char **buffer, int *buflen, const char *str, int namelen)
1873{ 1891{
@@ -2005,6 +2023,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
2005 path_put(&root); 2023 path_put(&root);
2006 return res; 2024 return res;
2007} 2025}
2026EXPORT_SYMBOL(d_path);
2008 2027
2009/* 2028/*
2010 * Helper function for dentry_operations.d_dname() members 2029 * Helper function for dentry_operations.d_dname() members
@@ -2171,6 +2190,30 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2171 return result; 2190 return result;
2172} 2191}
2173 2192
2193int path_is_under(struct path *path1, struct path *path2)
2194{
2195 struct vfsmount *mnt = path1->mnt;
2196 struct dentry *dentry = path1->dentry;
2197 int res;
2198 spin_lock(&vfsmount_lock);
2199 if (mnt != path2->mnt) {
2200 for (;;) {
2201 if (mnt->mnt_parent == mnt) {
2202 spin_unlock(&vfsmount_lock);
2203 return 0;
2204 }
2205 if (mnt->mnt_parent == path2->mnt)
2206 break;
2207 mnt = mnt->mnt_parent;
2208 }
2209 dentry = mnt->mnt_mountpoint;
2210 }
2211 res = is_subdir(dentry, path2->dentry);
2212 spin_unlock(&vfsmount_lock);
2213 return res;
2214}
2215EXPORT_SYMBOL(path_is_under);
2216
2174void d_genocide(struct dentry *root) 2217void d_genocide(struct dentry *root)
2175{ 2218{
2176 struct dentry *this_parent = root; 2219 struct dentry *this_parent = root;
@@ -2228,6 +2271,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name)
2228 } 2271 }
2229 return ino; 2272 return ino;
2230} 2273}
2274EXPORT_SYMBOL(find_inode_number);
2231 2275
2232static __initdata unsigned long dhash_entries; 2276static __initdata unsigned long dhash_entries;
2233static int __init set_dhash_entries(char *str) 2277static int __init set_dhash_entries(char *str)
@@ -2297,6 +2341,7 @@ static void __init dcache_init(void)
2297 2341
2298/* SLAB cache for __getname() consumers */ 2342/* SLAB cache for __getname() consumers */
2299struct kmem_cache *names_cachep __read_mostly; 2343struct kmem_cache *names_cachep __read_mostly;
2344EXPORT_SYMBOL(names_cachep);
2300 2345
2301EXPORT_SYMBOL(d_genocide); 2346EXPORT_SYMBOL(d_genocide);
2302 2347
@@ -2326,26 +2371,3 @@ void __init vfs_caches_init(unsigned long mempages)
2326 bdev_cache_init(); 2371 bdev_cache_init();
2327 chrdev_init(); 2372 chrdev_init();
2328} 2373}
2329
2330EXPORT_SYMBOL(d_alloc);
2331EXPORT_SYMBOL(d_alloc_root);
2332EXPORT_SYMBOL(d_delete);
2333EXPORT_SYMBOL(d_find_alias);
2334EXPORT_SYMBOL(d_instantiate);
2335EXPORT_SYMBOL(d_invalidate);
2336EXPORT_SYMBOL(d_lookup);
2337EXPORT_SYMBOL(d_move);
2338EXPORT_SYMBOL_GPL(d_materialise_unique);
2339EXPORT_SYMBOL(d_path);
2340EXPORT_SYMBOL(d_prune_aliases);
2341EXPORT_SYMBOL(d_rehash);
2342EXPORT_SYMBOL(d_splice_alias);
2343EXPORT_SYMBOL(d_add_ci);
2344EXPORT_SYMBOL(d_validate);
2345EXPORT_SYMBOL(dget_locked);
2346EXPORT_SYMBOL(dput);
2347EXPORT_SYMBOL(find_inode_number);
2348EXPORT_SYMBOL(have_submounts);
2349EXPORT_SYMBOL(names_cachep);
2350EXPORT_SYMBOL(shrink_dcache_parent);
2351EXPORT_SYMBOL(shrink_dcache_sb);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 274ac865bae8..049d6c36da09 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -496,7 +496,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
496 } 496 }
497 d_move(old_dentry, dentry); 497 d_move(old_dentry, dentry);
498 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, 498 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
499 old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode), 499 S_ISDIR(old_dentry->d_inode->i_mode),
500 NULL, old_dentry); 500 NULL, old_dentry);
501 fsnotify_oldname_free(old_name); 501 fsnotify_oldname_free(old_name);
502 unlock_rename(new_dir, old_dir); 502 unlock_rename(new_dir, old_dir);
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dc2ad6008b2d..4314f0d48d85 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
33 spin_unlock(&ast_queue_lock); 33 spin_unlock(&ast_queue_lock);
34} 34}
35 35
36void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) 36void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
37{ 37{
38 if (lkb->lkb_flags & DLM_IFL_USER) { 38 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type, bastmode); 39 dlm_user_add_ast(lkb, type, mode);
40 return; 40 return;
41 } 41 }
42 42
@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
45 kref_get(&lkb->lkb_ref); 45 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 lkb->lkb_ast_first = type;
47 } 48 }
49
50 /* sanity check, this should not happen */
51
52 if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
53 log_print("repeat cast %d castmode %d lock %x %s",
54 mode, lkb->lkb_castmode,
55 lkb->lkb_id, lkb->lkb_resource->res_name);
56
48 lkb->lkb_ast_type |= type; 57 lkb->lkb_ast_type |= type;
49 if (bastmode) 58 if (type == AST_BAST)
50 lkb->lkb_bastmode = bastmode; 59 lkb->lkb_bastmode = mode;
60 else
61 lkb->lkb_castmode = mode;
51 spin_unlock(&ast_queue_lock); 62 spin_unlock(&ast_queue_lock);
52 63
53 set_bit(WAKE_ASTS, &astd_wakeflags); 64 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,9 +70,9 @@ static void process_asts(void)
59 struct dlm_ls *ls = NULL; 70 struct dlm_ls *ls = NULL;
60 struct dlm_rsb *r = NULL; 71 struct dlm_rsb *r = NULL;
61 struct dlm_lkb *lkb; 72 struct dlm_lkb *lkb;
62 void (*cast) (void *astparam); 73 void (*castfn) (void *astparam);
63 void (*bast) (void *astparam, int mode); 74 void (*bastfn) (void *astparam, int mode);
64 int type = 0, bastmode; 75 int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
65 76
66repeat: 77repeat:
67 spin_lock(&ast_queue_lock); 78 spin_lock(&ast_queue_lock);
@@ -75,17 +86,48 @@ repeat:
75 list_del(&lkb->lkb_astqueue); 86 list_del(&lkb->lkb_astqueue);
76 type = lkb->lkb_ast_type; 87 type = lkb->lkb_ast_type;
77 lkb->lkb_ast_type = 0; 88 lkb->lkb_ast_type = 0;
89 first = lkb->lkb_ast_first;
90 lkb->lkb_ast_first = 0;
78 bastmode = lkb->lkb_bastmode; 91 bastmode = lkb->lkb_bastmode;
79 92 castmode = lkb->lkb_castmode;
93 castfn = lkb->lkb_astfn;
94 bastfn = lkb->lkb_bastfn;
80 spin_unlock(&ast_queue_lock); 95 spin_unlock(&ast_queue_lock);
81 cast = lkb->lkb_astfn;
82 bast = lkb->lkb_bastfn;
83
84 if ((type & AST_COMP) && cast)
85 cast(lkb->lkb_astparam);
86 96
87 if ((type & AST_BAST) && bast) 97 do_cast = (type & AST_COMP) && castfn;
88 bast(lkb->lkb_astparam, bastmode); 98 do_bast = (type & AST_BAST) && bastfn;
99
100 /* Skip a bast if its blocking mode is compatible with the
101 granted mode of the preceding cast. */
102
103 if (do_bast) {
104 if (first == AST_COMP)
105 last_castmode = castmode;
106 else
107 last_castmode = lkb->lkb_castmode_done;
108 if (dlm_modes_compat(bastmode, last_castmode))
109 do_bast = 0;
110 }
111
112 if (first == AST_COMP) {
113 if (do_cast)
114 castfn(lkb->lkb_astparam);
115 if (do_bast)
116 bastfn(lkb->lkb_astparam, bastmode);
117 } else if (first == AST_BAST) {
118 if (do_bast)
119 bastfn(lkb->lkb_astparam, bastmode);
120 if (do_cast)
121 castfn(lkb->lkb_astparam);
122 } else {
123 log_error(ls, "bad ast_first %d ast_type %d",
124 first, type);
125 }
126
127 if (do_cast)
128 lkb->lkb_castmode_done = castmode;
129 if (do_bast)
130 lkb->lkb_bastmode_done = bastmode;
89 131
90 /* this removes the reference added by dlm_add_ast 132 /* this removes the reference added by dlm_add_ast
91 and may result in the lkb being freed */ 133 and may result in the lkb being freed */
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 1b5fc5f428fd..bcb1aaba519d 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode); 16void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
17void dlm_del_ast(struct dlm_lkb *lkb); 17void dlm_del_ast(struct dlm_lkb *lkb);
18 18
19void dlm_astd_wake(void); 19void dlm_astd_wake(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 375a2359b3bf..29d6139c35fc 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -256,7 +256,7 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
256 lkb->lkb_status, 256 lkb->lkb_status,
257 lkb->lkb_grmode, 257 lkb->lkb_grmode,
258 lkb->lkb_rqmode, 258 lkb->lkb_rqmode,
259 lkb->lkb_highbast, 259 lkb->lkb_bastmode,
260 rsb_lookup, 260 rsb_lookup,
261 lkb->lkb_wait_type, 261 lkb->lkb_wait_type,
262 lkb->lkb_lvbseq, 262 lkb->lkb_lvbseq,
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 826d3dc6e0ab..f632b58cd222 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -232,11 +232,17 @@ struct dlm_lkb {
232 int8_t lkb_status; /* granted, waiting, convert */ 232 int8_t lkb_status; /* granted, waiting, convert */
233 int8_t lkb_rqmode; /* requested lock mode */ 233 int8_t lkb_rqmode; /* requested lock mode */
234 int8_t lkb_grmode; /* granted lock mode */ 234 int8_t lkb_grmode; /* granted lock mode */
235 int8_t lkb_bastmode; /* requested mode */
236 int8_t lkb_highbast; /* highest mode bast sent for */ 235 int8_t lkb_highbast; /* highest mode bast sent for */
236
237 int8_t lkb_wait_type; /* type of reply waiting for */ 237 int8_t lkb_wait_type; /* type of reply waiting for */
238 int8_t lkb_wait_count; 238 int8_t lkb_wait_count;
239 int8_t lkb_ast_type; /* type of ast queued for */ 239 int8_t lkb_ast_type; /* type of ast queued for */
240 int8_t lkb_ast_first; /* type of first ast queued */
241
242 int8_t lkb_bastmode; /* req mode of queued bast */
243 int8_t lkb_castmode; /* gr mode of queued cast */
244 int8_t lkb_bastmode_done; /* last delivered bastmode */
245 int8_t lkb_castmode_done; /* last delivered castmode */
240 246
241 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 247 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
242 struct list_head lkb_statequeue; /* rsb g/c/w list */ 248 struct list_head lkb_statequeue; /* rsb g/c/w list */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 9c0c1db1e105..46ffd3eeaaf7 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
307 lkb->lkb_lksb->sb_status = rv; 307 lkb->lkb_lksb->sb_status = rv;
308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; 308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
309 309
310 dlm_add_ast(lkb, AST_COMP, 0); 310 dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
311} 311}
312 312
313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -320,10 +320,12 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
320{ 320{
321 lkb->lkb_time_bast = ktime_get(); 321 lkb->lkb_time_bast = ktime_get();
322 322
323 if (is_master_copy(lkb)) 323 if (is_master_copy(lkb)) {
324 lkb->lkb_bastmode = rqmode; /* printed by debugfs */
324 send_bast(r, lkb, rqmode); 325 send_bast(r, lkb, rqmode);
325 else 326 } else {
326 dlm_add_ast(lkb, AST_BAST, rqmode); 327 dlm_add_ast(lkb, AST_BAST, rqmode);
328 }
327} 329}
328 330
329/* 331/*
@@ -2280,20 +2282,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2280 if (can_be_queued(lkb)) { 2282 if (can_be_queued(lkb)) {
2281 error = -EINPROGRESS; 2283 error = -EINPROGRESS;
2282 add_lkb(r, lkb, DLM_LKSTS_WAITING); 2284 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2283 send_blocking_asts(r, lkb);
2284 add_timeout(lkb); 2285 add_timeout(lkb);
2285 goto out; 2286 goto out;
2286 } 2287 }
2287 2288
2288 error = -EAGAIN; 2289 error = -EAGAIN;
2289 if (force_blocking_asts(lkb))
2290 send_blocking_asts_all(r, lkb);
2291 queue_cast(r, lkb, -EAGAIN); 2290 queue_cast(r, lkb, -EAGAIN);
2292
2293 out: 2291 out:
2294 return error; 2292 return error;
2295} 2293}
2296 2294
2295static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2296 int error)
2297{
2298 switch (error) {
2299 case -EAGAIN:
2300 if (force_blocking_asts(lkb))
2301 send_blocking_asts_all(r, lkb);
2302 break;
2303 case -EINPROGRESS:
2304 send_blocking_asts(r, lkb);
2305 break;
2306 }
2307}
2308
2297static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) 2309static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2298{ 2310{
2299 int error = 0; 2311 int error = 0;
@@ -2304,7 +2316,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2304 if (can_be_granted(r, lkb, 1, &deadlk)) { 2316 if (can_be_granted(r, lkb, 1, &deadlk)) {
2305 grant_lock(r, lkb); 2317 grant_lock(r, lkb);
2306 queue_cast(r, lkb, 0); 2318 queue_cast(r, lkb, 0);
2307 grant_pending_locks(r);
2308 goto out; 2319 goto out;
2309 } 2320 }
2310 2321
@@ -2334,7 +2345,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2334 if (_can_be_granted(r, lkb, 1)) { 2345 if (_can_be_granted(r, lkb, 1)) {
2335 grant_lock(r, lkb); 2346 grant_lock(r, lkb);
2336 queue_cast(r, lkb, 0); 2347 queue_cast(r, lkb, 0);
2337 grant_pending_locks(r);
2338 goto out; 2348 goto out;
2339 } 2349 }
2340 /* else fall through and move to convert queue */ 2350 /* else fall through and move to convert queue */
@@ -2344,28 +2354,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2344 error = -EINPROGRESS; 2354 error = -EINPROGRESS;
2345 del_lkb(r, lkb); 2355 del_lkb(r, lkb);
2346 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 2356 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2347 send_blocking_asts(r, lkb);
2348 add_timeout(lkb); 2357 add_timeout(lkb);
2349 goto out; 2358 goto out;
2350 } 2359 }
2351 2360
2352 error = -EAGAIN; 2361 error = -EAGAIN;
2353 if (force_blocking_asts(lkb))
2354 send_blocking_asts_all(r, lkb);
2355 queue_cast(r, lkb, -EAGAIN); 2362 queue_cast(r, lkb, -EAGAIN);
2356
2357 out: 2363 out:
2358 return error; 2364 return error;
2359} 2365}
2360 2366
2367static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2368 int error)
2369{
2370 switch (error) {
2371 case 0:
2372 grant_pending_locks(r);
2373 /* grant_pending_locks also sends basts */
2374 break;
2375 case -EAGAIN:
2376 if (force_blocking_asts(lkb))
2377 send_blocking_asts_all(r, lkb);
2378 break;
2379 case -EINPROGRESS:
2380 send_blocking_asts(r, lkb);
2381 break;
2382 }
2383}
2384
2361static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) 2385static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2362{ 2386{
2363 remove_lock(r, lkb); 2387 remove_lock(r, lkb);
2364 queue_cast(r, lkb, -DLM_EUNLOCK); 2388 queue_cast(r, lkb, -DLM_EUNLOCK);
2365 grant_pending_locks(r);
2366 return -DLM_EUNLOCK; 2389 return -DLM_EUNLOCK;
2367} 2390}
2368 2391
2392static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2393 int error)
2394{
2395 grant_pending_locks(r);
2396}
2397
2369/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ 2398/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
2370 2399
2371static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) 2400static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2375,12 +2404,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2375 error = revert_lock(r, lkb); 2404 error = revert_lock(r, lkb);
2376 if (error) { 2405 if (error) {
2377 queue_cast(r, lkb, -DLM_ECANCEL); 2406 queue_cast(r, lkb, -DLM_ECANCEL);
2378 grant_pending_locks(r);
2379 return -DLM_ECANCEL; 2407 return -DLM_ECANCEL;
2380 } 2408 }
2381 return 0; 2409 return 0;
2382} 2410}
2383 2411
2412static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2413 int error)
2414{
2415 if (error)
2416 grant_pending_locks(r);
2417}
2418
2384/* 2419/*
2385 * Four stage 3 varieties: 2420 * Four stage 3 varieties:
2386 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() 2421 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
@@ -2402,11 +2437,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2402 goto out; 2437 goto out;
2403 } 2438 }
2404 2439
2405 if (is_remote(r)) 2440 if (is_remote(r)) {
2406 /* receive_request() calls do_request() on remote node */ 2441 /* receive_request() calls do_request() on remote node */
2407 error = send_request(r, lkb); 2442 error = send_request(r, lkb);
2408 else 2443 } else {
2409 error = do_request(r, lkb); 2444 error = do_request(r, lkb);
2445 /* for remote locks the request_reply is sent
2446 between do_request and do_request_effects */
2447 do_request_effects(r, lkb, error);
2448 }
2410 out: 2449 out:
2411 return error; 2450 return error;
2412} 2451}
@@ -2417,11 +2456,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2417{ 2456{
2418 int error; 2457 int error;
2419 2458
2420 if (is_remote(r)) 2459 if (is_remote(r)) {
2421 /* receive_convert() calls do_convert() on remote node */ 2460 /* receive_convert() calls do_convert() on remote node */
2422 error = send_convert(r, lkb); 2461 error = send_convert(r, lkb);
2423 else 2462 } else {
2424 error = do_convert(r, lkb); 2463 error = do_convert(r, lkb);
2464 /* for remote locks the convert_reply is sent
2465 between do_convert and do_convert_effects */
2466 do_convert_effects(r, lkb, error);
2467 }
2425 2468
2426 return error; 2469 return error;
2427} 2470}
@@ -2432,11 +2475,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2432{ 2475{
2433 int error; 2476 int error;
2434 2477
2435 if (is_remote(r)) 2478 if (is_remote(r)) {
2436 /* receive_unlock() calls do_unlock() on remote node */ 2479 /* receive_unlock() calls do_unlock() on remote node */
2437 error = send_unlock(r, lkb); 2480 error = send_unlock(r, lkb);
2438 else 2481 } else {
2439 error = do_unlock(r, lkb); 2482 error = do_unlock(r, lkb);
2483 /* for remote locks the unlock_reply is sent
2484 between do_unlock and do_unlock_effects */
2485 do_unlock_effects(r, lkb, error);
2486 }
2440 2487
2441 return error; 2488 return error;
2442} 2489}
@@ -2447,11 +2494,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2447{ 2494{
2448 int error; 2495 int error;
2449 2496
2450 if (is_remote(r)) 2497 if (is_remote(r)) {
2451 /* receive_cancel() calls do_cancel() on remote node */ 2498 /* receive_cancel() calls do_cancel() on remote node */
2452 error = send_cancel(r, lkb); 2499 error = send_cancel(r, lkb);
2453 else 2500 } else {
2454 error = do_cancel(r, lkb); 2501 error = do_cancel(r, lkb);
2502 /* for remote locks the cancel_reply is sent
2503 between do_cancel and do_cancel_effects */
2504 do_cancel_effects(r, lkb, error);
2505 }
2455 2506
2456 return error; 2507 return error;
2457} 2508}
@@ -3191,6 +3242,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
3191 attach_lkb(r, lkb); 3242 attach_lkb(r, lkb);
3192 error = do_request(r, lkb); 3243 error = do_request(r, lkb);
3193 send_request_reply(r, lkb, error); 3244 send_request_reply(r, lkb, error);
3245 do_request_effects(r, lkb, error);
3194 3246
3195 unlock_rsb(r); 3247 unlock_rsb(r);
3196 put_rsb(r); 3248 put_rsb(r);
@@ -3226,15 +3278,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
3226 goto out; 3278 goto out;
3227 3279
3228 receive_flags(lkb, ms); 3280 receive_flags(lkb, ms);
3281
3229 error = receive_convert_args(ls, lkb, ms); 3282 error = receive_convert_args(ls, lkb, ms);
3230 if (error) 3283 if (error) {
3231 goto out_reply; 3284 send_convert_reply(r, lkb, error);
3285 goto out;
3286 }
3287
3232 reply = !down_conversion(lkb); 3288 reply = !down_conversion(lkb);
3233 3289
3234 error = do_convert(r, lkb); 3290 error = do_convert(r, lkb);
3235 out_reply:
3236 if (reply) 3291 if (reply)
3237 send_convert_reply(r, lkb, error); 3292 send_convert_reply(r, lkb, error);
3293 do_convert_effects(r, lkb, error);
3238 out: 3294 out:
3239 unlock_rsb(r); 3295 unlock_rsb(r);
3240 put_rsb(r); 3296 put_rsb(r);
@@ -3266,13 +3322,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
3266 goto out; 3322 goto out;
3267 3323
3268 receive_flags(lkb, ms); 3324 receive_flags(lkb, ms);
3325
3269 error = receive_unlock_args(ls, lkb, ms); 3326 error = receive_unlock_args(ls, lkb, ms);
3270 if (error) 3327 if (error) {
3271 goto out_reply; 3328 send_unlock_reply(r, lkb, error);
3329 goto out;
3330 }
3272 3331
3273 error = do_unlock(r, lkb); 3332 error = do_unlock(r, lkb);
3274 out_reply:
3275 send_unlock_reply(r, lkb, error); 3333 send_unlock_reply(r, lkb, error);
3334 do_unlock_effects(r, lkb, error);
3276 out: 3335 out:
3277 unlock_rsb(r); 3336 unlock_rsb(r);
3278 put_rsb(r); 3337 put_rsb(r);
@@ -3307,6 +3366,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
3307 3366
3308 error = do_cancel(r, lkb); 3367 error = do_cancel(r, lkb);
3309 send_cancel_reply(r, lkb, error); 3368 send_cancel_reply(r, lkb, error);
3369 do_cancel_effects(r, lkb, error);
3310 out: 3370 out:
3311 unlock_rsb(r); 3371 unlock_rsb(r);
3312 put_rsb(r); 3372 put_rsb(r);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index c010ecfc0d29..26a8bd40400a 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -191,6 +191,18 @@ static int do_uevent(struct dlm_ls *ls, int in)
191 return error; 191 return error;
192} 192}
193 193
194static int dlm_uevent(struct kset *kset, struct kobject *kobj,
195 struct kobj_uevent_env *env)
196{
197 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
198
199 add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name);
200 return 0;
201}
202
203static struct kset_uevent_ops dlm_uevent_ops = {
204 .uevent = dlm_uevent,
205};
194 206
195int __init dlm_lockspace_init(void) 207int __init dlm_lockspace_init(void)
196{ 208{
@@ -199,7 +211,7 @@ int __init dlm_lockspace_init(void)
199 INIT_LIST_HEAD(&lslist); 211 INIT_LIST_HEAD(&lslist);
200 spin_lock_init(&lslist_lock); 212 spin_lock_init(&lslist_lock);
201 213
202 dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); 214 dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj);
203 if (!dlm_kset) { 215 if (!dlm_kset) {
204 printk(KERN_WARNING "%s: can not create kset\n", __func__); 216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
205 return -ENOMEM; 217 return -ENOMEM;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index e73a4bb572aa..a4bfd31ac45b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -173,7 +173,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
173/* we could possibly check if the cancel of an orphan has resulted in the lkb 173/* we could possibly check if the cancel of an orphan has resulted in the lkb
174 being removed and then remove that lkb from the orphans list and free it */ 174 being removed and then remove that lkb from the orphans list and free it */
175 175
176void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) 176void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
177{ 177{
178 struct dlm_ls *ls; 178 struct dlm_ls *ls;
179 struct dlm_user_args *ua; 179 struct dlm_user_args *ua;
@@ -206,8 +206,10 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
206 206
207 ast_type = lkb->lkb_ast_type; 207 ast_type = lkb->lkb_ast_type;
208 lkb->lkb_ast_type |= type; 208 lkb->lkb_ast_type |= type;
209 if (bastmode) 209 if (type == AST_BAST)
210 lkb->lkb_bastmode = bastmode; 210 lkb->lkb_bastmode = mode;
211 else
212 lkb->lkb_castmode = mode;
211 213
212 if (!ast_type) { 214 if (!ast_type) {
213 kref_get(&lkb->lkb_ref); 215 kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 1c9686492286..f196091dd7ff 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -9,7 +9,7 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 15int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/exec.c b/fs/exec.c
index 0790a107ff7e..cce6bbdbdbb1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -571,6 +571,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
571 struct vm_area_struct *prev = NULL; 571 struct vm_area_struct *prev = NULL;
572 unsigned long vm_flags; 572 unsigned long vm_flags;
573 unsigned long stack_base; 573 unsigned long stack_base;
574 unsigned long stack_size;
575 unsigned long stack_expand;
576 unsigned long rlim_stack;
574 577
575#ifdef CONFIG_STACK_GROWSUP 578#ifdef CONFIG_STACK_GROWSUP
576 /* Limit stack size to 1GB */ 579 /* Limit stack size to 1GB */
@@ -627,10 +630,23 @@ int setup_arg_pages(struct linux_binprm *bprm,
627 goto out_unlock; 630 goto out_unlock;
628 } 631 }
629 632
633 stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE;
634 stack_size = vma->vm_end - vma->vm_start;
635 /*
636 * Align this down to a page boundary as expand_stack
637 * will align it up.
638 */
639 rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
630#ifdef CONFIG_STACK_GROWSUP 640#ifdef CONFIG_STACK_GROWSUP
631 stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; 641 if (stack_size + stack_expand > rlim_stack)
642 stack_base = vma->vm_start + rlim_stack;
643 else
644 stack_base = vma->vm_end + stack_expand;
632#else 645#else
633 stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; 646 if (stack_size + stack_expand > rlim_stack)
647 stack_base = vma->vm_end - rlim_stack;
648 else
649 stack_base = vma->vm_start - stack_expand;
634#endif 650#endif
635 ret = expand_stack(vma, stack_base); 651 ret = expand_stack(vma, stack_base);
636 if (ret) 652 if (ret)
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1b178e61718..f0d520312d8b 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -55,6 +55,8 @@
55/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
57# define EXOFS_ATTR_INODE_DATA 1 57# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
58 60
59/* 61/*
60 * The maximum number of files we can have is limited by the size of the 62 * The maximum number of files we can have is limited by the size of the
@@ -206,4 +208,41 @@ enum {
206 (((name_len) + offsetof(struct exofs_dir_entry, name) + \ 208 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
207 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) 209 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
208 210
211/*
212 * The on-disk (optional) layout structure.
213 * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
214 * attribute, attached to any inode, usually to a directory.
215 */
216
217enum exofs_inode_layout_gen_functions {
218 LAYOUT_MOVING_WINDOW = 0,
219 LAYOUT_IMPLICT = 1,
220};
221
222struct exofs_on_disk_inode_layout {
223 __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
224 __le16 pad;
225 union {
226 /* gen_func == LAYOUT_MOVING_WINDOW (default) */
227 struct exofs_layout_sliding_window {
228 __le32 num_devices; /* first n devices in global-table*/
229 } sliding_window __packed;
230
231 /* gen_func == LAYOUT_IMPLICT */
232 struct exofs_layout_implict_list {
233 struct exofs_dt_data_map data_map;
234 /* Variable array of size data_map.cb_num_comps. These
235 * are device indexes of the devices in the global table
236 */
237 __le32 dev_indexes[];
238 } implict __packed;
239 };
240} __packed;
241
242static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
243{
244 return sizeof(struct exofs_on_disk_inode_layout) +
245 max_devs * sizeof(__le32);
246}
247
209#endif /*ifndef __EXOFS_COM_H__*/ 248#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c35fd4623986..8442e353309f 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -55,12 +55,28 @@
55/* u64 has problems with printk this will cast it to unsigned long long */ 55/* u64 has problems with printk this will cast it to unsigned long long */
56#define _LLU(x) (unsigned long long)(x) 56#define _LLU(x) (unsigned long long)(x)
57 57
58struct exofs_layout {
59 osd_id s_pid; /* partition ID of file system*/
60
61 /* Our way of looking at the data_map */
62 unsigned stripe_unit;
63 unsigned mirrors_p1;
64
65 unsigned group_width;
66 u64 group_depth;
67 unsigned group_count;
68
69 enum exofs_inode_layout_gen_functions lay_func;
70
71 unsigned s_numdevs; /* Num of devices in array */
72 struct osd_dev *s_ods[0]; /* Variable length */
73};
74
58/* 75/*
59 * our extension to the in-memory superblock 76 * our extension to the in-memory superblock
60 */ 77 */
61struct exofs_sb_info { 78struct exofs_sb_info {
62 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ 79 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
63 osd_id s_pid; /* partition ID of file system*/
64 int s_timeout; /* timeout for OSD operations */ 80 int s_timeout; /* timeout for OSD operations */
65 uint64_t s_nextid; /* highest object ID used */ 81 uint64_t s_nextid; /* highest object ID used */
66 uint32_t s_numfiles; /* number of files on fs */ 82 uint32_t s_numfiles; /* number of files on fs */
@@ -69,22 +85,27 @@ struct exofs_sb_info {
69 atomic_t s_curr_pending; /* number of pending commands */ 85 atomic_t s_curr_pending; /* number of pending commands */
70 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ 86 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
71 87
72 struct pnfs_osd_data_map data_map; /* Default raid to use */ 88 struct pnfs_osd_data_map data_map; /* Default raid to use
73 unsigned s_numdevs; /* Num of devices in array */ 89 * FIXME: Needed ?
74 struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */ 90 */
91/* struct exofs_layout dir_layout;*/ /* Default dir layout */
92 struct exofs_layout layout; /* Default files layout,
93 * contains the variable osd_dev
94 * array. Keep last */
95 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
75}; 96};
76 97
77/* 98/*
78 * our extension to the in-memory inode 99 * our extension to the in-memory inode
79 */ 100 */
80struct exofs_i_info { 101struct exofs_i_info {
102 struct inode vfs_inode; /* normal in-memory inode */
103 wait_queue_head_t i_wq; /* wait queue for inode */
81 unsigned long i_flags; /* various atomic flags */ 104 unsigned long i_flags; /* various atomic flags */
82 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ 105 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
83 uint32_t i_dir_start_lookup; /* which page to start lookup */ 106 uint32_t i_dir_start_lookup; /* which page to start lookup */
84 wait_queue_head_t i_wq; /* wait queue for inode */
85 uint64_t i_commit_size; /* the object's written length */ 107 uint64_t i_commit_size; /* the object's written length */
86 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ 108 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
87 struct inode vfs_inode; /* normal in-memory inode */
88}; 109};
89 110
90static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) 111static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -101,7 +122,7 @@ struct exofs_io_state {
101 void *private; 122 void *private;
102 exofs_io_done_fn done; 123 exofs_io_done_fn done;
103 124
104 struct exofs_sb_info *sbi; 125 struct exofs_layout *layout;
105 struct osd_obj_id obj; 126 struct osd_obj_id obj;
106 u8 *cred; 127 u8 *cred;
107 128
@@ -109,7 +130,11 @@ struct exofs_io_state {
109 loff_t offset; 130 loff_t offset;
110 unsigned long length; 131 unsigned long length;
111 void *kern_buff; 132 void *kern_buff;
112 struct bio *bio; 133
134 struct page **pages;
135 unsigned nr_pages;
136 unsigned pgbase;
137 unsigned pages_consumed;
113 138
114 /* Attributes */ 139 /* Attributes */
115 unsigned in_attr_len; 140 unsigned in_attr_len;
@@ -122,6 +147,9 @@ struct exofs_io_state {
122 struct exofs_per_dev_state { 147 struct exofs_per_dev_state {
123 struct osd_request *or; 148 struct osd_request *or;
124 struct bio *bio; 149 struct bio *bio;
150 loff_t offset;
151 unsigned length;
152 unsigned dev;
125 } per_dev[]; 153 } per_dev[];
126}; 154};
127 155
@@ -175,6 +203,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
175} 203}
176 204
177/* 205/*
206 * Given a layout, object_number and stripe_index return the associated global
207 * dev_index
208 */
209unsigned exofs_layout_od_id(struct exofs_layout *layout,
210 osd_id obj_no, unsigned layout_index);
211/*
178 * Maximum count of links to a file 212 * Maximum count of links to a file
179 */ 213 */
180#define EXOFS_LINK_MAX 32000 214#define EXOFS_LINK_MAX 32000
@@ -189,7 +223,8 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
189int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, 223int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
190 u64 offset, void *p, unsigned length); 224 u64 offset, void *p, unsigned length);
191 225
192int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios); 226int exofs_get_io_state(struct exofs_layout *layout,
227 struct exofs_io_state **ios);
193void exofs_put_io_state(struct exofs_io_state *ios); 228void exofs_put_io_state(struct exofs_io_state *ios);
194 229
195int exofs_check_io(struct exofs_io_state *ios, u64 *resid); 230int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
@@ -226,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
226 struct page **pagep, void **fsdata); 261 struct page **pagep, void **fsdata);
227extern struct inode *exofs_iget(struct super_block *, unsigned long); 262extern struct inode *exofs_iget(struct super_block *, unsigned long);
228struct inode *exofs_new_inode(struct inode *, int); 263struct inode *exofs_new_inode(struct inode *, int);
229extern int exofs_write_inode(struct inode *, int); 264extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
230extern void exofs_delete_inode(struct inode *); 265extern void exofs_delete_inode(struct inode *);
231 266
232/* dir.c: */ 267/* dir.c: */
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2afbcebeda71..a17e4b733e35 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -41,16 +41,18 @@
41 41
42enum { BIO_MAX_PAGES_KMALLOC = 42enum { BIO_MAX_PAGES_KMALLOC =
43 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), 43 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
44 MAX_PAGES_KMALLOC =
45 PAGE_SIZE / sizeof(struct page *),
44}; 46};
45 47
46struct page_collect { 48struct page_collect {
47 struct exofs_sb_info *sbi; 49 struct exofs_sb_info *sbi;
48 struct request_queue *req_q;
49 struct inode *inode; 50 struct inode *inode;
50 unsigned expected_pages; 51 unsigned expected_pages;
51 struct exofs_io_state *ios; 52 struct exofs_io_state *ios;
52 53
53 struct bio *bio; 54 struct page **pages;
55 unsigned alloc_pages;
54 unsigned nr_pages; 56 unsigned nr_pages;
55 unsigned long length; 57 unsigned long length;
56 loff_t pg_first; /* keep 64bit also in 32-arches */ 58 loff_t pg_first; /* keep 64bit also in 32-arches */
@@ -62,15 +64,12 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
62 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 64 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
63 65
64 pcol->sbi = sbi; 66 pcol->sbi = sbi;
65 /* Create master bios on first Q, later on cloning, each clone will be
66 * allocated on it's destination Q
67 */
68 pcol->req_q = osd_request_queue(sbi->s_ods[0]);
69 pcol->inode = inode; 67 pcol->inode = inode;
70 pcol->expected_pages = expected_pages; 68 pcol->expected_pages = expected_pages;
71 69
72 pcol->ios = NULL; 70 pcol->ios = NULL;
73 pcol->bio = NULL; 71 pcol->pages = NULL;
72 pcol->alloc_pages = 0;
74 pcol->nr_pages = 0; 73 pcol->nr_pages = 0;
75 pcol->length = 0; 74 pcol->length = 0;
76 pcol->pg_first = -1; 75 pcol->pg_first = -1;
@@ -80,7 +79,8 @@ static void _pcol_reset(struct page_collect *pcol)
80{ 79{
81 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); 80 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
82 81
83 pcol->bio = NULL; 82 pcol->pages = NULL;
83 pcol->alloc_pages = 0;
84 pcol->nr_pages = 0; 84 pcol->nr_pages = 0;
85 pcol->length = 0; 85 pcol->length = 0;
86 pcol->pg_first = -1; 86 pcol->pg_first = -1;
@@ -90,38 +90,43 @@ static void _pcol_reset(struct page_collect *pcol)
90 * it might not end here. don't be left with nothing 90 * it might not end here. don't be left with nothing
91 */ 91 */
92 if (!pcol->expected_pages) 92 if (!pcol->expected_pages)
93 pcol->expected_pages = BIO_MAX_PAGES_KMALLOC; 93 pcol->expected_pages = MAX_PAGES_KMALLOC;
94} 94}
95 95
96static int pcol_try_alloc(struct page_collect *pcol) 96static int pcol_try_alloc(struct page_collect *pcol)
97{ 97{
98 int pages = min_t(unsigned, pcol->expected_pages, 98 unsigned pages = min_t(unsigned, pcol->expected_pages,
99 BIO_MAX_PAGES_KMALLOC); 99 MAX_PAGES_KMALLOC);
100 100
101 if (!pcol->ios) { /* First time allocate io_state */ 101 if (!pcol->ios) { /* First time allocate io_state */
102 int ret = exofs_get_io_state(pcol->sbi, &pcol->ios); 102 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
103 103
104 if (ret) 104 if (ret)
105 return ret; 105 return ret;
106 } 106 }
107 107
108 /* TODO: easily support bio chaining */
109 pages = min_t(unsigned, pages,
110 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
111
108 for (; pages; pages >>= 1) { 112 for (; pages; pages >>= 1) {
109 pcol->bio = bio_kmalloc(GFP_KERNEL, pages); 113 pcol->pages = kmalloc(pages * sizeof(struct page *),
110 if (likely(pcol->bio)) 114 GFP_KERNEL);
115 if (likely(pcol->pages)) {
116 pcol->alloc_pages = pages;
111 return 0; 117 return 0;
118 }
112 } 119 }
113 120
114 EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n", 121 EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
115 pcol->expected_pages); 122 pcol->expected_pages);
116 return -ENOMEM; 123 return -ENOMEM;
117} 124}
118 125
119static void pcol_free(struct page_collect *pcol) 126static void pcol_free(struct page_collect *pcol)
120{ 127{
121 if (pcol->bio) { 128 kfree(pcol->pages);
122 bio_put(pcol->bio); 129 pcol->pages = NULL;
123 pcol->bio = NULL;
124 }
125 130
126 if (pcol->ios) { 131 if (pcol->ios) {
127 exofs_put_io_state(pcol->ios); 132 exofs_put_io_state(pcol->ios);
@@ -132,11 +137,10 @@ static void pcol_free(struct page_collect *pcol)
132static int pcol_add_page(struct page_collect *pcol, struct page *page, 137static int pcol_add_page(struct page_collect *pcol, struct page *page,
133 unsigned len) 138 unsigned len)
134{ 139{
135 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); 140 if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
136 if (unlikely(len != added_len))
137 return -ENOMEM; 141 return -ENOMEM;
138 142
139 ++pcol->nr_pages; 143 pcol->pages[pcol->nr_pages++] = page;
140 pcol->length += len; 144 pcol->length += len;
141 return 0; 145 return 0;
142} 146}
@@ -181,7 +185,6 @@ static void update_write_page(struct page *page, int ret)
181 */ 185 */
182static int __readpages_done(struct page_collect *pcol, bool do_unlock) 186static int __readpages_done(struct page_collect *pcol, bool do_unlock)
183{ 187{
184 struct bio_vec *bvec;
185 int i; 188 int i;
186 u64 resid; 189 u64 resid;
187 u64 good_bytes; 190 u64 good_bytes;
@@ -193,13 +196,13 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
193 else 196 else
194 good_bytes = pcol->length - resid; 197 good_bytes = pcol->length - resid;
195 198
196 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" 199 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
197 " length=0x%lx nr_pages=%u\n", 200 " length=0x%lx nr_pages=%u\n",
198 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 201 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
199 pcol->nr_pages); 202 pcol->nr_pages);
200 203
201 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 204 for (i = 0; i < pcol->nr_pages; i++) {
202 struct page *page = bvec->bv_page; 205 struct page *page = pcol->pages[i];
203 struct inode *inode = page->mapping->host; 206 struct inode *inode = page->mapping->host;
204 int page_stat; 207 int page_stat;
205 208
@@ -218,11 +221,11 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
218 ret = update_read_page(page, page_stat); 221 ret = update_read_page(page, page_stat);
219 if (do_unlock) 222 if (do_unlock)
220 unlock_page(page); 223 unlock_page(page);
221 length += bvec->bv_len; 224 length += PAGE_SIZE;
222 } 225 }
223 226
224 pcol_free(pcol); 227 pcol_free(pcol);
225 EXOFS_DBGMSG("readpages_done END\n"); 228 EXOFS_DBGMSG2("readpages_done END\n");
226 return ret; 229 return ret;
227} 230}
228 231
@@ -238,11 +241,10 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
238 241
239static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) 242static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
240{ 243{
241 struct bio_vec *bvec;
242 int i; 244 int i;
243 245
244 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 246 for (i = 0; i < pcol->nr_pages; i++) {
245 struct page *page = bvec->bv_page; 247 struct page *page = pcol->pages[i];
246 248
247 if (rw == READ) 249 if (rw == READ)
248 update_read_page(page, ret); 250 update_read_page(page, ret);
@@ -260,13 +262,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
260 struct page_collect *pcol_copy = NULL; 262 struct page_collect *pcol_copy = NULL;
261 int ret; 263 int ret;
262 264
263 if (!pcol->bio) 265 if (!pcol->pages)
264 return 0; 266 return 0;
265 267
266 /* see comment in _readpage() about sync reads */ 268 /* see comment in _readpage() about sync reads */
267 WARN_ON(is_sync && (pcol->nr_pages != 1)); 269 WARN_ON(is_sync && (pcol->nr_pages != 1));
268 270
269 ios->bio = pcol->bio; 271 ios->pages = pcol->pages;
272 ios->nr_pages = pcol->nr_pages;
270 ios->length = pcol->length; 273 ios->length = pcol->length;
271 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; 274 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
272 275
@@ -290,7 +293,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
290 293
291 atomic_inc(&pcol->sbi->s_curr_pending); 294 atomic_inc(&pcol->sbi->s_curr_pending);
292 295
293 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 296 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
294 ios->obj.id, _LLU(ios->offset), pcol->length); 297 ios->obj.id, _LLU(ios->offset), pcol->length);
295 298
296 /* pages ownership was passed to pcol_copy */ 299 /* pages ownership was passed to pcol_copy */
@@ -366,7 +369,7 @@ try_again:
366 goto try_again; 369 goto try_again;
367 } 370 }
368 371
369 if (!pcol->bio) { 372 if (!pcol->pages) {
370 ret = pcol_try_alloc(pcol); 373 ret = pcol_try_alloc(pcol);
371 if (unlikely(ret)) 374 if (unlikely(ret))
372 goto fail; 375 goto fail;
@@ -448,7 +451,6 @@ static int exofs_readpage(struct file *file, struct page *page)
448static void writepages_done(struct exofs_io_state *ios, void *p) 451static void writepages_done(struct exofs_io_state *ios, void *p)
449{ 452{
450 struct page_collect *pcol = p; 453 struct page_collect *pcol = p;
451 struct bio_vec *bvec;
452 int i; 454 int i;
453 u64 resid; 455 u64 resid;
454 u64 good_bytes; 456 u64 good_bytes;
@@ -462,13 +464,13 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
462 else 464 else
463 good_bytes = pcol->length - resid; 465 good_bytes = pcol->length - resid;
464 466
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx" 467 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n", 468 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 469 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages); 470 pcol->nr_pages);
469 471
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 472 for (i = 0; i < pcol->nr_pages; i++) {
471 struct page *page = bvec->bv_page; 473 struct page *page = pcol->pages[i];
472 struct inode *inode = page->mapping->host; 474 struct inode *inode = page->mapping->host;
473 int page_stat; 475 int page_stat;
474 476
@@ -485,12 +487,12 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
485 EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", 487 EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat); 488 inode->i_ino, page->index, page_stat);
487 489
488 length += bvec->bv_len; 490 length += PAGE_SIZE;
489 } 491 }
490 492
491 pcol_free(pcol); 493 pcol_free(pcol);
492 kfree(pcol); 494 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n"); 495 EXOFS_DBGMSG2("writepages_done END\n");
494} 496}
495 497
496static int write_exec(struct page_collect *pcol) 498static int write_exec(struct page_collect *pcol)
@@ -500,7 +502,7 @@ static int write_exec(struct page_collect *pcol)
500 struct page_collect *pcol_copy = NULL; 502 struct page_collect *pcol_copy = NULL;
501 int ret; 503 int ret;
502 504
503 if (!pcol->bio) 505 if (!pcol->pages)
504 return 0; 506 return 0;
505 507
506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 508 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -512,9 +514,8 @@ static int write_exec(struct page_collect *pcol)
512 514
513 *pcol_copy = *pcol; 515 *pcol_copy = *pcol;
514 516
515 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ 517 ios->pages = pcol_copy->pages;
516 518 ios->nr_pages = pcol_copy->nr_pages;
517 ios->bio = pcol_copy->bio;
518 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; 519 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
519 ios->length = pcol_copy->length; 520 ios->length = pcol_copy->length;
520 ios->done = writepages_done; 521 ios->done = writepages_done;
@@ -527,7 +528,7 @@ static int write_exec(struct page_collect *pcol)
527 } 528 }
528 529
529 atomic_inc(&pcol->sbi->s_curr_pending); 530 atomic_inc(&pcol->sbi->s_curr_pending);
530 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", 531 EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
531 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), 532 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
532 pcol->length); 533 pcol->length);
533 /* pages ownership was passed to pcol_copy */ 534 /* pages ownership was passed to pcol_copy */
@@ -605,7 +606,7 @@ try_again:
605 goto try_again; 606 goto try_again;
606 } 607 }
607 608
608 if (!pcol->bio) { 609 if (!pcol->pages) {
609 ret = pcol_try_alloc(pcol); 610 ret = pcol_try_alloc(pcol);
610 if (unlikely(ret)) 611 if (unlikely(ret))
611 goto fail; 612 goto fail;
@@ -616,7 +617,7 @@ try_again:
616 617
617 ret = pcol_add_page(pcol, page, len); 618 ret = pcol_add_page(pcol, page, len);
618 if (unlikely(ret)) { 619 if (unlikely(ret)) {
619 EXOFS_DBGMSG("Failed pcol_add_page " 620 EXOFS_DBGMSG2("Failed pcol_add_page "
620 "nr_pages=%u total_length=0x%lx\n", 621 "nr_pages=%u total_length=0x%lx\n",
621 pcol->nr_pages, pcol->length); 622 pcol->nr_pages, pcol->length);
622 623
@@ -663,7 +664,7 @@ static int exofs_writepages(struct address_space *mapping,
663 if (expected_pages < 32L) 664 if (expected_pages < 32L)
664 expected_pages = 32L; 665 expected_pages = 32L;
665 666
666 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " 667 EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
667 "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", 668 "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
668 mapping->host->i_ino, wbc->range_start, wbc->range_end, 669 mapping->host->i_ino, wbc->range_start, wbc->range_end,
669 mapping->nrpages, start, end, expected_pages); 670 mapping->nrpages, start, end, expected_pages);
@@ -859,20 +860,33 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
859 return error; 860 return error;
860} 861}
861 862
863static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
864 EXOFS_APAGE_FS_DATA,
865 EXOFS_ATTR_INODE_FILE_LAYOUT,
866 0);
867static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
868 EXOFS_APAGE_FS_DATA,
869 EXOFS_ATTR_INODE_DIR_LAYOUT,
870 0);
871
862/* 872/*
863 * Read an inode from the OSD, and return it as is. We also return the size 873 * Read the Linux inode info from the OSD, and return it as is. In exofs the
864 * attribute in the 'obj_size' argument. 874 * inode info is in an application specific page/attribute of the osd-object.
865 */ 875 */
866static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, 876static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
867 struct exofs_fcb *inode, uint64_t *obj_size) 877 struct exofs_fcb *inode)
868{ 878{
869 struct exofs_sb_info *sbi = sb->s_fs_info; 879 struct exofs_sb_info *sbi = sb->s_fs_info;
870 struct osd_attr attrs[2]; 880 struct osd_attr attrs[] = {
881 [0] = g_attr_inode_data,
882 [1] = g_attr_inode_file_layout,
883 [2] = g_attr_inode_dir_layout,
884 };
871 struct exofs_io_state *ios; 885 struct exofs_io_state *ios;
886 struct exofs_on_disk_inode_layout *layout;
872 int ret; 887 int ret;
873 888
874 *obj_size = ~0; 889 ret = exofs_get_io_state(&sbi->layout, &ios);
875 ret = exofs_get_io_state(sbi, &ios);
876 if (unlikely(ret)) { 890 if (unlikely(ret)) {
877 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 891 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
878 return ret; 892 return ret;
@@ -882,14 +896,25 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
882 exofs_make_credential(oi->i_cred, &ios->obj); 896 exofs_make_credential(oi->i_cred, &ios->obj);
883 ios->cred = oi->i_cred; 897 ios->cred = oi->i_cred;
884 898
885 attrs[0] = g_attr_inode_data; 899 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
886 attrs[1] = g_attr_logical_length; 900 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
901
887 ios->in_attr = attrs; 902 ios->in_attr = attrs;
888 ios->in_attr_len = ARRAY_SIZE(attrs); 903 ios->in_attr_len = ARRAY_SIZE(attrs);
889 904
890 ret = exofs_sbi_read(ios); 905 ret = exofs_sbi_read(ios);
891 if (ret) 906 if (unlikely(ret)) {
907 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
908 _LLU(ios->obj.id), ret);
909 memset(inode, 0, sizeof(*inode));
910 inode->i_mode = 0040000 | (0777 & ~022);
911 /* If object is lost on target we might as well enable it's
912 * delete.
913 */
914 if ((ret == -ENOENT) || (ret == -EINVAL))
915 ret = 0;
892 goto out; 916 goto out;
917 }
893 918
894 ret = extract_attr_from_ios(ios, &attrs[0]); 919 ret = extract_attr_from_ios(ios, &attrs[0]);
895 if (ret) { 920 if (ret) {
@@ -901,11 +926,33 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
901 926
902 ret = extract_attr_from_ios(ios, &attrs[1]); 927 ret = extract_attr_from_ios(ios, &attrs[1]);
903 if (ret) { 928 if (ret) {
904 EXOFS_ERR("%s: extract_attr of logical_length failed\n", 929 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
905 __func__); 930 goto out;
931 }
932 if (attrs[1].len) {
933 layout = attrs[1].val_ptr;
934 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
935 EXOFS_ERR("%s: unsupported files layout %d\n",
936 __func__, layout->gen_func);
937 ret = -ENOTSUPP;
938 goto out;
939 }
940 }
941
942 ret = extract_attr_from_ios(ios, &attrs[2]);
943 if (ret) {
944 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
906 goto out; 945 goto out;
907 } 946 }
908 *obj_size = get_unaligned_be64(attrs[1].val_ptr); 947 if (attrs[2].len) {
948 layout = attrs[2].val_ptr;
949 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
950 EXOFS_ERR("%s: unsupported meta-data layout %d\n",
951 __func__, layout->gen_func);
952 ret = -ENOTSUPP;
953 goto out;
954 }
955 }
909 956
910out: 957out:
911 exofs_put_io_state(ios); 958 exofs_put_io_state(ios);
@@ -925,7 +972,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
925 struct exofs_i_info *oi; 972 struct exofs_i_info *oi;
926 struct exofs_fcb fcb; 973 struct exofs_fcb fcb;
927 struct inode *inode; 974 struct inode *inode;
928 uint64_t obj_size;
929 int ret; 975 int ret;
930 976
931 inode = iget_locked(sb, ino); 977 inode = iget_locked(sb, ino);
@@ -937,7 +983,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
937 __oi_init(oi); 983 __oi_init(oi);
938 984
939 /* read the inode from the osd */ 985 /* read the inode from the osd */
940 ret = exofs_get_inode(sb, oi, &fcb, &obj_size); 986 ret = exofs_get_inode(sb, oi, &fcb);
941 if (ret) 987 if (ret)
942 goto bad_inode; 988 goto bad_inode;
943 989
@@ -958,13 +1004,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
958 inode->i_blkbits = EXOFS_BLKSHIFT; 1004 inode->i_blkbits = EXOFS_BLKSHIFT;
959 inode->i_generation = le32_to_cpu(fcb.i_generation); 1005 inode->i_generation = le32_to_cpu(fcb.i_generation);
960 1006
961 if ((inode->i_size != obj_size) &&
962 (!exofs_inode_is_fast_symlink(inode))) {
963 EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
964 inode->i_size, _LLU(obj_size));
965 /* FIXME: call exofs_inode_recovery() */
966 }
967
968 oi->i_dir_start_lookup = 0; 1007 oi->i_dir_start_lookup = 0;
969 1008
970 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { 1009 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
@@ -1043,7 +1082,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
1043 1082
1044 if (unlikely(ret)) { 1083 if (unlikely(ret)) {
1045 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", 1084 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
1046 _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid)); 1085 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
1047 /*TODO: When FS is corrupted creation can fail, object already 1086 /*TODO: When FS is corrupted creation can fail, object already
1048 * exist. Get rid of this asynchronous creation, if exist 1087 * exist. Get rid of this asynchronous creation, if exist
1049 * increment the obj counter and try the next object. Until we 1088 * increment the obj counter and try the next object. Until we
@@ -1104,7 +1143,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1104 1143
1105 mark_inode_dirty(inode); 1144 mark_inode_dirty(inode);
1106 1145
1107 ret = exofs_get_io_state(sbi, &ios); 1146 ret = exofs_get_io_state(&sbi->layout, &ios);
1108 if (unlikely(ret)) { 1147 if (unlikely(ret)) {
1109 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); 1148 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
1110 return ERR_PTR(ret); 1149 return ERR_PTR(ret);
@@ -1170,8 +1209,10 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1170 int ret; 1209 int ret;
1171 1210
1172 args = kzalloc(sizeof(*args), GFP_KERNEL); 1211 args = kzalloc(sizeof(*args), GFP_KERNEL);
1173 if (!args) 1212 if (!args) {
1213 EXOFS_DBGMSG("Faild kzalloc of args\n");
1174 return -ENOMEM; 1214 return -ENOMEM;
1215 }
1175 1216
1176 fcb = &args->fcb; 1217 fcb = &args->fcb;
1177 1218
@@ -1200,7 +1241,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1200 } else 1241 } else
1201 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1242 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1202 1243
1203 ret = exofs_get_io_state(sbi, &ios); 1244 ret = exofs_get_io_state(&sbi->layout, &ios);
1204 if (unlikely(ret)) { 1245 if (unlikely(ret)) {
1205 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 1246 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
1206 goto free_args; 1247 goto free_args;
@@ -1234,13 +1275,14 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1234free_args: 1275free_args:
1235 kfree(args); 1276 kfree(args);
1236out: 1277out:
1237 EXOFS_DBGMSG("ret=>%d\n", ret); 1278 EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
1279 inode->i_ino, do_sync, ret);
1238 return ret; 1280 return ret;
1239} 1281}
1240 1282
1241int exofs_write_inode(struct inode *inode, int wait) 1283int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
1242{ 1284{
1243 return exofs_update_inode(inode, wait); 1285 return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1244} 1286}
1245 1287
1246/* 1288/*
@@ -1283,7 +1325,7 @@ void exofs_delete_inode(struct inode *inode)
1283 1325
1284 clear_inode(inode); 1326 clear_inode(inode);
1285 1327
1286 ret = exofs_get_io_state(sbi, &ios); 1328 ret = exofs_get_io_state(&sbi->layout, &ios);
1287 if (unlikely(ret)) { 1329 if (unlikely(ret)) {
1288 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1330 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
1289 return; 1331 return;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5bad01fa1f9f..5293bc411d17 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -23,9 +23,13 @@
23 */ 23 */
24 24
25#include <scsi/scsi_device.h> 25#include <scsi/scsi_device.h>
26#include <asm/div64.h>
26 27
27#include "exofs.h" 28#include "exofs.h"
28 29
30#define EXOFS_DBGMSG2(M...) do {} while (0)
31/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
32
29void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) 33void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
30{ 34{
31 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); 35 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
@@ -64,21 +68,24 @@ out:
64 return ret; 68 return ret;
65} 69}
66 70
67int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios) 71int exofs_get_io_state(struct exofs_layout *layout,
72 struct exofs_io_state **pios)
68{ 73{
69 struct exofs_io_state *ios; 74 struct exofs_io_state *ios;
70 75
71 /*TODO: Maybe use kmem_cach per sbi of size 76 /*TODO: Maybe use kmem_cach per sbi of size
72 * exofs_io_state_size(sbi->s_numdevs) 77 * exofs_io_state_size(layout->s_numdevs)
73 */ 78 */
74 ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL); 79 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
75 if (unlikely(!ios)) { 80 if (unlikely(!ios)) {
81 EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
82 exofs_io_state_size(layout->s_numdevs));
76 *pios = NULL; 83 *pios = NULL;
77 return -ENOMEM; 84 return -ENOMEM;
78 } 85 }
79 86
80 ios->sbi = sbi; 87 ios->layout = layout;
81 ios->obj.partition = sbi->s_pid; 88 ios->obj.partition = layout->s_pid;
82 *pios = ios; 89 *pios = ios;
83 return 0; 90 return 0;
84} 91}
@@ -101,6 +108,29 @@ void exofs_put_io_state(struct exofs_io_state *ios)
101 } 108 }
102} 109}
103 110
111unsigned exofs_layout_od_id(struct exofs_layout *layout,
112 osd_id obj_no, unsigned layout_index)
113{
114/* switch (layout->lay_func) {
115 case LAYOUT_MOVING_WINDOW:
116 {*/
117 unsigned dev_mod = obj_no;
118
119 return (layout_index + dev_mod * layout->mirrors_p1) %
120 layout->s_numdevs;
121/* }
122 case LAYOUT_FUNC_IMPLICT:
123 return layout->devs[layout_index];
124 }*/
125}
126
127static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
128 unsigned layout_index)
129{
130 return ios->layout->s_ods[
131 exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
132}
133
104static void _sync_done(struct exofs_io_state *ios, void *p) 134static void _sync_done(struct exofs_io_state *ios, void *p)
105{ 135{
106 struct completion *waiting = p; 136 struct completion *waiting = p;
@@ -168,6 +198,21 @@ static int exofs_io_execute(struct exofs_io_state *ios)
168 return ret; 198 return ret;
169} 199}
170 200
201static void _clear_bio(struct bio *bio)
202{
203 struct bio_vec *bv;
204 unsigned i;
205
206 __bio_for_each_segment(bv, bio, i, 0) {
207 unsigned this_count = bv->bv_len;
208
209 if (likely(PAGE_SIZE == this_count))
210 clear_highpage(bv->bv_page);
211 else
212 zero_user(bv->bv_page, bv->bv_offset, this_count);
213 }
214}
215
171int exofs_check_io(struct exofs_io_state *ios, u64 *resid) 216int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
172{ 217{
173 enum osd_err_priority acumulated_osd_err = 0; 218 enum osd_err_priority acumulated_osd_err = 0;
@@ -176,16 +221,25 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
176 221
177 for (i = 0; i < ios->numdevs; i++) { 222 for (i = 0; i < ios->numdevs; i++) {
178 struct osd_sense_info osi; 223 struct osd_sense_info osi;
179 int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi); 224 struct osd_request *or = ios->per_dev[i].or;
225 int ret;
226
227 if (unlikely(!or))
228 continue;
180 229
230 ret = osd_req_decode_sense(or, &osi);
181 if (likely(!ret)) 231 if (likely(!ret))
182 continue; 232 continue;
183 233
184 if (unlikely(ret == -EFAULT)) { 234 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
185 EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__); 235 /* start read offset passed endof file */
186 /*FIXME: All the pages in this device range should: 236 _clear_bio(ios->per_dev[i].bio);
187 * clear_highpage(page); 237 EXOFS_DBGMSG("start read offset passed end of file "
188 */ 238 "offset=0x%llx, length=0x%llx\n",
239 _LLU(ios->per_dev[i].offset),
240 _LLU(ios->per_dev[i].length));
241
242 continue; /* we recovered */
189 } 243 }
190 244
191 if (osi.osd_err_pri >= acumulated_osd_err) { 245 if (osi.osd_err_pri >= acumulated_osd_err) {
@@ -205,14 +259,259 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
205 return acumulated_lin_err; 259 return acumulated_lin_err;
206} 260}
207 261
262/*
263 * L - logical offset into the file
264 *
265 * U - The number of bytes in a stripe within a group
266 *
267 * U = stripe_unit * group_width
268 *
269 * T - The number of bytes striped within a group of component objects
270 * (before advancing to the next group)
271 *
272 * T = stripe_unit * group_width * group_depth
273 *
274 * S - The number of bytes striped across all component objects
275 * before the pattern repeats
276 *
277 * S = stripe_unit * group_width * group_depth * group_count
278 *
279 * M - The "major" (i.e., across all components) stripe number
280 *
281 * M = L / S
282 *
283 * G - Counts the groups from the beginning of the major stripe
284 *
285 * G = (L - (M * S)) / T [or (L % S) / T]
286 *
287 * H - The byte offset within the group
288 *
289 * H = (L - (M * S)) % T [or (L % S) % T]
290 *
291 * N - The "minor" (i.e., across the group) stripe number
292 *
293 * N = H / U
294 *
295 * C - The component index coresponding to L
296 *
297 * C = (H - (N * U)) / stripe_unit + G * group_width
298 * [or (L % U) / stripe_unit + G * group_width]
299 *
300 * O - The component offset coresponding to L
301 *
302 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
303 */
304struct _striping_info {
305 u64 obj_offset;
306 u64 group_length;
307 u64 total_group_length;
308 u64 Major;
309 unsigned dev;
310 unsigned unit_off;
311};
312
313static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
314 struct _striping_info *si)
315{
316 u32 stripe_unit = ios->layout->stripe_unit;
317 u32 group_width = ios->layout->group_width;
318 u64 group_depth = ios->layout->group_depth;
319
320 u32 U = stripe_unit * group_width;
321 u64 T = U * group_depth;
322 u64 S = T * ios->layout->group_count;
323 u64 M = div64_u64(file_offset, S);
324
325 /*
326 G = (L - (M * S)) / T
327 H = (L - (M * S)) % T
328 */
329 u64 LmodS = file_offset - M * S;
330 u32 G = div64_u64(LmodS, T);
331 u64 H = LmodS - G * T;
332
333 u32 N = div_u64(H, U);
334
335 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
336 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
337 si->dev *= ios->layout->mirrors_p1;
338
339 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
340
341 si->obj_offset = si->unit_off + (N * stripe_unit) +
342 (M * group_depth * stripe_unit);
343
344 si->group_length = T - H;
345 si->total_group_length = T;
346 si->Major = M;
347}
348
349static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
350 unsigned pgbase, struct exofs_per_dev_state *per_dev,
351 int cur_len)
352{
353 unsigned pg = *cur_pg;
354 struct request_queue *q =
355 osd_request_queue(exofs_ios_od(ios, per_dev->dev));
356
357 per_dev->length += cur_len;
358
359 if (per_dev->bio == NULL) {
360 unsigned pages_in_stripe = ios->layout->group_width *
361 (ios->layout->stripe_unit / PAGE_SIZE);
362 unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
363 ios->layout->group_width;
364
365 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
366 if (unlikely(!per_dev->bio)) {
367 EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
368 bio_size);
369 return -ENOMEM;
370 }
371 }
372
373 while (cur_len > 0) {
374 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
375 unsigned added_len;
376
377 BUG_ON(ios->nr_pages <= pg);
378 cur_len -= pglen;
379
380 added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
381 pglen, pgbase);
382 if (unlikely(pglen != added_len))
383 return -ENOMEM;
384 pgbase = 0;
385 ++pg;
386 }
387 BUG_ON(cur_len);
388
389 *cur_pg = pg;
390 return 0;
391}
392
393static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
394 struct _striping_info *si, unsigned first_comp)
395{
396 unsigned stripe_unit = ios->layout->stripe_unit;
397 unsigned mirrors_p1 = ios->layout->mirrors_p1;
398 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
399 unsigned dev = si->dev;
400 unsigned first_dev = dev - (dev % devs_in_group);
401 unsigned comp = first_comp + (dev - first_dev);
402 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
403 unsigned cur_pg = ios->pages_consumed;
404 int ret = 0;
405
406 while (length) {
407 struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
408 unsigned cur_len, page_off = 0;
409
410 if (!per_dev->length) {
411 per_dev->dev = dev;
412 if (dev < si->dev) {
413 per_dev->offset = si->obj_offset + stripe_unit -
414 si->unit_off;
415 cur_len = stripe_unit;
416 } else if (dev == si->dev) {
417 per_dev->offset = si->obj_offset;
418 cur_len = stripe_unit - si->unit_off;
419 page_off = si->unit_off & ~PAGE_MASK;
420 BUG_ON(page_off && (page_off != ios->pgbase));
421 } else { /* dev > si->dev */
422 per_dev->offset = si->obj_offset - si->unit_off;
423 cur_len = stripe_unit;
424 }
425
426 if (max_comp < comp)
427 max_comp = comp;
428
429 dev += mirrors_p1;
430 dev = (dev % devs_in_group) + first_dev;
431 } else {
432 cur_len = stripe_unit;
433 }
434 if (cur_len >= length)
435 cur_len = length;
436
437 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
438 cur_len);
439 if (unlikely(ret))
440 goto out;
441
442 comp += mirrors_p1;
443 comp = (comp % devs_in_group) + first_comp;
444
445 length -= cur_len;
446 }
447out:
448 ios->numdevs = max_comp + mirrors_p1;
449 ios->pages_consumed = cur_pg;
450 return ret;
451}
452
453static int _prepare_for_striping(struct exofs_io_state *ios)
454{
455 u64 length = ios->length;
456 struct _striping_info si;
457 unsigned devs_in_group = ios->layout->group_width *
458 ios->layout->mirrors_p1;
459 unsigned first_comp = 0;
460 int ret = 0;
461
462 _calc_stripe_info(ios, ios->offset, &si);
463
464 if (!ios->pages) {
465 if (ios->kern_buff) {
466 struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
467
468 per_dev->offset = si.obj_offset;
469 per_dev->dev = si.dev;
470
471 /* no cross device without page array */
472 BUG_ON((ios->layout->group_width > 1) &&
473 (si.unit_off + ios->length >
474 ios->layout->stripe_unit));
475 }
476 ios->numdevs = ios->layout->mirrors_p1;
477 return 0;
478 }
479
480 while (length) {
481 if (length < si.group_length)
482 si.group_length = length;
483
484 ret = _prepare_one_group(ios, si.group_length, &si, first_comp);
485 if (unlikely(ret))
486 goto out;
487
488 length -= si.group_length;
489
490 si.group_length = si.total_group_length;
491 si.unit_off = 0;
492 ++si.Major;
493 si.obj_offset = si.Major * ios->layout->stripe_unit *
494 ios->layout->group_depth;
495
496 si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
497 si.dev %= ios->layout->s_numdevs;
498
499 first_comp += devs_in_group;
500 first_comp %= ios->layout->s_numdevs;
501 }
502
503out:
504 return ret;
505}
506
208int exofs_sbi_create(struct exofs_io_state *ios) 507int exofs_sbi_create(struct exofs_io_state *ios)
209{ 508{
210 int i, ret; 509 int i, ret;
211 510
212 for (i = 0; i < ios->sbi->s_numdevs; i++) { 511 for (i = 0; i < ios->layout->s_numdevs; i++) {
213 struct osd_request *or; 512 struct osd_request *or;
214 513
215 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); 514 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
216 if (unlikely(!or)) { 515 if (unlikely(!or)) {
217 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 516 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
218 ret = -ENOMEM; 517 ret = -ENOMEM;
@@ -233,10 +532,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios)
233{ 532{
234 int i, ret; 533 int i, ret;
235 534
236 for (i = 0; i < ios->sbi->s_numdevs; i++) { 535 for (i = 0; i < ios->layout->s_numdevs; i++) {
237 struct osd_request *or; 536 struct osd_request *or;
238 537
239 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); 538 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
240 if (unlikely(!or)) { 539 if (unlikely(!or)) {
241 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 540 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
242 ret = -ENOMEM; 541 ret = -ENOMEM;
@@ -253,51 +552,74 @@ out:
253 return ret; 552 return ret;
254} 553}
255 554
256int exofs_sbi_write(struct exofs_io_state *ios) 555static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
257{ 556{
258 int i, ret; 557 struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
558 unsigned dev = ios->per_dev[cur_comp].dev;
559 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
560 int ret = 0;
259 561
260 for (i = 0; i < ios->sbi->s_numdevs; i++) { 562 if (ios->pages && !master_dev->length)
563 return 0; /* Just an empty slot */
564
565 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
566 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
261 struct osd_request *or; 567 struct osd_request *or;
262 568
263 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); 569 or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
264 if (unlikely(!or)) { 570 if (unlikely(!or)) {
265 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 571 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
266 ret = -ENOMEM; 572 ret = -ENOMEM;
267 goto out; 573 goto out;
268 } 574 }
269 ios->per_dev[i].or = or; 575 per_dev->or = or;
270 ios->numdevs++; 576 per_dev->offset = master_dev->offset;
271 577
272 if (ios->bio) { 578 if (ios->pages) {
273 struct bio *bio; 579 struct bio *bio;
274 580
275 if (i != 0) { 581 if (per_dev != master_dev) {
276 bio = bio_kmalloc(GFP_KERNEL, 582 bio = bio_kmalloc(GFP_KERNEL,
277 ios->bio->bi_max_vecs); 583 master_dev->bio->bi_max_vecs);
278 if (unlikely(!bio)) { 584 if (unlikely(!bio)) {
585 EXOFS_DBGMSG(
586 "Faild to allocate BIO size=%u\n",
587 master_dev->bio->bi_max_vecs);
279 ret = -ENOMEM; 588 ret = -ENOMEM;
280 goto out; 589 goto out;
281 } 590 }
282 591
283 __bio_clone(bio, ios->bio); 592 __bio_clone(bio, master_dev->bio);
284 bio->bi_bdev = NULL; 593 bio->bi_bdev = NULL;
285 bio->bi_next = NULL; 594 bio->bi_next = NULL;
286 ios->per_dev[i].bio = bio; 595 per_dev->length = master_dev->length;
596 per_dev->bio = bio;
597 per_dev->dev = dev;
287 } else { 598 } else {
288 bio = ios->bio; 599 bio = master_dev->bio;
600 /* FIXME: bio_set_dir() */
601 bio->bi_rw |= (1 << BIO_RW);
289 } 602 }
290 603
291 osd_req_write(or, &ios->obj, ios->offset, bio, 604 osd_req_write(or, &ios->obj, per_dev->offset, bio,
292 ios->length); 605 per_dev->length);
293/* EXOFS_DBGMSG("write sync=%d\n", sync);*/ 606 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
607 "length=0x%llx dev=%d\n",
608 _LLU(ios->obj.id), _LLU(per_dev->offset),
609 _LLU(per_dev->length), dev);
294 } else if (ios->kern_buff) { 610 } else if (ios->kern_buff) {
295 osd_req_write_kern(or, &ios->obj, ios->offset, 611 ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
296 ios->kern_buff, ios->length); 612 ios->kern_buff, ios->length);
297/* EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/ 613 if (unlikely(ret))
614 goto out;
615 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
616 "length=0x%llx dev=%d\n",
617 _LLU(ios->obj.id), _LLU(per_dev->offset),
618 _LLU(ios->length), dev);
298 } else { 619 } else {
299 osd_req_set_attributes(or, &ios->obj); 620 osd_req_set_attributes(or, &ios->obj);
300/* EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/ 621 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
622 _LLU(ios->obj.id), ios->out_attr_len, dev);
301 } 623 }
302 624
303 if (ios->out_attr) 625 if (ios->out_attr)
@@ -308,54 +630,93 @@ int exofs_sbi_write(struct exofs_io_state *ios)
308 osd_req_add_get_attr_list(or, ios->in_attr, 630 osd_req_add_get_attr_list(or, ios->in_attr,
309 ios->in_attr_len); 631 ios->in_attr_len);
310 } 632 }
311 ret = exofs_io_execute(ios);
312 633
313out: 634out:
314 return ret; 635 return ret;
315} 636}
316 637
317int exofs_sbi_read(struct exofs_io_state *ios) 638int exofs_sbi_write(struct exofs_io_state *ios)
318{ 639{
319 int i, ret; 640 int i;
641 int ret;
320 642
321 for (i = 0; i < 1; i++) { 643 ret = _prepare_for_striping(ios);
322 struct osd_request *or; 644 if (unlikely(ret))
323 unsigned first_dev = (unsigned)ios->obj.id; 645 return ret;
324 646
325 first_dev %= ios->sbi->s_numdevs; 647 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
326 or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL); 648 ret = _sbi_write_mirror(ios, i);
327 if (unlikely(!or)) { 649 if (unlikely(ret))
328 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 650 return ret;
329 ret = -ENOMEM; 651 }
330 goto out;
331 }
332 ios->per_dev[i].or = or;
333 ios->numdevs++;
334 652
335 if (ios->bio) { 653 ret = exofs_io_execute(ios);
336 osd_req_read(or, &ios->obj, ios->offset, ios->bio, 654 return ret;
337 ios->length); 655}
338/* EXOFS_DBGMSG("read sync=%d\n", sync);*/
339 } else if (ios->kern_buff) {
340 osd_req_read_kern(or, &ios->obj, ios->offset,
341 ios->kern_buff, ios->length);
342/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
343 } else {
344 osd_req_get_attributes(or, &ios->obj);
345/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
346 }
347 656
348 if (ios->out_attr) 657static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
349 osd_req_add_set_attr_list(or, ios->out_attr, 658{
350 ios->out_attr_len); 659 struct osd_request *or;
660 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
661 unsigned first_dev = (unsigned)ios->obj.id;
351 662
352 if (ios->in_attr) 663 if (ios->pages && !per_dev->length)
353 osd_req_add_get_attr_list(or, ios->in_attr, 664 return 0; /* Just an empty slot */
354 ios->in_attr_len); 665
666 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
667 or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
668 if (unlikely(!or)) {
669 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
670 return -ENOMEM;
355 } 671 }
356 ret = exofs_io_execute(ios); 672 per_dev->or = or;
673
674 if (ios->pages) {
675 osd_req_read(or, &ios->obj, per_dev->offset,
676 per_dev->bio, per_dev->length);
677 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
678 " dev=%d\n", _LLU(ios->obj.id),
679 _LLU(per_dev->offset), _LLU(per_dev->length),
680 first_dev);
681 } else if (ios->kern_buff) {
682 int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
683 ios->kern_buff, ios->length);
684 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
685 "length=0x%llx dev=%d ret=>%d\n",
686 _LLU(ios->obj.id), _LLU(per_dev->offset),
687 _LLU(ios->length), first_dev, ret);
688 if (unlikely(ret))
689 return ret;
690 } else {
691 osd_req_get_attributes(or, &ios->obj);
692 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
693 _LLU(ios->obj.id), ios->in_attr_len, first_dev);
694 }
695 if (ios->out_attr)
696 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
357 697
358out: 698 if (ios->in_attr)
699 osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
700
701 return 0;
702}
703
704int exofs_sbi_read(struct exofs_io_state *ios)
705{
706 int i;
707 int ret;
708
709 ret = _prepare_for_striping(ios);
710 if (unlikely(ret))
711 return ret;
712
713 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
714 ret = _sbi_read_mirror(ios, i);
715 if (unlikely(ret))
716 return ret;
717 }
718
719 ret = exofs_io_execute(ios);
359 return ret; 720 return ret;
360} 721}
361 722
@@ -380,42 +741,82 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
380 return -EIO; 741 return -EIO;
381} 742}
382 743
744static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
745 struct osd_attr *attr)
746{
747 int last_comp = cur_comp + ios->layout->mirrors_p1;
748
749 for (; cur_comp < last_comp; ++cur_comp) {
750 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
751 struct osd_request *or;
752
753 or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
754 if (unlikely(!or)) {
755 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
756 return -ENOMEM;
757 }
758 per_dev->or = or;
759
760 osd_req_set_attributes(or, &ios->obj);
761 osd_req_add_set_attr_list(or, attr, 1);
762 }
763
764 return 0;
765}
766
383int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) 767int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
384{ 768{
385 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; 769 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
386 struct exofs_io_state *ios; 770 struct exofs_io_state *ios;
387 struct osd_attr attr; 771 struct exofs_trunc_attr {
388 __be64 newsize; 772 struct osd_attr attr;
773 __be64 newsize;
774 } *size_attrs;
775 struct _striping_info si;
389 int i, ret; 776 int i, ret;
390 777
391 if (exofs_get_io_state(sbi, &ios)) 778 ret = exofs_get_io_state(&sbi->layout, &ios);
392 return -ENOMEM; 779 if (unlikely(ret))
780 return ret;
781
782 size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
783 GFP_KERNEL);
784 if (unlikely(!size_attrs)) {
785 ret = -ENOMEM;
786 goto out;
787 }
393 788
394 ios->obj.id = exofs_oi_objno(oi); 789 ios->obj.id = exofs_oi_objno(oi);
395 ios->cred = oi->i_cred; 790 ios->cred = oi->i_cred;
396 791
397 newsize = cpu_to_be64(size); 792 ios->numdevs = ios->layout->s_numdevs;
398 attr = g_attr_logical_length; 793 _calc_stripe_info(ios, size, &si);
399 attr.val_ptr = &newsize;
400 794
401 for (i = 0; i < sbi->s_numdevs; i++) { 795 for (i = 0; i < ios->layout->group_width; ++i) {
402 struct osd_request *or; 796 struct exofs_trunc_attr *size_attr = &size_attrs[i];
797 u64 obj_size;
403 798
404 or = osd_start_request(sbi->s_ods[i], GFP_KERNEL); 799 if (i < si.dev)
405 if (unlikely(!or)) { 800 obj_size = si.obj_offset +
406 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 801 ios->layout->stripe_unit - si.unit_off;
407 ret = -ENOMEM; 802 else if (i == si.dev)
408 goto out; 803 obj_size = si.obj_offset;
409 } 804 else /* i > si.dev */
410 ios->per_dev[i].or = or; 805 obj_size = si.obj_offset - si.unit_off;
411 ios->numdevs++;
412 806
413 osd_req_set_attributes(or, &ios->obj); 807 size_attr->newsize = cpu_to_be64(obj_size);
414 osd_req_add_set_attr_list(or, &attr, 1); 808 size_attr->attr = g_attr_logical_length;
809 size_attr->attr.val_ptr = &size_attr->newsize;
810
811 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
812 &size_attr->attr);
813 if (unlikely(ret))
814 goto out;
415 } 815 }
416 ret = exofs_io_execute(ios); 816 ret = exofs_io_execute(ios);
417 817
418out: 818out:
819 kfree(size_attrs);
419 exofs_put_io_state(ios); 820 exofs_put_io_state(ios);
420 return ret; 821 return ret;
421} 822}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index a1d1e77b12eb..6cf5e4e84d61 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -210,7 +210,7 @@ int exofs_sync_fs(struct super_block *sb, int wait)
210 sbi = sb->s_fs_info; 210 sbi = sb->s_fs_info;
211 fscb = &sbi->s_fscb; 211 fscb = &sbi->s_fscb;
212 212
213 ret = exofs_get_io_state(sbi, &ios); 213 ret = exofs_get_io_state(&sbi->layout, &ios);
214 if (ret) 214 if (ret)
215 goto out; 215 goto out;
216 216
@@ -264,12 +264,12 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
264 264
265void exofs_free_sbi(struct exofs_sb_info *sbi) 265void exofs_free_sbi(struct exofs_sb_info *sbi)
266{ 266{
267 while (sbi->s_numdevs) { 267 while (sbi->layout.s_numdevs) {
268 int i = --sbi->s_numdevs; 268 int i = --sbi->layout.s_numdevs;
269 struct osd_dev *od = sbi->s_ods[i]; 269 struct osd_dev *od = sbi->layout.s_ods[i];
270 270
271 if (od) { 271 if (od) {
272 sbi->s_ods[i] = NULL; 272 sbi->layout.s_ods[i] = NULL;
273 osduld_put_device(od); 273 osduld_put_device(od);
274 } 274 }
275 } 275 }
@@ -298,7 +298,8 @@ static void exofs_put_super(struct super_block *sb)
298 msecs_to_jiffies(100)); 298 msecs_to_jiffies(100));
299 } 299 }
300 300
301 _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid); 301 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
302 sbi->layout.s_pid);
302 303
303 exofs_free_sbi(sbi); 304 exofs_free_sbi(sbi);
304 sb->s_fs_info = NULL; 305 sb->s_fs_info = NULL;
@@ -307,6 +308,8 @@ static void exofs_put_super(struct super_block *sb)
307static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, 308static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
308 struct exofs_device_table *dt) 309 struct exofs_device_table *dt)
309{ 310{
311 u64 stripe_length;
312
310 sbi->data_map.odm_num_comps = 313 sbi->data_map.odm_num_comps =
311 le32_to_cpu(dt->dt_data_map.cb_num_comps); 314 le32_to_cpu(dt->dt_data_map.cb_num_comps);
312 sbi->data_map.odm_stripe_unit = 315 sbi->data_map.odm_stripe_unit =
@@ -320,14 +323,63 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
320 sbi->data_map.odm_raid_algorithm = 323 sbi->data_map.odm_raid_algorithm =
321 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); 324 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
322 325
323/* FIXME: Hard coded mirror only for now. if not so do not mount */ 326/* FIXME: Only raid0 for now. if not so, do not mount */
324 if ((sbi->data_map.odm_num_comps != numdevs) || 327 if (sbi->data_map.odm_num_comps != numdevs) {
325 (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || 328 EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
326 (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) || 329 sbi->data_map.odm_num_comps, numdevs);
327 (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
328 return -EINVAL; 330 return -EINVAL;
329 else 331 }
330 return 0; 332 if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
333 EXOFS_ERR("Only RAID_0 for now\n");
334 return -EINVAL;
335 }
336 if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
337 EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
338 numdevs, sbi->data_map.odm_mirror_cnt);
339 return -EINVAL;
340 }
341
342 if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
343 EXOFS_ERR("Stripe Unit(0x%llx)"
344 " must be Multples of PAGE_SIZE(0x%lx)\n",
345 _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
346 return -EINVAL;
347 }
348
349 sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
350 sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
351
352 if (sbi->data_map.odm_group_width) {
353 sbi->layout.group_width = sbi->data_map.odm_group_width;
354 sbi->layout.group_depth = sbi->data_map.odm_group_depth;
355 if (!sbi->layout.group_depth) {
356 EXOFS_ERR("group_depth == 0 && group_width != 0\n");
357 return -EINVAL;
358 }
359 sbi->layout.group_count = sbi->data_map.odm_num_comps /
360 sbi->layout.mirrors_p1 /
361 sbi->data_map.odm_group_width;
362 } else {
363 if (sbi->data_map.odm_group_depth) {
364 printk(KERN_NOTICE "Warning: group_depth ignored "
365 "group_width == 0 && group_depth == %d\n",
366 sbi->data_map.odm_group_depth);
367 sbi->data_map.odm_group_depth = 0;
368 }
369 sbi->layout.group_width = sbi->data_map.odm_num_comps /
370 sbi->layout.mirrors_p1;
371 sbi->layout.group_depth = -1;
372 sbi->layout.group_count = 1;
373 }
374
375 stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
376 if (stripe_length >= (1ULL << 32)) {
377 EXOFS_ERR("Total Stripe length(0x%llx)"
378 " >= 32bit is not supported\n", _LLU(stripe_length));
379 return -EINVAL;
380 }
381
382 return 0;
331} 383}
332 384
333/* @odi is valid only as long as @fscb_dev is valid */ 385/* @odi is valid only as long as @fscb_dev is valid */
@@ -361,7 +413,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
361{ 413{
362 struct exofs_sb_info *sbi = *psbi; 414 struct exofs_sb_info *sbi = *psbi;
363 struct osd_dev *fscb_od; 415 struct osd_dev *fscb_od;
364 struct osd_obj_id obj = {.partition = sbi->s_pid, 416 struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
365 .id = EXOFS_DEVTABLE_ID}; 417 .id = EXOFS_DEVTABLE_ID};
366 struct exofs_device_table *dt; 418 struct exofs_device_table *dt;
367 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + 419 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
@@ -376,9 +428,9 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
376 return -ENOMEM; 428 return -ENOMEM;
377 } 429 }
378 430
379 fscb_od = sbi->s_ods[0]; 431 fscb_od = sbi->layout.s_ods[0];
380 sbi->s_ods[0] = NULL; 432 sbi->layout.s_ods[0] = NULL;
381 sbi->s_numdevs = 0; 433 sbi->layout.s_numdevs = 0;
382 ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); 434 ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
383 if (unlikely(ret)) { 435 if (unlikely(ret)) {
384 EXOFS_ERR("ERROR: reading device table\n"); 436 EXOFS_ERR("ERROR: reading device table\n");
@@ -397,14 +449,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
397 goto out; 449 goto out;
398 450
399 if (likely(numdevs > 1)) { 451 if (likely(numdevs > 1)) {
400 unsigned size = numdevs * sizeof(sbi->s_ods[0]); 452 unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);
401 453
402 sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); 454 sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
403 if (unlikely(!sbi)) { 455 if (unlikely(!sbi)) {
404 ret = -ENOMEM; 456 ret = -ENOMEM;
405 goto out; 457 goto out;
406 } 458 }
407 memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0])); 459 memset(&sbi->layout.s_ods[1], 0,
460 size - sizeof(sbi->layout.s_ods[0]));
408 *psbi = sbi; 461 *psbi = sbi;
409 } 462 }
410 463
@@ -427,8 +480,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
427 * line. We always keep them in device-table order. 480 * line. We always keep them in device-table order.
428 */ 481 */
429 if (fscb_od && osduld_device_same(fscb_od, &odi)) { 482 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
430 sbi->s_ods[i] = fscb_od; 483 sbi->layout.s_ods[i] = fscb_od;
431 ++sbi->s_numdevs; 484 ++sbi->layout.s_numdevs;
432 fscb_od = NULL; 485 fscb_od = NULL;
433 continue; 486 continue;
434 } 487 }
@@ -441,8 +494,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
441 goto out; 494 goto out;
442 } 495 }
443 496
444 sbi->s_ods[i] = od; 497 sbi->layout.s_ods[i] = od;
445 ++sbi->s_numdevs; 498 ++sbi->layout.s_numdevs;
446 499
447 /* Read the fscb of the other devices to make sure the FS 500 /* Read the fscb of the other devices to make sure the FS
448 * partition is there. 501 * partition is there.
@@ -499,9 +552,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
499 goto free_sbi; 552 goto free_sbi;
500 } 553 }
501 554
502 sbi->s_ods[0] = od; 555 /* Default layout in case we do not have a device-table */
503 sbi->s_numdevs = 1; 556 sbi->layout.stripe_unit = PAGE_SIZE;
504 sbi->s_pid = opts->pid; 557 sbi->layout.mirrors_p1 = 1;
558 sbi->layout.group_width = 1;
559 sbi->layout.group_depth = -1;
560 sbi->layout.group_count = 1;
561 sbi->layout.s_ods[0] = od;
562 sbi->layout.s_numdevs = 1;
563 sbi->layout.s_pid = opts->pid;
505 sbi->s_timeout = opts->timeout; 564 sbi->s_timeout = opts->timeout;
506 565
507 /* fill in some other data by hand */ 566 /* fill in some other data by hand */
@@ -514,7 +573,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
514 sb->s_bdev = NULL; 573 sb->s_bdev = NULL;
515 sb->s_dev = 0; 574 sb->s_dev = 0;
516 575
517 obj.partition = sbi->s_pid; 576 obj.partition = sbi->layout.s_pid;
518 obj.id = EXOFS_SUPER_ID; 577 obj.id = EXOFS_SUPER_ID;
519 exofs_make_credential(sbi->s_cred, &obj); 578 exofs_make_credential(sbi->s_cred, &obj);
520 579
@@ -578,13 +637,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
578 goto free_sbi; 637 goto free_sbi;
579 } 638 }
580 639
581 _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0], 640 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
582 sbi->s_pid); 641 sbi->layout.s_pid);
583 return 0; 642 return 0;
584 643
585free_sbi: 644free_sbi:
586 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 645 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
587 opts->dev_name, sbi->s_pid, ret); 646 opts->dev_name, sbi->layout.s_pid, ret);
588 exofs_free_sbi(sbi); 647 exofs_free_sbi(sbi);
589 return ret; 648 return ret;
590} 649}
@@ -627,7 +686,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
627 uint8_t cred_a[OSD_CAP_LEN]; 686 uint8_t cred_a[OSD_CAP_LEN];
628 int ret; 687 int ret;
629 688
630 ret = exofs_get_io_state(sbi, &ios); 689 ret = exofs_get_io_state(&sbi->layout, &ios);
631 if (ret) { 690 if (ret) {
632 EXOFS_DBGMSG("exofs_get_io_state failed.\n"); 691 EXOFS_DBGMSG("exofs_get_io_state failed.\n");
633 return ret; 692 return ret;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 061914add3cf..0b038e47ad2f 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
118 118
119/* inode.c */ 119/* inode.c */
120extern struct inode *ext2_iget (struct super_block *, unsigned long); 120extern struct inode *ext2_iget (struct super_block *, unsigned long);
121extern int ext2_write_inode (struct inode *, int); 121extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 71b032c65a02..36ae1cac767c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others");
41MODULE_DESCRIPTION("Second Extended Filesystem"); 41MODULE_DESCRIPTION("Second Extended Filesystem");
42MODULE_LICENSE("GPL"); 42MODULE_LICENSE("GPL");
43 43
44static int __ext2_write_inode(struct inode *inode, int do_sync);
45
44/* 46/*
45 * Test whether an inode is a fast symlink. 47 * Test whether an inode is a fast symlink.
46 */ 48 */
@@ -64,7 +66,7 @@ void ext2_delete_inode (struct inode * inode)
64 goto no_delete; 66 goto no_delete;
65 EXT2_I(inode)->i_dtime = get_seconds(); 67 EXT2_I(inode)->i_dtime = get_seconds();
66 mark_inode_dirty(inode); 68 mark_inode_dirty(inode);
67 ext2_write_inode(inode, inode_needs_sync(inode)); 69 __ext2_write_inode(inode, inode_needs_sync(inode));
68 70
69 inode->i_size = 0; 71 inode->i_size = 0;
70 if (inode->i_blocks) 72 if (inode->i_blocks)
@@ -1335,7 +1337,7 @@ bad_inode:
1335 return ERR_PTR(ret); 1337 return ERR_PTR(ret);
1336} 1338}
1337 1339
1338int ext2_write_inode(struct inode *inode, int do_sync) 1340static int __ext2_write_inode(struct inode *inode, int do_sync)
1339{ 1341{
1340 struct ext2_inode_info *ei = EXT2_I(inode); 1342 struct ext2_inode_info *ei = EXT2_I(inode);
1341 struct super_block *sb = inode->i_sb; 1343 struct super_block *sb = inode->i_sb;
@@ -1440,6 +1442,11 @@ int ext2_write_inode(struct inode *inode, int do_sync)
1440 return err; 1442 return err;
1441} 1443}
1442 1444
1445int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1446{
1447 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1448}
1449
1443int ext2_sync_inode(struct inode *inode) 1450int ext2_sync_inode(struct inode *inode)
1444{ 1451{
1445 struct writeback_control wbc = { 1452 struct writeback_control wbc = {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 455e6e6e5cb9..7aca55fcc976 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3096,7 +3096,7 @@ out_brelse:
3096 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3096 * `stuff()' is running, and the new i_size will be lost. Plus the inode
3097 * will no longer be on the superblock's dirty inode list. 3097 * will no longer be on the superblock's dirty inode list.
3098 */ 3098 */
3099int ext3_write_inode(struct inode *inode, int wait) 3099int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3100{ 3100{
3101 if (current->flags & PF_MEMALLOC) 3101 if (current->flags & PF_MEMALLOC)
3102 return 0; 3102 return 0;
@@ -3107,7 +3107,7 @@ int ext3_write_inode(struct inode *inode, int wait)
3107 return -EIO; 3107 return -EIO;
3108 } 3108 }
3109 3109
3110 if (!wait) 3110 if (wbc->sync_mode != WB_SYNC_ALL)
3111 return 0; 3111 return 0;
3112 3112
3113 return ext3_force_commit(inode->i_sb); 3113 return ext3_force_commit(inode->i_sb);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 22bc7435d913..d2f37a5516c7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
97 /* If checksum is bad mark all blocks used to prevent allocation 97 /* If checksum is bad mark all blocks used to prevent allocation
98 * essentially implementing a per-group read-only flag. */ 98 * essentially implementing a per-group read-only flag. */
99 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 99 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
100 ext4_error(sb, __func__, 100 ext4_error(sb, "Checksum bad for group %u",
101 "Checksum bad for group %u", block_group); 101 block_group);
102 ext4_free_blks_set(sb, gdp, 0); 102 ext4_free_blks_set(sb, gdp, 0);
103 ext4_free_inodes_set(sb, gdp, 0); 103 ext4_free_inodes_set(sb, gdp, 0);
104 ext4_itable_unused_set(sb, gdp, 0); 104 ext4_itable_unused_set(sb, gdp, 0);
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
130 * to make sure we calculate the right free blocks 130 * to make sure we calculate the right free blocks
131 */ 131 */
132 group_blocks = ext4_blocks_count(sbi->s_es) - 132 group_blocks = ext4_blocks_count(sbi->s_es) -
133 le32_to_cpu(sbi->s_es->s_first_data_block) - 133 ext4_group_first_block_no(sb, ngroups - 1);
134 (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
135 } else { 134 } else {
136 group_blocks = EXT4_BLOCKS_PER_GROUP(sb); 135 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
137 } 136 }
@@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
189 * when a file system is mounted (see ext4_fill_super). 188 * when a file system is mounted (see ext4_fill_super).
190 */ 189 */
191 190
192
193#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
194
195/** 191/**
196 * ext4_get_group_desc() -- load group descriptor from disk 192 * ext4_get_group_desc() -- load group descriptor from disk
197 * @sb: super block 193 * @sb: super block
@@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
210 struct ext4_sb_info *sbi = EXT4_SB(sb); 206 struct ext4_sb_info *sbi = EXT4_SB(sb);
211 207
212 if (block_group >= ngroups) { 208 if (block_group >= ngroups) {
213 ext4_error(sb, "ext4_get_group_desc", 209 ext4_error(sb, "block_group >= groups_count - block_group = %u,"
214 "block_group >= groups_count - " 210 " groups_count = %u", block_group, ngroups);
215 "block_group = %u, groups_count = %u",
216 block_group, ngroups);
217 211
218 return NULL; 212 return NULL;
219 } 213 }
@@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
221 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); 215 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
222 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); 216 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
223 if (!sbi->s_group_desc[group_desc]) { 217 if (!sbi->s_group_desc[group_desc]) {
224 ext4_error(sb, "ext4_get_group_desc", 218 ext4_error(sb, "Group descriptor not loaded - "
225 "Group descriptor not loaded - "
226 "block_group = %u, group_desc = %u, desc = %u", 219 "block_group = %u, group_desc = %u, desc = %u",
227 block_group, group_desc, offset); 220 block_group, group_desc, offset);
228 return NULL; 221 return NULL;
@@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
282 return 1; 275 return 1;
283 276
284err_out: 277err_out:
285 ext4_error(sb, __func__, 278 ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
286 "Invalid block bitmap - "
287 "block_group = %d, block = %llu",
288 block_group, bitmap_blk); 279 block_group, bitmap_blk);
289 return 0; 280 return 0;
290} 281}
@@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
311 bitmap_blk = ext4_block_bitmap(sb, desc); 302 bitmap_blk = ext4_block_bitmap(sb, desc);
312 bh = sb_getblk(sb, bitmap_blk); 303 bh = sb_getblk(sb, bitmap_blk);
313 if (unlikely(!bh)) { 304 if (unlikely(!bh)) {
314 ext4_error(sb, __func__, 305 ext4_error(sb, "Cannot read block bitmap - "
315 "Cannot read block bitmap - "
316 "block_group = %u, block_bitmap = %llu", 306 "block_group = %u, block_bitmap = %llu",
317 block_group, bitmap_blk); 307 block_group, bitmap_blk);
318 return NULL; 308 return NULL;
@@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
354 set_bitmap_uptodate(bh); 344 set_bitmap_uptodate(bh);
355 if (bh_submit_read(bh) < 0) { 345 if (bh_submit_read(bh) < 0) {
356 put_bh(bh); 346 put_bh(bh);
357 ext4_error(sb, __func__, 347 ext4_error(sb, "Cannot read block bitmap - "
358 "Cannot read block bitmap - "
359 "block_group = %u, block_bitmap = %llu", 348 "block_group = %u, block_bitmap = %llu",
360 block_group, bitmap_blk); 349 block_group, bitmap_blk);
361 return NULL; 350 return NULL;
@@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
419 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 408 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
420 in_range(block + count - 1, ext4_inode_table(sb, desc), 409 in_range(block + count - 1, ext4_inode_table(sb, desc),
421 sbi->s_itb_per_group)) { 410 sbi->s_itb_per_group)) {
422 ext4_error(sb, __func__, 411 ext4_error(sb, "Adding blocks in system zones - "
423 "Adding blocks in system zones - "
424 "Block = %llu, count = %lu", 412 "Block = %llu, count = %lu",
425 block, count); 413 block, count);
426 goto error_return; 414 goto error_return;
@@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
453 BUFFER_TRACE(bitmap_bh, "clear bit"); 441 BUFFER_TRACE(bitmap_bh, "clear bit");
454 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 442 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
455 bit + i, bitmap_bh->b_data)) { 443 bit + i, bitmap_bh->b_data)) {
456 ext4_error(sb, __func__, 444 ext4_error(sb, "bit already cleared for block %llu",
457 "bit already cleared for block %llu",
458 (ext4_fsblk_t)(block + i)); 445 (ext4_fsblk_t)(block + i));
459 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 446 BUFFER_TRACE(bitmap_bh, "bit already cleared");
460 } else { 447 } else {
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index a60ab9aad57d..983f0e127493 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -205,14 +205,14 @@ void ext4_release_system_zone(struct super_block *sb)
205 entry = rb_entry(n, struct ext4_system_zone, node); 205 entry = rb_entry(n, struct ext4_system_zone, node);
206 kmem_cache_free(ext4_system_zone_cachep, entry); 206 kmem_cache_free(ext4_system_zone_cachep, entry);
207 if (!parent) 207 if (!parent)
208 EXT4_SB(sb)->system_blks.rb_node = NULL; 208 EXT4_SB(sb)->system_blks = RB_ROOT;
209 else if (parent->rb_left == n) 209 else if (parent->rb_left == n)
210 parent->rb_left = NULL; 210 parent->rb_left = NULL;
211 else if (parent->rb_right == n) 211 else if (parent->rb_right == n)
212 parent->rb_right = NULL; 212 parent->rb_right = NULL;
213 n = parent; 213 n = parent;
214 } 214 }
215 EXT4_SB(sb)->system_blks.rb_node = NULL; 215 EXT4_SB(sb)->system_blks = RB_ROOT;
216} 216}
217 217
218/* 218/*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e262..86cb6d86a048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 ext4_error(dir->i_sb, function, 86 __ext4_error(dir->i_sb, function,
87 "bad entry in directory #%lu: %s - " 87 "bad entry in directory #%lu: %s - block=%llu"
88 "offset=%u, inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, offset, 89 dir->i_ino, error_msg,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset,
90 le32_to_cpu(de->inode), 92 le32_to_cpu(de->inode),
91 rlen, de->name_len); 93 rlen, de->name_len);
92 return error_msg == NULL ? 1 : 0; 94 return error_msg == NULL ? 1 : 0;
@@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp,
150 */ 152 */
151 if (!bh) { 153 if (!bh) {
152 if (!dir_has_error) { 154 if (!dir_has_error) {
153 ext4_error(sb, __func__, "directory #%lu " 155 ext4_error(sb, "directory #%lu "
154 "contains a hole at offset %Lu", 156 "contains a hole at offset %Lu",
155 inode->i_ino, 157 inode->i_ino,
156 (unsigned long long) filp->f_pos); 158 (unsigned long long) filp->f_pos);
@@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root)
303 kfree(old); 305 kfree(old);
304 } 306 }
305 if (!parent) 307 if (!parent)
306 root->rb_node = NULL; 308 *root = RB_ROOT;
307 else if (parent->rb_left == n) 309 else if (parent->rb_left == n)
308 parent->rb_left = NULL; 310 parent->rb_left = NULL;
309 else if (parent->rb_right == n) 311 else if (parent->rb_right == n)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 874d169a193e..bf938cf7c5f0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,6 +53,12 @@
53#define ext4_debug(f, a...) do {} while (0) 53#define ext4_debug(f, a...) do {} while (0)
54#endif 54#endif
55 55
56#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a);
58
59#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a);
61
56/* data type for block offset of block group */ 62/* data type for block offset of block group */
57typedef int ext4_grpblk_t; 63typedef int ext4_grpblk_t;
58 64
@@ -133,14 +139,14 @@ struct mpage_da_data {
133 int pages_written; 139 int pages_written;
134 int retval; 140 int retval;
135}; 141};
136#define DIO_AIO_UNWRITTEN 0x1 142#define EXT4_IO_UNWRITTEN 0x1
137typedef struct ext4_io_end { 143typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */ 144 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */ 145 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */ 146 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */ 147 struct page *page; /* page struct for buffer write */
142 ext4_lblk_t offset; /* offset in the file */ 148 loff_t offset; /* offset in the file */
143 size_t size; /* size of the extent */ 149 ssize_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */ 150 struct work_struct work; /* data work queue */
145} ext4_io_end_t; 151} ext4_io_end_t;
146 152
@@ -284,10 +290,12 @@ struct flex_groups {
284#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 290#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
285#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ 291#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
286#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ 292#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
293#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
294#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
287#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 295#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
288 296
289#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 297#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
290#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ 298#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
291 299
292/* Flags that should be inherited by new inodes from their parent. */ 300/* Flags that should be inherited by new inodes from their parent. */
293#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ 301#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
313 return flags & EXT4_OTHER_FLMASK; 321 return flags & EXT4_OTHER_FLMASK;
314} 322}
315 323
316/*
317 * Inode dynamic state flags
318 */
319#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
320#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
321#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
322#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
323#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
324#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
325#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
326
327/* Used to pass group descriptor data when online resize is done */ 324/* Used to pass group descriptor data when online resize is done */
328struct ext4_new_group_input { 325struct ext4_new_group_input {
329 __u32 group; /* Group number for this data */ 326 __u32 group; /* Group number for this data */
@@ -364,19 +361,20 @@ struct ext4_new_group_data {
364 /* caller is from the direct IO path, request to creation of an 361 /* caller is from the direct IO path, request to creation of an
365 unitialized extents if not allocated, split the uninitialized 362 unitialized extents if not allocated, split the uninitialized
366 extent if blocks has been preallocated already*/ 363 extent if blocks has been preallocated already*/
367#define EXT4_GET_BLOCKS_DIO 0x0008 364#define EXT4_GET_BLOCKS_PRE_IO 0x0008
368#define EXT4_GET_BLOCKS_CONVERT 0x0010 365#define EXT4_GET_BLOCKS_CONVERT 0x0010
369#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ 366#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
367 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
368 /* Convert extent to initialized after IO complete */
369#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
370 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) 370 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
371 /* Convert extent to initialized after direct IO complete */
372#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
373 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
374 371
375/* 372/*
376 * Flags used by ext4_free_blocks 373 * Flags used by ext4_free_blocks
377 */ 374 */
378#define EXT4_FREE_BLOCKS_METADATA 0x0001 375#define EXT4_FREE_BLOCKS_METADATA 0x0001
379#define EXT4_FREE_BLOCKS_FORGET 0x0002 376#define EXT4_FREE_BLOCKS_FORGET 0x0002
377#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
380 378
381/* 379/*
382 * ioctl commands 380 * ioctl commands
@@ -630,7 +628,7 @@ struct ext4_inode_info {
630 * near to their parent directory's inode. 628 * near to their parent directory's inode.
631 */ 629 */
632 ext4_group_t i_block_group; 630 ext4_group_t i_block_group;
633 __u32 i_state; /* Dynamic state flags for ext4 */ 631 unsigned long i_state_flags; /* Dynamic state flags */
634 632
635 ext4_lblk_t i_dir_start_lookup; 633 ext4_lblk_t i_dir_start_lookup;
636#ifdef CONFIG_EXT4_FS_XATTR 634#ifdef CONFIG_EXT4_FS_XATTR
@@ -708,8 +706,9 @@ struct ext4_inode_info {
708 qsize_t i_reserved_quota; 706 qsize_t i_reserved_quota;
709#endif 707#endif
710 708
711 /* completed async DIOs that might need unwritten extents handling */ 709 /* completed IOs that might need unwritten extents handling */
712 struct list_head i_aio_dio_complete_list; 710 struct list_head i_completed_io_list;
711 spinlock_t i_completed_io_lock;
713 /* current io_end structure for async DIO write*/ 712 /* current io_end structure for async DIO write*/
714 ext4_io_end_t *cur_aio_dio; 713 ext4_io_end_t *cur_aio_dio;
715 714
@@ -760,6 +759,7 @@ struct ext4_inode_info {
760#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 759#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
761#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 760#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
762#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 761#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
762#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
763#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 763#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
764#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 764#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
765#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 765#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -1014,7 +1014,7 @@ struct ext4_sb_info {
1014 atomic_t s_lock_busy; 1014 atomic_t s_lock_busy;
1015 1015
1016 /* locality groups */ 1016 /* locality groups */
1017 struct ext4_locality_group *s_locality_groups; 1017 struct ext4_locality_group __percpu *s_locality_groups;
1018 1018
1019 /* for write statistics */ 1019 /* for write statistics */
1020 unsigned long s_sectors_written_start; 1020 unsigned long s_sectors_written_start;
@@ -1050,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1050 (ino >= EXT4_FIRST_INO(sb) && 1050 (ino >= EXT4_FIRST_INO(sb) &&
1051 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); 1051 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
1052} 1052}
1053
1054/*
1055 * Inode dynamic state flags
1056 */
1057enum {
1058 EXT4_STATE_JDATA, /* journaled data exists */
1059 EXT4_STATE_NEW, /* inode is newly created */
1060 EXT4_STATE_XATTR, /* has in-inode xattrs */
1061 EXT4_STATE_NO_EXPAND, /* No space for expansion */
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1065};
1066
1067static inline int ext4_test_inode_state(struct inode *inode, int bit)
1068{
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags);
1070}
1071
1072static inline void ext4_set_inode_state(struct inode *inode, int bit)
1073{
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags);
1075}
1076
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit)
1078{
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1053#else 1081#else
1054/* Assume that user mode programs are passing in an ext4fs superblock, not 1082/* Assume that user mode programs are passing in an ext4fs superblock, not
1055 * a kernel struct super_block. This will allow us to call the feature-test 1083 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1126,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1126#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 1154#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080
1127#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 1155#define EXT4_FEATURE_INCOMPAT_MMP 0x0100
1128#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 1156#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
1157#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
1158#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
1129 1159
1130#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR 1160#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
1131#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ 1161#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1416,7 +1446,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
1416 struct buffer_head *bh_result, int create); 1446 struct buffer_head *bh_result, int create);
1417 1447
1418extern struct inode *ext4_iget(struct super_block *, unsigned long); 1448extern struct inode *ext4_iget(struct super_block *, unsigned long);
1419extern int ext4_write_inode(struct inode *, int); 1449extern int ext4_write_inode(struct inode *, struct writeback_control *);
1420extern int ext4_setattr(struct dentry *, struct iattr *); 1450extern int ext4_setattr(struct dentry *, struct iattr *);
1421extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1451extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1422 struct kstat *stat); 1452 struct kstat *stat);
@@ -1439,7 +1469,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
1439 struct address_space *mapping, loff_t from); 1469 struct address_space *mapping, loff_t from);
1440extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1470extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1441extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1471extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1442extern int flush_aio_dio_completed_IO(struct inode *inode); 1472extern int flush_completed_IO(struct inode *inode);
1443extern void ext4_da_update_reserve_space(struct inode *inode, 1473extern void ext4_da_update_reserve_space(struct inode *inode,
1444 int used, int quota_claim); 1474 int used, int quota_claim);
1445/* ioctl.c */ 1475/* ioctl.c */
@@ -1465,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb,
1465 ext4_fsblk_t n_blocks_count); 1495 ext4_fsblk_t n_blocks_count);
1466 1496
1467/* super.c */ 1497/* super.c */
1468extern void ext4_error(struct super_block *, const char *, const char *, ...) 1498extern void __ext4_error(struct super_block *, const char *, const char *, ...)
1499 __attribute__ ((format (printf, 3, 4)));
1500#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message)
1501extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
1502 __attribute__ ((format (printf, 3, 4)));
1503extern void ext4_error_file(const char *, struct file *, const char *, ...)
1469 __attribute__ ((format (printf, 3, 4))); 1504 __attribute__ ((format (printf, 3, 4)));
1470extern void __ext4_std_error(struct super_block *, const char *, int); 1505extern void __ext4_std_error(struct super_block *, const char *, int);
1471extern void ext4_abort(struct super_block *, const char *, const char *, ...) 1506extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1472 __attribute__ ((format (printf, 3, 4))); 1507 __attribute__ ((format (printf, 3, 4)));
1473extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1508extern void __ext4_warning(struct super_block *, const char *,
1509 const char *, ...)
1474 __attribute__ ((format (printf, 3, 4))); 1510 __attribute__ ((format (printf, 3, 4)));
1511#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message)
1475extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1512extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1476 __attribute__ ((format (printf, 3, 4))); 1513 __attribute__ ((format (printf, 3, 4)));
1477extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1514extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
@@ -1744,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *);
1744extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1781extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1745 loff_t len); 1782 loff_t len);
1746extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1747 loff_t len); 1784 ssize_t len);
1748extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1749 sector_t block, unsigned int max_blocks, 1786 sector_t block, unsigned int max_blocks,
1750 struct buffer_head *bh, int flags); 1787 struct buffer_head *bh, int flags);
@@ -1756,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
1756 __u64 len, __u64 *moved_len); 1793 __u64 len, __u64 *moved_len);
1757 1794
1758 1795
1796/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
1797enum ext4_state_bits {
1798 BH_Uninit /* blocks are allocated but uninitialized on disk */
1799 = BH_JBDPrivateStart,
1800};
1801
1802BUFFER_FNS(Uninit, uninit)
1803TAS_BUFFER_FNS(Uninit, uninit)
1804
1759/* 1805/*
1760 * Add new method to test wether block and inode bitmaps are properly 1806 * Add new method to test wether block and inode bitmaps are properly
1761 * initialized. With uninit_bg reading the block from disk is not enough 1807 * initialized. With uninit_bg reading the block from disk is not enough
@@ -1773,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
1773 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); 1819 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1774} 1820}
1775 1821
1822#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
1823
1776#endif /* __KERNEL__ */ 1824#endif /* __KERNEL__ */
1777 1825
1778#endif /* _EXT4_H */ 1826#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b57e5c711b6d..53d2764d71ca 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -125,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
125 ext4_journal_abort_handle(where, __func__, bh, 125 ext4_journal_abort_handle(where, __func__, bh,
126 handle, err); 126 handle, err);
127 } else { 127 } else {
128 if (inode && bh) 128 if (inode)
129 mark_buffer_dirty_inode(bh, inode); 129 mark_buffer_dirty_inode(bh, inode);
130 else 130 else
131 mark_buffer_dirty(bh); 131 mark_buffer_dirty(bh);
132 if (inode && inode_needs_sync(inode)) { 132 if (inode && inode_needs_sync(inode)) {
133 sync_dirty_buffer(bh); 133 sync_dirty_buffer(bh);
134 if (buffer_req(bh) && !buffer_uptodate(bh)) { 134 if (buffer_req(bh) && !buffer_uptodate(bh)) {
135 ext4_error(inode->i_sb, __func__, 135 ext4_error(inode->i_sb,
136 "IO error syncing inode, " 136 "IO error syncing inode, "
137 "inode=%lu, block=%llu", 137 "inode=%lu, block=%llu",
138 inode->i_ino, 138 inode->i_ino,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca817d704..b79ad5126468 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
304 return 0; 304 return 0;
305} 305}
306 306
307/*
308 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is
312 * enabled, since the dioread_nolock code uses b_private to pass
313 * information back to the I/O completion handler, and this conflicts
314 * with the jbd's use of b_private.
315 */
316static inline int ext4_should_dioread_nolock(struct inode *inode)
317{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode))
323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
325 return 0;
326 if (ext4_should_journal_data(inode))
327 return 0;
328 return 1;
329}
330
307#endif /* _EXT4_JBD2_H */ 331#endif /* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 765a4826b118..94c8ee81f5e1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
195 if (S_ISREG(inode->i_mode)) 195 if (S_ISREG(inode->i_mode))
196 block_group++; 196 block_group++;
197 } 197 }
198 bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + 198 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
199 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
200 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 199 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
201 200
202 /* 201 /*
@@ -440,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
440 return 0; 439 return 0;
441 440
442corrupted: 441corrupted:
443 ext4_error(inode->i_sb, function, 442 __ext4_error(inode->i_sb, function,
444 "bad header/extent in inode #%lu: %s - magic %x, " 443 "bad header/extent in inode #%lu: %s - magic %x, "
445 "entries %u, max %u(%u), depth %u(%u)", 444 "entries %u, max %u(%u), depth %u(%u)",
446 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
@@ -703,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
703 } 702 }
704 eh = ext_block_hdr(bh); 703 eh = ext_block_hdr(bh);
705 ppos++; 704 ppos++;
706 BUG_ON(ppos > depth); 705 if (unlikely(ppos > depth)) {
706 put_bh(bh);
707 EXT4_ERROR_INODE(inode,
708 "ppos %d > depth %d", ppos, depth);
709 goto err;
710 }
707 path[ppos].p_bh = bh; 711 path[ppos].p_bh = bh;
708 path[ppos].p_hdr = eh; 712 path[ppos].p_hdr = eh;
709 i--; 713 i--;
@@ -749,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
749 if (err) 753 if (err)
750 return err; 754 return err;
751 755
752 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); 756 if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
757 EXT4_ERROR_INODE(inode,
758 "logical %d == ei_block %d!",
759 logical, le32_to_cpu(curp->p_idx->ei_block));
760 return -EIO;
761 }
753 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; 762 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
754 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 763 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
755 /* insert after */ 764 /* insert after */
@@ -779,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
779 ext4_idx_store_pblock(ix, ptr); 788 ext4_idx_store_pblock(ix, ptr);
780 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 789 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
781 790
782 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) 791 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
783 > le16_to_cpu(curp->p_hdr->eh_max)); 792 > le16_to_cpu(curp->p_hdr->eh_max))) {
784 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); 793 EXT4_ERROR_INODE(inode,
794 "logical %d == ei_block %d!",
795 logical, le32_to_cpu(curp->p_idx->ei_block));
796 return -EIO;
797 }
798 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
799 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
800 return -EIO;
801 }
785 802
786 err = ext4_ext_dirty(handle, inode, curp); 803 err = ext4_ext_dirty(handle, inode, curp);
787 ext4_std_error(inode->i_sb, err); 804 ext4_std_error(inode->i_sb, err);
@@ -819,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
819 836
820 /* if current leaf will be split, then we should use 837 /* if current leaf will be split, then we should use
821 * border from split point */ 838 * border from split point */
822 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); 839 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
840 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
841 return -EIO;
842 }
823 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 843 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
824 border = path[depth].p_ext[1].ee_block; 844 border = path[depth].p_ext[1].ee_block;
825 ext_debug("leaf will be split." 845 ext_debug("leaf will be split."
@@ -860,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
860 880
861 /* initialize new leaf */ 881 /* initialize new leaf */
862 newblock = ablocks[--a]; 882 newblock = ablocks[--a];
863 BUG_ON(newblock == 0); 883 if (unlikely(newblock == 0)) {
884 EXT4_ERROR_INODE(inode, "newblock == 0!");
885 err = -EIO;
886 goto cleanup;
887 }
864 bh = sb_getblk(inode->i_sb, newblock); 888 bh = sb_getblk(inode->i_sb, newblock);
865 if (!bh) { 889 if (!bh) {
866 err = -EIO; 890 err = -EIO;
@@ -880,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
880 ex = EXT_FIRST_EXTENT(neh); 904 ex = EXT_FIRST_EXTENT(neh);
881 905
882 /* move remainder of path[depth] to the new leaf */ 906 /* move remainder of path[depth] to the new leaf */
883 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); 907 if (unlikely(path[depth].p_hdr->eh_entries !=
908 path[depth].p_hdr->eh_max)) {
909 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
910 path[depth].p_hdr->eh_entries,
911 path[depth].p_hdr->eh_max);
912 err = -EIO;
913 goto cleanup;
914 }
884 /* start copy from next extent */ 915 /* start copy from next extent */
885 /* TODO: we could do it by single memmove */ 916 /* TODO: we could do it by single memmove */
886 m = 0; 917 m = 0;
@@ -927,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
927 958
928 /* create intermediate indexes */ 959 /* create intermediate indexes */
929 k = depth - at - 1; 960 k = depth - at - 1;
930 BUG_ON(k < 0); 961 if (unlikely(k < 0)) {
962 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
963 err = -EIO;
964 goto cleanup;
965 }
931 if (k) 966 if (k)
932 ext_debug("create %d intermediate indices\n", k); 967 ext_debug("create %d intermediate indices\n", k);
933 /* insert new index into current index block */ 968 /* insert new index into current index block */
@@ -964,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
964 999
965 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1000 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
966 EXT_MAX_INDEX(path[i].p_hdr)); 1001 EXT_MAX_INDEX(path[i].p_hdr));
967 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != 1002 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
968 EXT_LAST_INDEX(path[i].p_hdr)); 1003 EXT_LAST_INDEX(path[i].p_hdr))) {
1004 EXT4_ERROR_INODE(inode,
1005 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1006 le32_to_cpu(path[i].p_ext->ee_block));
1007 err = -EIO;
1008 goto cleanup;
1009 }
969 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 1010 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
970 ext_debug("%d: move %d:%llu in new index %llu\n", i, 1011 ext_debug("%d: move %d:%llu in new index %llu\n", i,
971 le32_to_cpu(path[i].p_idx->ei_block), 1012 le32_to_cpu(path[i].p_idx->ei_block),
@@ -1203,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1203 struct ext4_extent *ex; 1244 struct ext4_extent *ex;
1204 int depth, ee_len; 1245 int depth, ee_len;
1205 1246
1206 BUG_ON(path == NULL); 1247 if (unlikely(path == NULL)) {
1248 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1249 return -EIO;
1250 }
1207 depth = path->p_depth; 1251 depth = path->p_depth;
1208 *phys = 0; 1252 *phys = 0;
1209 1253
@@ -1217,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1217 ex = path[depth].p_ext; 1261 ex = path[depth].p_ext;
1218 ee_len = ext4_ext_get_actual_len(ex); 1262 ee_len = ext4_ext_get_actual_len(ex);
1219 if (*logical < le32_to_cpu(ex->ee_block)) { 1263 if (*logical < le32_to_cpu(ex->ee_block)) {
1220 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1264 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1265 EXT4_ERROR_INODE(inode,
1266 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1267 *logical, le32_to_cpu(ex->ee_block));
1268 return -EIO;
1269 }
1221 while (--depth >= 0) { 1270 while (--depth >= 0) {
1222 ix = path[depth].p_idx; 1271 ix = path[depth].p_idx;
1223 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1272 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1273 EXT4_ERROR_INODE(inode,
1274 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1275 ix != NULL ? ix->ei_block : 0,
1276 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1277 EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
1278 depth);
1279 return -EIO;
1280 }
1224 } 1281 }
1225 return 0; 1282 return 0;
1226 } 1283 }
1227 1284
1228 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1285 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1286 EXT4_ERROR_INODE(inode,
1287 "logical %d < ee_block %d + ee_len %d!",
1288 *logical, le32_to_cpu(ex->ee_block), ee_len);
1289 return -EIO;
1290 }
1229 1291
1230 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1292 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1231 *phys = ext_pblock(ex) + ee_len - 1; 1293 *phys = ext_pblock(ex) + ee_len - 1;
@@ -1251,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1251 int depth; /* Note, NOT eh_depth; depth from top of tree */ 1313 int depth; /* Note, NOT eh_depth; depth from top of tree */
1252 int ee_len; 1314 int ee_len;
1253 1315
1254 BUG_ON(path == NULL); 1316 if (unlikely(path == NULL)) {
1317 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1318 return -EIO;
1319 }
1255 depth = path->p_depth; 1320 depth = path->p_depth;
1256 *phys = 0; 1321 *phys = 0;
1257 1322
@@ -1265,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1265 ex = path[depth].p_ext; 1330 ex = path[depth].p_ext;
1266 ee_len = ext4_ext_get_actual_len(ex); 1331 ee_len = ext4_ext_get_actual_len(ex);
1267 if (*logical < le32_to_cpu(ex->ee_block)) { 1332 if (*logical < le32_to_cpu(ex->ee_block)) {
1268 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1333 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1334 EXT4_ERROR_INODE(inode,
1335 "first_extent(path[%d].p_hdr) != ex",
1336 depth);
1337 return -EIO;
1338 }
1269 while (--depth >= 0) { 1339 while (--depth >= 0) {
1270 ix = path[depth].p_idx; 1340 ix = path[depth].p_idx;
1271 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1341 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1342 EXT4_ERROR_INODE(inode,
1343 "ix != EXT_FIRST_INDEX *logical %d!",
1344 *logical);
1345 return -EIO;
1346 }
1272 } 1347 }
1273 *logical = le32_to_cpu(ex->ee_block); 1348 *logical = le32_to_cpu(ex->ee_block);
1274 *phys = ext_pblock(ex); 1349 *phys = ext_pblock(ex);
1275 return 0; 1350 return 0;
1276 } 1351 }
1277 1352
1278 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1353 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1354 EXT4_ERROR_INODE(inode,
1355 "logical %d < ee_block %d + ee_len %d!",
1356 *logical, le32_to_cpu(ex->ee_block), ee_len);
1357 return -EIO;
1358 }
1279 1359
1280 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1360 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1281 /* next allocated block in this leaf */ 1361 /* next allocated block in this leaf */
@@ -1414,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1414 1494
1415 eh = path[depth].p_hdr; 1495 eh = path[depth].p_hdr;
1416 ex = path[depth].p_ext; 1496 ex = path[depth].p_ext;
1417 BUG_ON(ex == NULL); 1497
1418 BUG_ON(eh == NULL); 1498 if (unlikely(ex == NULL || eh == NULL)) {
1499 EXT4_ERROR_INODE(inode,
1500 "ex %p == NULL or eh %p == NULL", ex, eh);
1501 return -EIO;
1502 }
1419 1503
1420 if (depth == 0) { 1504 if (depth == 0) {
1421 /* there is no tree at all */ 1505 /* there is no tree at all */
@@ -1538,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
1538 merge_done = 1; 1622 merge_done = 1;
1539 WARN_ON(eh->eh_entries == 0); 1623 WARN_ON(eh->eh_entries == 0);
1540 if (!eh->eh_entries) 1624 if (!eh->eh_entries)
1541 ext4_error(inode->i_sb, "ext4_ext_try_to_merge", 1625 ext4_error(inode->i_sb,
1542 "inode#%lu, eh->eh_entries = 0!", inode->i_ino); 1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1543 } 1628 }
1544 1629
1545 return merge_done; 1630 return merge_done;
@@ -1612,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1612 ext4_lblk_t next; 1697 ext4_lblk_t next;
1613 unsigned uninitialized = 0; 1698 unsigned uninitialized = 0;
1614 1699
1615 BUG_ON(ext4_ext_get_actual_len(newext) == 0); 1700 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1701 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1702 return -EIO;
1703 }
1616 depth = ext_depth(inode); 1704 depth = ext_depth(inode);
1617 ex = path[depth].p_ext; 1705 ex = path[depth].p_ext;
1618 BUG_ON(path[depth].p_hdr == NULL); 1706 if (unlikely(path[depth].p_hdr == NULL)) {
1707 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1708 return -EIO;
1709 }
1619 1710
1620 /* try to insert block into found extent and return */ 1711 /* try to insert block into found extent and return */
1621 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1712 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1622 && ext4_can_extents_be_merged(inode, ex, newext)) { 1713 && ext4_can_extents_be_merged(inode, ex, newext)) {
1623 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1714 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1624 ext4_ext_is_uninitialized(newext), 1715 ext4_ext_is_uninitialized(newext),
@@ -1739,7 +1830,7 @@ has_space:
1739 1830
1740merge: 1831merge:
1741 /* try to merge extents to the right */ 1832 /* try to merge extents to the right */
1742 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1833 if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1743 ext4_ext_try_to_merge(inode, path, nearex); 1834 ext4_ext_try_to_merge(inode, path, nearex);
1744 1835
1745 /* try to merge extents to the left */ 1836 /* try to merge extents to the left */
@@ -1787,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1787 } 1878 }
1788 1879
1789 depth = ext_depth(inode); 1880 depth = ext_depth(inode);
1790 BUG_ON(path[depth].p_hdr == NULL); 1881 if (unlikely(path[depth].p_hdr == NULL)) {
1882 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1883 err = -EIO;
1884 break;
1885 }
1791 ex = path[depth].p_ext; 1886 ex = path[depth].p_ext;
1792 next = ext4_ext_next_allocated_block(path); 1887 next = ext4_ext_next_allocated_block(path);
1793 1888
@@ -1838,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1838 cbex.ec_type = EXT4_EXT_CACHE_EXTENT; 1933 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1839 } 1934 }
1840 1935
1841 BUG_ON(cbex.ec_len == 0); 1936 if (unlikely(cbex.ec_len == 0)) {
1937 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1938 err = -EIO;
1939 break;
1940 }
1842 err = func(inode, path, &cbex, ex, cbdata); 1941 err = func(inode, path, &cbex, ex, cbdata);
1843 ext4_ext_drop_refs(path); 1942 ext4_ext_drop_refs(path);
1844 1943
@@ -1952,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1952 2051
1953 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && 2052 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1954 cex->ec_type != EXT4_EXT_CACHE_EXTENT); 2053 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1955 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { 2054 if (in_range(block, cex->ec_block, cex->ec_len)) {
1956 ex->ee_block = cpu_to_le32(cex->ec_block); 2055 ex->ee_block = cpu_to_le32(cex->ec_block);
1957 ext4_ext_store_pblock(ex, cex->ec_start); 2056 ext4_ext_store_pblock(ex, cex->ec_start);
1958 ex->ee_len = cpu_to_le16(cex->ec_len); 2057 ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -1981,7 +2080,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1981 /* free index block */ 2080 /* free index block */
1982 path--; 2081 path--;
1983 leaf = idx_pblock(path->p_idx); 2082 leaf = idx_pblock(path->p_idx);
1984 BUG_ON(path->p_hdr->eh_entries == 0); 2083 if (unlikely(path->p_hdr->eh_entries == 0)) {
2084 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2085 return -EIO;
2086 }
1985 err = ext4_ext_get_access(handle, inode, path); 2087 err = ext4_ext_get_access(handle, inode, path);
1986 if (err) 2088 if (err)
1987 return err; 2089 return err;
@@ -2119,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2119 if (!path[depth].p_hdr) 2221 if (!path[depth].p_hdr)
2120 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2222 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2121 eh = path[depth].p_hdr; 2223 eh = path[depth].p_hdr;
2122 BUG_ON(eh == NULL); 2224 if (unlikely(path[depth].p_hdr == NULL)) {
2123 2225 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2226 return -EIO;
2227 }
2124 /* find where to start removing */ 2228 /* find where to start removing */
2125 ex = EXT_LAST_EXTENT(eh); 2229 ex = EXT_LAST_EXTENT(eh);
2126 2230
@@ -2983,7 +3087,7 @@ fix_extent_len:
2983 ext4_ext_dirty(handle, inode, path + depth); 3087 ext4_ext_dirty(handle, inode, path + depth);
2984 return err; 3088 return err;
2985} 3089}
2986static int ext4_convert_unwritten_extents_dio(handle_t *handle, 3090static int ext4_convert_unwritten_extents_endio(handle_t *handle,
2987 struct inode *inode, 3091 struct inode *inode,
2988 struct ext4_ext_path *path) 3092 struct ext4_ext_path *path)
2989{ 3093{
@@ -3063,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3063 flags, allocated); 3167 flags, allocated);
3064 ext4_ext_show_leaf(inode, path); 3168 ext4_ext_show_leaf(inode, path);
3065 3169
3066 /* DIO get_block() before submit the IO, split the extent */ 3170 /* get_block() before submit the IO, split the extent */
3067 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3068 ret = ext4_split_unwritten_extents(handle, 3172 ret = ext4_split_unwritten_extents(handle,
3069 inode, path, iblock, 3173 inode, path, iblock,
3070 max_blocks, flags); 3174 max_blocks, flags);
@@ -3074,14 +3178,16 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3074 * completed 3178 * completed
3075 */ 3179 */
3076 if (io) 3180 if (io)
3077 io->flag = DIO_AIO_UNWRITTEN; 3181 io->flag = EXT4_IO_UNWRITTEN;
3078 else 3182 else
3079 EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; 3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result);
3080 goto out; 3186 goto out;
3081 } 3187 }
3082 /* async DIO end_io complete, convert the filled extent to written */ 3188 /* IO end_io complete, convert the filled extent to written */
3083 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { 3189 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3084 ret = ext4_convert_unwritten_extents_dio(handle, inode, 3190 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3085 path); 3191 path);
3086 if (ret >= 0) 3192 if (ret >= 0)
3087 ext4_update_inode_fsync_trans(handle, inode, 1); 3193 ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3185,7 +3291,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3185{ 3291{
3186 struct ext4_ext_path *path = NULL; 3292 struct ext4_ext_path *path = NULL;
3187 struct ext4_extent_header *eh; 3293 struct ext4_extent_header *eh;
3188 struct ext4_extent newex, *ex; 3294 struct ext4_extent newex, *ex, *last_ex;
3189 ext4_fsblk_t newblock; 3295 ext4_fsblk_t newblock;
3190 int err = 0, depth, ret, cache_type; 3296 int err = 0, depth, ret, cache_type;
3191 unsigned int allocated = 0; 3297 unsigned int allocated = 0;
@@ -3237,10 +3343,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3237 * this situation is possible, though, _during_ tree modification; 3343 * this situation is possible, though, _during_ tree modification;
3238 * this is why assert can't be put in ext4_ext_find_extent() 3344 * this is why assert can't be put in ext4_ext_find_extent()
3239 */ 3345 */
3240 if (path[depth].p_ext == NULL && depth != 0) { 3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3241 ext4_error(inode->i_sb, __func__, "bad extent address " 3347 EXT4_ERROR_INODE(inode, "bad extent address "
3242 "inode: %lu, iblock: %d, depth: %d", 3348 "iblock: %d, depth: %d pblock %lld",
3243 inode->i_ino, iblock, depth); 3349 iblock, depth, path[depth].p_block);
3244 err = -EIO; 3350 err = -EIO;
3245 goto out2; 3351 goto out2;
3246 } 3352 }
@@ -3258,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3258 */ 3364 */
3259 ee_len = ext4_ext_get_actual_len(ex); 3365 ee_len = ext4_ext_get_actual_len(ex);
3260 /* if found extent covers block, simply return it */ 3366 /* if found extent covers block, simply return it */
3261 if (iblock >= ee_block && iblock < ee_block + ee_len) { 3367 if (in_range(iblock, ee_block, ee_len)) {
3262 newblock = iblock - ee_block + ee_start; 3368 newblock = iblock - ee_block + ee_start;
3263 /* number of remaining blocks in the extent */ 3369 /* number of remaining blocks in the extent */
3264 allocated = ee_len - (iblock - ee_block); 3370 allocated = ee_len - (iblock - ee_block);
@@ -3350,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3350 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3456 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3351 ext4_ext_mark_uninitialized(&newex); 3457 ext4_ext_mark_uninitialized(&newex);
3352 /* 3458 /*
3353 * io_end structure was created for every async 3459 * io_end structure was created for every IO write to an
3354 * direct IO write to the middle of the file. 3460 * uninitialized extent. To avoid unecessary conversion,
3355 * To avoid unecessary convertion for every aio dio rewrite 3461 * here we flag the IO that really needs the conversion.
3356 * to the mid of file, here we flag the IO that is really
3357 * need the convertion.
3358 * For non asycn direct IO case, flag the inode state 3462 * For non asycn direct IO case, flag the inode state
3359 * that we need to perform convertion when IO is done. 3463 * that we need to perform convertion when IO is done.
3360 */ 3464 */
3361 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3362 if (io) 3466 if (io)
3363 io->flag = DIO_AIO_UNWRITTEN; 3467 io->flag = EXT4_IO_UNWRITTEN;
3364 else 3468 else
3365 EXT4_I(inode)->i_state |= 3469 ext4_set_inode_state(inode,
3366 EXT4_STATE_DIO_UNWRITTEN;; 3470 EXT4_STATE_DIO_UNWRITTEN);
3471 }
3472 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result);
3474 }
3475
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
3477 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d",
3480 ex->ee_block);
3481 err = -EIO;
3482 goto out2;
3367 } 3483 }
3484 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
3486 + ext4_ext_get_actual_len(last_ex))
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
3368 } 3488 }
3369 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3370 if (err) { 3490 if (err) {
@@ -3499,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
3499 i_size_write(inode, new_size); 3619 i_size_write(inode, new_size);
3500 if (new_size > EXT4_I(inode)->i_disksize) 3620 if (new_size > EXT4_I(inode)->i_disksize)
3501 ext4_update_i_disksize(inode, new_size); 3621 ext4_update_i_disksize(inode, new_size);
3622 } else {
3623 /*
3624 * Mark that we allocate beyond EOF so the subsequent truncate
3625 * can proceed even if the new size is the same as i_size.
3626 */
3627 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
3502 } 3629 }
3503 3630
3504} 3631}
@@ -3603,7 +3730,7 @@ retry:
3603 * Returns 0 on success. 3730 * Returns 0 on success.
3604 */ 3731 */
3605int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 3732int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3606 loff_t len) 3733 ssize_t len)
3607{ 3734{
3608 handle_t *handle; 3735 handle_t *handle;
3609 ext4_lblk_t block; 3736 ext4_lblk_t block;
@@ -3635,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3635 map_bh.b_state = 0; 3762 map_bh.b_state = 0;
3636 ret = ext4_get_blocks(handle, inode, block, 3763 ret = ext4_get_blocks(handle, inode, block,
3637 max_blocks, &map_bh, 3764 max_blocks, &map_bh,
3638 EXT4_GET_BLOCKS_DIO_CONVERT_EXT); 3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3639 if (ret <= 0) { 3766 if (ret <= 0) {
3640 WARN_ON(ret <= 0); 3767 WARN_ON(ret <= 0);
3641 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3768 printk(KERN_ERR "%s: ext4_ext_get_blocks "
@@ -3739,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3739 int error = 0; 3866 int error = 0;
3740 3867
3741 /* in-inode? */ 3868 /* in-inode? */
3742 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 3869 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
3743 struct ext4_iloc iloc; 3870 struct ext4_iloc iloc;
3744 int offset; /* offset of xattr in inode */ 3871 int offset; /* offset of xattr in inode */
3745 3872
@@ -3767,7 +3894,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3767 __u64 start, __u64 len) 3894 __u64 start, __u64 len)
3768{ 3895{
3769 ext4_lblk_t start_blk; 3896 ext4_lblk_t start_blk;
3770 ext4_lblk_t len_blks;
3771 int error = 0; 3897 int error = 0;
3772 3898
3773 /* fallback to generic here if not in extents fmt */ 3899 /* fallback to generic here if not in extents fmt */
@@ -3781,8 +3907,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3781 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 3907 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3782 error = ext4_xattr_fiemap(inode, fieinfo); 3908 error = ext4_xattr_fiemap(inode, fieinfo);
3783 } else { 3909 } else {
3910 ext4_lblk_t len_blks;
3911 __u64 last_blk;
3912
3784 start_blk = start >> inode->i_sb->s_blocksize_bits; 3913 start_blk = start >> inode->i_sb->s_blocksize_bits;
3785 len_blks = len >> inode->i_sb->s_blocksize_bits; 3914 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3915 if (last_blk >= EXT_MAX_BLOCK)
3916 last_blk = EXT_MAX_BLOCK-1;
3917 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
3786 3918
3787 /* 3919 /*
3788 * Walk the extent tree gathering extent information. 3920 * Walk the extent tree gathering extent information.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583cef28..503a48927402 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -35,9 +35,9 @@
35 */ 35 */
36static int ext4_release_file(struct inode *inode, struct file *filp) 36static int ext4_release_file(struct inode *inode, struct file *filp)
37{ 37{
38 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { 38 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
39 ext4_alloc_da_blocks(inode); 39 ext4_alloc_da_blocks(inode);
40 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; 40 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
41 } 41 }
42 /* if we are the last writer on the inode, drop the block reservation */ 42 /* if we are the last writer on the inode, drop the block reservation */
43 if ((filp->f_mode & FMODE_WRITE) && 43 if ((filp->f_mode & FMODE_WRITE) &&
@@ -116,11 +116,9 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
116 * devices or filesystem images. 116 * devices or filesystem images.
117 */ 117 */
118 memset(buf, 0, sizeof(buf)); 118 memset(buf, 0, sizeof(buf));
119 path.mnt = mnt->mnt_parent; 119 path.mnt = mnt;
120 path.dentry = mnt->mnt_mountpoint; 120 path.dentry = mnt->mnt_root;
121 path_get(&path);
122 cp = d_path(&path, buf, sizeof(buf)); 121 cp = d_path(&path, buf, sizeof(buf));
123 path_put(&path);
124 if (!IS_ERR(cp)) { 122 if (!IS_ERR(cp)) {
125 memcpy(sbi->s_es->s_last_mounted, cp, 123 memcpy(sbi->s_es->s_last_mounted, cp,
126 sizeof(sbi->s_es->s_last_mounted)); 124 sizeof(sbi->s_es->s_last_mounted));
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 98bd140aad01..0d0c3239c1cd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
63 if (inode->i_sb->s_flags & MS_RDONLY) 63 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 64 return 0;
65 65
66 ret = flush_aio_dio_completed_IO(inode); 66 ret = flush_completed_IO(inode);
67 if (ret < 0) 67 if (ret < 0)
68 return ret; 68 return ret;
69 69
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6c..004c9da9e5c6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
76 /* If checksum is bad mark all blocks and inodes use to prevent 76 /* If checksum is bad mark all blocks and inodes use to prevent
77 * allocation, essentially implementing a per-group read-only flag. */ 77 * allocation, essentially implementing a per-group read-only flag. */
78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
79 ext4_error(sb, __func__, "Checksum bad for group %u", 79 ext4_error(sb, "Checksum bad for group %u", block_group);
80 block_group);
81 ext4_free_blks_set(sb, gdp, 0); 80 ext4_free_blks_set(sb, gdp, 0);
82 ext4_free_inodes_set(sb, gdp, 0); 81 ext4_free_inodes_set(sb, gdp, 0);
83 ext4_itable_unused_set(sb, gdp, 0); 82 ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 bitmap_blk = ext4_inode_bitmap(sb, desc); 110 bitmap_blk = ext4_inode_bitmap(sb, desc);
112 bh = sb_getblk(sb, bitmap_blk); 111 bh = sb_getblk(sb, bitmap_blk);
113 if (unlikely(!bh)) { 112 if (unlikely(!bh)) {
114 ext4_error(sb, __func__, 113 ext4_error(sb, "Cannot read inode bitmap - "
115 "Cannot read inode bitmap - "
116 "block_group = %u, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
117 block_group, bitmap_blk); 115 block_group, bitmap_blk);
118 return NULL; 116 return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
153 set_bitmap_uptodate(bh); 151 set_bitmap_uptodate(bh);
154 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
155 put_bh(bh); 153 put_bh(bh);
156 ext4_error(sb, __func__, 154 ext4_error(sb, "Cannot read inode bitmap - "
157 "Cannot read inode bitmap - "
158 "block_group = %u, inode_bitmap = %llu", 155 "block_group = %u, inode_bitmap = %llu",
159 block_group, bitmap_blk); 156 block_group, bitmap_blk);
160 return NULL; 157 return NULL;
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
229 226
230 es = EXT4_SB(sb)->s_es; 227 es = EXT4_SB(sb)->s_es;
231 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
232 ext4_error(sb, "ext4_free_inode", 229 ext4_error(sb, "reserved or nonexistent inode %lu", ino);
233 "reserved or nonexistent inode %lu", ino);
234 goto error_return; 230 goto error_return;
235 } 231 }
236 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 232 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
248 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
249 bit, bitmap_bh->b_data); 245 bit, bitmap_bh->b_data);
250 if (!cleared) 246 if (!cleared)
251 ext4_error(sb, "ext4_free_inode", 247 ext4_error(sb, "bit already cleared for inode %lu", ino);
252 "bit already cleared for inode %lu", ino);
253 else { 248 else {
254 gdp = ext4_get_group_desc(sb, block_group, &bh2); 249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
255 250
@@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb,
736 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 731 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
737 ino > EXT4_INODES_PER_GROUP(sb)) { 732 ino > EXT4_INODES_PER_GROUP(sb)) {
738 ext4_unlock_group(sb, group); 733 ext4_unlock_group(sb, group);
739 ext4_error(sb, __func__, 734 ext4_error(sb, "reserved inode or inode > inodes count - "
740 "reserved inode or inode > inodes count - "
741 "block_group = %u, inode=%lu", group, 735 "block_group = %u, inode=%lu", group,
742 ino + group * EXT4_INODES_PER_GROUP(sb)); 736 ino + group * EXT4_INODES_PER_GROUP(sb));
743 return 1; 737 return 1;
@@ -904,7 +898,7 @@ repeat_in_this_group:
904 BUFFER_TRACE(inode_bitmap_bh, 898 BUFFER_TRACE(inode_bitmap_bh,
905 "call ext4_handle_dirty_metadata"); 899 "call ext4_handle_dirty_metadata");
906 err = ext4_handle_dirty_metadata(handle, 900 err = ext4_handle_dirty_metadata(handle,
907 inode, 901 NULL,
908 inode_bitmap_bh); 902 inode_bitmap_bh);
909 if (err) 903 if (err)
910 goto fail; 904 goto fail;
@@ -1029,7 +1023,8 @@ got:
1029 inode->i_generation = sbi->s_next_generation++; 1023 inode->i_generation = sbi->s_next_generation++;
1030 spin_unlock(&sbi->s_next_gen_lock); 1024 spin_unlock(&sbi->s_next_gen_lock);
1031 1025
1032 ei->i_state = EXT4_STATE_NEW; 1026 ei->i_state_flags = 0;
1027 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1033 1028
1034 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1029 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
1035 1030
@@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1098 1093
1099 /* Error cases - e2fsck has already cleaned up for us */ 1094 /* Error cases - e2fsck has already cleaned up for us */
1100 if (ino > max_ino) { 1095 if (ino > max_ino) {
1101 ext4_warning(sb, __func__, 1096 ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
1102 "bad orphan ino %lu! e2fsck was run?", ino);
1103 goto error; 1097 goto error;
1104 } 1098 }
1105 1099
@@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1107 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); 1101 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
1108 bitmap_bh = ext4_read_inode_bitmap(sb, block_group); 1102 bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
1109 if (!bitmap_bh) { 1103 if (!bitmap_bh) {
1110 ext4_warning(sb, __func__, 1104 ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
1111 "inode bitmap error for orphan %lu", ino);
1112 goto error; 1105 goto error;
1113 } 1106 }
1114 1107
@@ -1140,8 +1133,7 @@ iget_failed:
1140 err = PTR_ERR(inode); 1133 err = PTR_ERR(inode);
1141 inode = NULL; 1134 inode = NULL;
1142bad_orphan: 1135bad_orphan:
1143 ext4_warning(sb, __func__, 1136 ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
1144 "bad orphan inode %lu! e2fsck was run?", ino);
1145 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", 1137 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
1146 bit, (unsigned long long)bitmap_bh->b_blocknr, 1138 bit, (unsigned long long)bitmap_bh->b_blocknr,
1147 ext4_test_bit(bit, bitmap_bh->b_data)); 1139 ext4_test_bit(bit, bitmap_bh->b_data));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e11952404e02..f977aade0d1b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h>
41 42
42#include "ext4_jbd2.h" 43#include "ext4_jbd2.h"
43#include "xattr.h" 44#include "xattr.h"
@@ -194,7 +195,7 @@ void ext4_delete_inode(struct inode *inode)
194 inode->i_size = 0; 195 inode->i_size = 0;
195 err = ext4_mark_inode_dirty(handle, inode); 196 err = ext4_mark_inode_dirty(handle, inode);
196 if (err) { 197 if (err) {
197 ext4_warning(inode->i_sb, __func__, 198 ext4_warning(inode->i_sb,
198 "couldn't mark inode dirty (err %d)", err); 199 "couldn't mark inode dirty (err %d)", err);
199 goto stop_handle; 200 goto stop_handle;
200 } 201 }
@@ -212,7 +213,7 @@ void ext4_delete_inode(struct inode *inode)
212 if (err > 0) 213 if (err > 0)
213 err = ext4_journal_restart(handle, 3); 214 err = ext4_journal_restart(handle, 3);
214 if (err != 0) { 215 if (err != 0) {
215 ext4_warning(inode->i_sb, __func__, 216 ext4_warning(inode->i_sb,
216 "couldn't extend journal (err %d)", err); 217 "couldn't extend journal (err %d)", err);
217 stop_handle: 218 stop_handle:
218 ext4_journal_stop(handle); 219 ext4_journal_stop(handle);
@@ -323,8 +324,7 @@ static int ext4_block_to_path(struct inode *inode,
323 offsets[n++] = i_block & (ptrs - 1); 324 offsets[n++] = i_block & (ptrs - 1);
324 final = ptrs; 325 final = ptrs;
325 } else { 326 } else {
326 ext4_warning(inode->i_sb, "ext4_block_to_path", 327 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
327 "block %lu > max in inode %lu",
328 i_block + direct_blocks + 328 i_block + direct_blocks +
329 indirect_blocks + double_blocks, inode->i_ino); 329 indirect_blocks + double_blocks, inode->i_ino);
330 } 330 }
@@ -344,7 +344,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
344 if (blk && 344 if (blk &&
345 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 345 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
346 blk, 1))) { 346 blk, 1))) {
347 ext4_error(inode->i_sb, function, 347 __ext4_error(inode->i_sb, function,
348 "invalid block reference %u " 348 "invalid block reference %u "
349 "in inode #%lu", blk, inode->i_ino); 349 "in inode #%lu", blk, inode->i_ino);
350 return -EIO; 350 return -EIO;
@@ -607,7 +607,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
607 if (*err) 607 if (*err)
608 goto failed_out; 608 goto failed_out;
609 609
610 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); 610 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
611 EXT4_ERROR_INODE(inode,
612 "current_block %llu + count %lu > %d!",
613 current_block, count,
614 EXT4_MAX_BLOCK_FILE_PHYS);
615 *err = -EIO;
616 goto failed_out;
617 }
611 618
612 target -= count; 619 target -= count;
613 /* allocate blocks for indirect blocks */ 620 /* allocate blocks for indirect blocks */
@@ -643,7 +650,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
643 ar.flags = EXT4_MB_HINT_DATA; 650 ar.flags = EXT4_MB_HINT_DATA;
644 651
645 current_block = ext4_mb_new_blocks(handle, &ar, err); 652 current_block = ext4_mb_new_blocks(handle, &ar, err);
646 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); 653 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
654 EXT4_ERROR_INODE(inode,
655 "current_block %llu + ar.len %d > %d!",
656 current_block, ar.len,
657 EXT4_MAX_BLOCK_FILE_PHYS);
658 *err = -EIO;
659 goto failed_out;
660 }
647 661
648 if (*err && (target == blks)) { 662 if (*err && (target == blks)) {
649 /* 663 /*
@@ -1061,6 +1075,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
1061 int mdb_free = 0, allocated_meta_blocks = 0; 1075 int mdb_free = 0, allocated_meta_blocks = 0;
1062 1076
1063 spin_lock(&ei->i_block_reservation_lock); 1077 spin_lock(&ei->i_block_reservation_lock);
1078 trace_ext4_da_update_reserve_space(inode, used);
1064 if (unlikely(used > ei->i_reserved_data_blocks)) { 1079 if (unlikely(used > ei->i_reserved_data_blocks)) {
1065 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 1080 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1066 "with only %d reserved data blocks\n", 1081 "with only %d reserved data blocks\n",
@@ -1124,7 +1139,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
1124 sector_t logical, sector_t phys, int len) 1139 sector_t logical, sector_t phys, int len)
1125{ 1140{
1126 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1141 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1127 ext4_error(inode->i_sb, msg, 1142 __ext4_error(inode->i_sb, msg,
1128 "inode #%lu logical block %llu mapped to %llu " 1143 "inode #%lu logical block %llu mapped to %llu "
1129 "(size %d)", inode->i_ino, 1144 "(size %d)", inode->i_ino,
1130 (unsigned long long) logical, 1145 (unsigned long long) logical,
@@ -1306,7 +1321,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1306 * i_data's format changing. Force the migrate 1321 * i_data's format changing. Force the migrate
1307 * to fail by clearing migrate flags 1322 * to fail by clearing migrate flags
1308 */ 1323 */
1309 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 1324 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1310 } 1325 }
1311 1326
1312 /* 1327 /*
@@ -1534,6 +1549,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
1534 ext4_truncate(inode); 1549 ext4_truncate(inode);
1535} 1550}
1536 1551
1552static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1553 struct buffer_head *bh_result, int create);
1537static int ext4_write_begin(struct file *file, struct address_space *mapping, 1554static int ext4_write_begin(struct file *file, struct address_space *mapping,
1538 loff_t pos, unsigned len, unsigned flags, 1555 loff_t pos, unsigned len, unsigned flags,
1539 struct page **pagep, void **fsdata) 1556 struct page **pagep, void **fsdata)
@@ -1575,8 +1592,12 @@ retry:
1575 } 1592 }
1576 *pagep = page; 1593 *pagep = page;
1577 1594
1578 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1595 if (ext4_should_dioread_nolock(inode))
1579 ext4_get_block); 1596 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1597 fsdata, ext4_get_block_write);
1598 else
1599 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1600 fsdata, ext4_get_block);
1580 1601
1581 if (!ret && ext4_should_journal_data(inode)) { 1602 if (!ret && ext4_should_journal_data(inode)) {
1582 ret = walk_page_buffers(handle, page_buffers(page), 1603 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1793,7 +1814,7 @@ static int ext4_journalled_write_end(struct file *file,
1793 new_i_size = pos + copied; 1814 new_i_size = pos + copied;
1794 if (new_i_size > inode->i_size) 1815 if (new_i_size > inode->i_size)
1795 i_size_write(inode, pos+copied); 1816 i_size_write(inode, pos+copied);
1796 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1817 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1797 if (new_i_size > EXT4_I(inode)->i_disksize) { 1818 if (new_i_size > EXT4_I(inode)->i_disksize) {
1798 ext4_update_i_disksize(inode, new_i_size); 1819 ext4_update_i_disksize(inode, new_i_size);
1799 ret2 = ext4_mark_inode_dirty(handle, inode); 1820 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1846,6 +1867,7 @@ repeat:
1846 spin_lock(&ei->i_block_reservation_lock); 1867 spin_lock(&ei->i_block_reservation_lock);
1847 md_reserved = ei->i_reserved_meta_blocks; 1868 md_reserved = ei->i_reserved_meta_blocks;
1848 md_needed = ext4_calc_metadata_amount(inode, lblock); 1869 md_needed = ext4_calc_metadata_amount(inode, lblock);
1870 trace_ext4_da_reserve_space(inode, md_needed);
1849 spin_unlock(&ei->i_block_reservation_lock); 1871 spin_unlock(&ei->i_block_reservation_lock);
1850 1872
1851 /* 1873 /*
@@ -2091,6 +2113,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2091 } else if (buffer_mapped(bh)) 2113 } else if (buffer_mapped(bh))
2092 BUG_ON(bh->b_blocknr != pblock); 2114 BUG_ON(bh->b_blocknr != pblock);
2093 2115
2116 if (buffer_uninit(exbh))
2117 set_buffer_uninit(bh);
2094 cur_logical++; 2118 cur_logical++;
2095 pblock++; 2119 pblock++;
2096 } while ((bh = bh->b_this_page) != head); 2120 } while ((bh = bh->b_this_page) != head);
@@ -2133,17 +2157,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2133 break; 2157 break;
2134 for (i = 0; i < nr_pages; i++) { 2158 for (i = 0; i < nr_pages; i++) {
2135 struct page *page = pvec.pages[i]; 2159 struct page *page = pvec.pages[i];
2136 index = page->index; 2160 if (page->index > end)
2137 if (index > end)
2138 break; 2161 break;
2139 index++;
2140
2141 BUG_ON(!PageLocked(page)); 2162 BUG_ON(!PageLocked(page));
2142 BUG_ON(PageWriteback(page)); 2163 BUG_ON(PageWriteback(page));
2143 block_invalidatepage(page, 0); 2164 block_invalidatepage(page, 0);
2144 ClearPageUptodate(page); 2165 ClearPageUptodate(page);
2145 unlock_page(page); 2166 unlock_page(page);
2146 } 2167 }
2168 index = pvec.pages[nr_pages - 1]->index + 1;
2169 pagevec_release(&pvec);
2147 } 2170 }
2148 return; 2171 return;
2149} 2172}
@@ -2220,6 +2243,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2220 */ 2243 */
2221 new.b_state = 0; 2244 new.b_state = 0;
2222 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2245 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2246 if (ext4_should_dioread_nolock(mpd->inode))
2247 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2223 if (mpd->b_state & (1 << BH_Delay)) 2248 if (mpd->b_state & (1 << BH_Delay))
2224 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2249 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2225 2250
@@ -2630,11 +2655,14 @@ static int __ext4_journalled_writepage(struct page *page,
2630 ret = err; 2655 ret = err;
2631 2656
2632 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2657 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2633 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 2658 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2634out: 2659out:
2635 return ret; 2660 return ret;
2636} 2661}
2637 2662
2663static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2664static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2665
2638/* 2666/*
2639 * Note that we don't need to start a transaction unless we're journaling data 2667 * Note that we don't need to start a transaction unless we're journaling data
2640 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2668 * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2682,7 +2710,7 @@ static int ext4_writepage(struct page *page,
2682 int ret = 0; 2710 int ret = 0;
2683 loff_t size; 2711 loff_t size;
2684 unsigned int len; 2712 unsigned int len;
2685 struct buffer_head *page_bufs; 2713 struct buffer_head *page_bufs = NULL;
2686 struct inode *inode = page->mapping->host; 2714 struct inode *inode = page->mapping->host;
2687 2715
2688 trace_ext4_writepage(inode, page); 2716 trace_ext4_writepage(inode, page);
@@ -2758,7 +2786,11 @@ static int ext4_writepage(struct page *page,
2758 2786
2759 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2787 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2760 ret = nobh_writepage(page, noalloc_get_block_write, wbc); 2788 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2761 else 2789 else if (page_bufs && buffer_uninit(page_bufs)) {
2790 ext4_set_bh_endio(page_bufs, inode);
2791 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2792 wbc, ext4_end_io_buffer_write);
2793 } else
2762 ret = block_write_full_page(page, noalloc_get_block_write, 2794 ret = block_write_full_page(page, noalloc_get_block_write,
2763 wbc); 2795 wbc);
2764 2796
@@ -3301,7 +3333,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3301 filemap_write_and_wait(mapping); 3333 filemap_write_and_wait(mapping);
3302 } 3334 }
3303 3335
3304 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 3336 if (EXT4_JOURNAL(inode) &&
3337 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3305 /* 3338 /*
3306 * This is a REALLY heavyweight approach, but the use of 3339 * This is a REALLY heavyweight approach, but the use of
3307 * bmap on dirty files is expected to be extremely rare: 3340 * bmap on dirty files is expected to be extremely rare:
@@ -3320,7 +3353,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3320 * everything they get. 3353 * everything they get.
3321 */ 3354 */
3322 3355
3323 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 3356 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3324 journal = EXT4_JOURNAL(inode); 3357 journal = EXT4_JOURNAL(inode);
3325 jbd2_journal_lock_updates(journal); 3358 jbd2_journal_lock_updates(journal);
3326 err = jbd2_journal_flush(journal); 3359 err = jbd2_journal_flush(journal);
@@ -3345,11 +3378,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3345 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3378 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3346} 3379}
3347 3380
3381static void ext4_free_io_end(ext4_io_end_t *io)
3382{
3383 BUG_ON(!io);
3384 if (io->page)
3385 put_page(io->page);
3386 iput(io->inode);
3387 kfree(io);
3388}
3389
3390static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3391{
3392 struct buffer_head *head, *bh;
3393 unsigned int curr_off = 0;
3394
3395 if (!page_has_buffers(page))
3396 return;
3397 head = bh = page_buffers(page);
3398 do {
3399 if (offset <= curr_off && test_clear_buffer_uninit(bh)
3400 && bh->b_private) {
3401 ext4_free_io_end(bh->b_private);
3402 bh->b_private = NULL;
3403 bh->b_end_io = NULL;
3404 }
3405 curr_off = curr_off + bh->b_size;
3406 bh = bh->b_this_page;
3407 } while (bh != head);
3408}
3409
3348static void ext4_invalidatepage(struct page *page, unsigned long offset) 3410static void ext4_invalidatepage(struct page *page, unsigned long offset)
3349{ 3411{
3350 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3412 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3351 3413
3352 /* 3414 /*
3415 * free any io_end structure allocated for buffers to be discarded
3416 */
3417 if (ext4_should_dioread_nolock(page->mapping->host))
3418 ext4_invalidatepage_free_endio(page, offset);
3419 /*
3353 * If it's a full truncate we just forget about the pending dirtying 3420 * If it's a full truncate we just forget about the pending dirtying
3354 */ 3421 */
3355 if (offset == 0) 3422 if (offset == 0)
@@ -3420,7 +3487,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3420 } 3487 }
3421 3488
3422retry: 3489retry:
3423 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3490 if (rw == READ && ext4_should_dioread_nolock(inode))
3491 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
3492 inode->i_sb->s_bdev, iov,
3493 offset, nr_segs,
3494 ext4_get_block, NULL);
3495 else
3496 ret = blockdev_direct_IO(rw, iocb, inode,
3497 inode->i_sb->s_bdev, iov,
3424 offset, nr_segs, 3498 offset, nr_segs,
3425 ext4_get_block, NULL); 3499 ext4_get_block, NULL);
3426 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3500 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3436,6 +3510,9 @@ retry:
3436 * but cannot extend i_size. Bail out and pretend 3510 * but cannot extend i_size. Bail out and pretend
3437 * the write failed... */ 3511 * the write failed... */
3438 ret = PTR_ERR(handle); 3512 ret = PTR_ERR(handle);
3513 if (inode->i_nlink)
3514 ext4_orphan_del(NULL, inode);
3515
3439 goto out; 3516 goto out;
3440 } 3517 }
3441 if (inode->i_nlink) 3518 if (inode->i_nlink)
@@ -3463,75 +3540,63 @@ out:
3463 return ret; 3540 return ret;
3464} 3541}
3465 3542
3466static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, 3543static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3467 struct buffer_head *bh_result, int create) 3544 struct buffer_head *bh_result, int create)
3468{ 3545{
3469 handle_t *handle = NULL; 3546 handle_t *handle = ext4_journal_current_handle();
3470 int ret = 0; 3547 int ret = 0;
3471 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 3548 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3472 int dio_credits; 3549 int dio_credits;
3550 int started = 0;
3473 3551
3474 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", 3552 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3475 inode->i_ino, create); 3553 inode->i_ino, create);
3476 /* 3554 /*
3477 * DIO VFS code passes create = 0 flag for write to 3555 * ext4_get_block in prepare for a DIO write or buffer write.
3478 * the middle of file. It does this to avoid block 3556 * We allocate an uinitialized extent if blocks haven't been allocated.
3479 * allocation for holes, to prevent expose stale data 3557 * The extent will be converted to initialized after IO complete.
3480 * out when there is parallel buffered read (which does
3481 * not hold the i_mutex lock) while direct IO write has
3482 * not completed. DIO request on holes finally falls back
3483 * to buffered IO for this reason.
3484 *
3485 * For ext4 extent based file, since we support fallocate,
3486 * new allocated extent as uninitialized, for holes, we
3487 * could fallocate blocks for holes, thus parallel
3488 * buffered IO read will zero out the page when read on
3489 * a hole while parallel DIO write to the hole has not completed.
3490 *
3491 * when we come here, we know it's a direct IO write to
3492 * to the middle of file (<i_size)
3493 * so it's safe to override the create flag from VFS.
3494 */ 3558 */
3495 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; 3559 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3496 3560
3497 if (max_blocks > DIO_MAX_BLOCKS) 3561 if (!handle) {
3498 max_blocks = DIO_MAX_BLOCKS; 3562 if (max_blocks > DIO_MAX_BLOCKS)
3499 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 3563 max_blocks = DIO_MAX_BLOCKS;
3500 handle = ext4_journal_start(inode, dio_credits); 3564 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3501 if (IS_ERR(handle)) { 3565 handle = ext4_journal_start(inode, dio_credits);
3502 ret = PTR_ERR(handle); 3566 if (IS_ERR(handle)) {
3503 goto out; 3567 ret = PTR_ERR(handle);
3568 goto out;
3569 }
3570 started = 1;
3504 } 3571 }
3572
3505 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 3573 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3506 create); 3574 create);
3507 if (ret > 0) { 3575 if (ret > 0) {
3508 bh_result->b_size = (ret << inode->i_blkbits); 3576 bh_result->b_size = (ret << inode->i_blkbits);
3509 ret = 0; 3577 ret = 0;
3510 } 3578 }
3511 ext4_journal_stop(handle); 3579 if (started)
3580 ext4_journal_stop(handle);
3512out: 3581out:
3513 return ret; 3582 return ret;
3514} 3583}
3515 3584
3516static void ext4_free_io_end(ext4_io_end_t *io) 3585static void dump_completed_IO(struct inode * inode)
3517{
3518 BUG_ON(!io);
3519 iput(io->inode);
3520 kfree(io);
3521}
3522static void dump_aio_dio_list(struct inode * inode)
3523{ 3586{
3524#ifdef EXT4_DEBUG 3587#ifdef EXT4_DEBUG
3525 struct list_head *cur, *before, *after; 3588 struct list_head *cur, *before, *after;
3526 ext4_io_end_t *io, *io0, *io1; 3589 ext4_io_end_t *io, *io0, *io1;
3590 unsigned long flags;
3527 3591
3528 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3592 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3529 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); 3593 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3530 return; 3594 return;
3531 } 3595 }
3532 3596
3533 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); 3597 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3534 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ 3598 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3599 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3535 cur = &io->list; 3600 cur = &io->list;
3536 before = cur->prev; 3601 before = cur->prev;
3537 io0 = container_of(before, ext4_io_end_t, list); 3602 io0 = container_of(before, ext4_io_end_t, list);
@@ -3541,32 +3606,31 @@ static void dump_aio_dio_list(struct inode * inode)
3541 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 3606 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3542 io, inode->i_ino, io0, io1); 3607 io, inode->i_ino, io0, io1);
3543 } 3608 }
3609 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3544#endif 3610#endif
3545} 3611}
3546 3612
3547/* 3613/*
3548 * check a range of space and convert unwritten extents to written. 3614 * check a range of space and convert unwritten extents to written.
3549 */ 3615 */
3550static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) 3616static int ext4_end_io_nolock(ext4_io_end_t *io)
3551{ 3617{
3552 struct inode *inode = io->inode; 3618 struct inode *inode = io->inode;
3553 loff_t offset = io->offset; 3619 loff_t offset = io->offset;
3554 size_t size = io->size; 3620 ssize_t size = io->size;
3555 int ret = 0; 3621 int ret = 0;
3556 3622
3557 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," 3623 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3558 "list->prev 0x%p\n", 3624 "list->prev 0x%p\n",
3559 io, inode->i_ino, io->list.next, io->list.prev); 3625 io, inode->i_ino, io->list.next, io->list.prev);
3560 3626
3561 if (list_empty(&io->list)) 3627 if (list_empty(&io->list))
3562 return ret; 3628 return ret;
3563 3629
3564 if (io->flag != DIO_AIO_UNWRITTEN) 3630 if (io->flag != EXT4_IO_UNWRITTEN)
3565 return ret; 3631 return ret;
3566 3632
3567 if (offset + size <= i_size_read(inode)) 3633 ret = ext4_convert_unwritten_extents(inode, offset, size);
3568 ret = ext4_convert_unwritten_extents(inode, offset, size);
3569
3570 if (ret < 0) { 3634 if (ret < 0) {
3571 printk(KERN_EMERG "%s: failed to convert unwritten" 3635 printk(KERN_EMERG "%s: failed to convert unwritten"
3572 "extents to written extents, error is %d" 3636 "extents to written extents, error is %d"
@@ -3579,50 +3643,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3579 io->flag = 0; 3643 io->flag = 0;
3580 return ret; 3644 return ret;
3581} 3645}
3646
3582/* 3647/*
3583 * work on completed aio dio IO, to convert unwritten extents to extents 3648 * work on completed aio dio IO, to convert unwritten extents to extents
3584 */ 3649 */
3585static void ext4_end_aio_dio_work(struct work_struct *work) 3650static void ext4_end_io_work(struct work_struct *work)
3586{ 3651{
3587 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 3652 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3588 struct inode *inode = io->inode; 3653 struct inode *inode = io->inode;
3589 int ret = 0; 3654 struct ext4_inode_info *ei = EXT4_I(inode);
3655 unsigned long flags;
3656 int ret;
3590 3657
3591 mutex_lock(&inode->i_mutex); 3658 mutex_lock(&inode->i_mutex);
3592 ret = ext4_end_aio_dio_nolock(io); 3659 ret = ext4_end_io_nolock(io);
3593 if (ret >= 0) { 3660 if (ret < 0) {
3594 if (!list_empty(&io->list)) 3661 mutex_unlock(&inode->i_mutex);
3595 list_del_init(&io->list); 3662 return;
3596 ext4_free_io_end(io);
3597 } 3663 }
3664
3665 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3666 if (!list_empty(&io->list))
3667 list_del_init(&io->list);
3668 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3598 mutex_unlock(&inode->i_mutex); 3669 mutex_unlock(&inode->i_mutex);
3670 ext4_free_io_end(io);
3599} 3671}
3672
3600/* 3673/*
3601 * This function is called from ext4_sync_file(). 3674 * This function is called from ext4_sync_file().
3602 * 3675 *
3603 * When AIO DIO IO is completed, the work to convert unwritten 3676 * When IO is completed, the work to convert unwritten extents to
3604 * extents to written is queued on workqueue but may not get immediately 3677 * written is queued on workqueue but may not get immediately
3605 * scheduled. When fsync is called, we need to ensure the 3678 * scheduled. When fsync is called, we need to ensure the
3606 * conversion is complete before fsync returns. 3679 * conversion is complete before fsync returns.
3607 * The inode keeps track of a list of completed AIO from DIO path 3680 * The inode keeps track of a list of pending/completed IO that
3608 * that might needs to do the conversion. This function walks through 3681 * might needs to do the conversion. This function walks through
3609 * the list and convert the related unwritten extents to written. 3682 * the list and convert the related unwritten extents for completed IO
3683 * to written.
3684 * The function return the number of pending IOs on success.
3610 */ 3685 */
3611int flush_aio_dio_completed_IO(struct inode *inode) 3686int flush_completed_IO(struct inode *inode)
3612{ 3687{
3613 ext4_io_end_t *io; 3688 ext4_io_end_t *io;
3689 struct ext4_inode_info *ei = EXT4_I(inode);
3690 unsigned long flags;
3614 int ret = 0; 3691 int ret = 0;
3615 int ret2 = 0; 3692 int ret2 = 0;
3616 3693
3617 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) 3694 if (list_empty(&ei->i_completed_io_list))
3618 return ret; 3695 return ret;
3619 3696
3620 dump_aio_dio_list(inode); 3697 dump_completed_IO(inode);
3621 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3698 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3622 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, 3699 while (!list_empty(&ei->i_completed_io_list)){
3700 io = list_entry(ei->i_completed_io_list.next,
3623 ext4_io_end_t, list); 3701 ext4_io_end_t, list);
3624 /* 3702 /*
3625 * Calling ext4_end_aio_dio_nolock() to convert completed 3703 * Calling ext4_end_io_nolock() to convert completed
3626 * IO to written. 3704 * IO to written.
3627 * 3705 *
3628 * When ext4_sync_file() is called, run_queue() may already 3706 * When ext4_sync_file() is called, run_queue() may already
@@ -3635,20 +3713,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
3635 * avoid double converting from both fsync and background work 3713 * avoid double converting from both fsync and background work
3636 * queue work. 3714 * queue work.
3637 */ 3715 */
3638 ret = ext4_end_aio_dio_nolock(io); 3716 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3717 ret = ext4_end_io_nolock(io);
3718 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3639 if (ret < 0) 3719 if (ret < 0)
3640 ret2 = ret; 3720 ret2 = ret;
3641 else 3721 else
3642 list_del_init(&io->list); 3722 list_del_init(&io->list);
3643 } 3723 }
3724 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3644 return (ret2 < 0) ? ret2 : 0; 3725 return (ret2 < 0) ? ret2 : 0;
3645} 3726}
3646 3727
3647static ext4_io_end_t *ext4_init_io_end (struct inode *inode) 3728static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3648{ 3729{
3649 ext4_io_end_t *io = NULL; 3730 ext4_io_end_t *io = NULL;
3650 3731
3651 io = kmalloc(sizeof(*io), GFP_NOFS); 3732 io = kmalloc(sizeof(*io), flags);
3652 3733
3653 if (io) { 3734 if (io) {
3654 igrab(inode); 3735 igrab(inode);
@@ -3656,8 +3737,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3656 io->flag = 0; 3737 io->flag = 0;
3657 io->offset = 0; 3738 io->offset = 0;
3658 io->size = 0; 3739 io->size = 0;
3659 io->error = 0; 3740 io->page = NULL;
3660 INIT_WORK(&io->work, ext4_end_aio_dio_work); 3741 INIT_WORK(&io->work, ext4_end_io_work);
3661 INIT_LIST_HEAD(&io->list); 3742 INIT_LIST_HEAD(&io->list);
3662 } 3743 }
3663 3744
@@ -3669,6 +3750,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3669{ 3750{
3670 ext4_io_end_t *io_end = iocb->private; 3751 ext4_io_end_t *io_end = iocb->private;
3671 struct workqueue_struct *wq; 3752 struct workqueue_struct *wq;
3753 unsigned long flags;
3754 struct ext4_inode_info *ei;
3672 3755
3673 /* if not async direct IO or dio with 0 bytes write, just return */ 3756 /* if not async direct IO or dio with 0 bytes write, just return */
3674 if (!io_end || !size) 3757 if (!io_end || !size)
@@ -3680,7 +3763,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3680 size); 3763 size);
3681 3764
3682 /* if not aio dio with unwritten extents, just free io and return */ 3765 /* if not aio dio with unwritten extents, just free io and return */
3683 if (io_end->flag != DIO_AIO_UNWRITTEN){ 3766 if (io_end->flag != EXT4_IO_UNWRITTEN){
3684 ext4_free_io_end(io_end); 3767 ext4_free_io_end(io_end);
3685 iocb->private = NULL; 3768 iocb->private = NULL;
3686 return; 3769 return;
@@ -3688,16 +3771,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3688 3771
3689 io_end->offset = offset; 3772 io_end->offset = offset;
3690 io_end->size = size; 3773 io_end->size = size;
3774 io_end->flag = EXT4_IO_UNWRITTEN;
3691 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3775 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3692 3776
3693 /* queue the work to convert unwritten extents to written */ 3777 /* queue the work to convert unwritten extents to written */
3694 queue_work(wq, &io_end->work); 3778 queue_work(wq, &io_end->work);
3695 3779
3696 /* Add the io_end to per-inode completed aio dio list*/ 3780 /* Add the io_end to per-inode completed aio dio list*/
3697 list_add_tail(&io_end->list, 3781 ei = EXT4_I(io_end->inode);
3698 &EXT4_I(io_end->inode)->i_aio_dio_complete_list); 3782 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3783 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3784 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3699 iocb->private = NULL; 3785 iocb->private = NULL;
3700} 3786}
3787
3788static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3789{
3790 ext4_io_end_t *io_end = bh->b_private;
3791 struct workqueue_struct *wq;
3792 struct inode *inode;
3793 unsigned long flags;
3794
3795 if (!test_clear_buffer_uninit(bh) || !io_end)
3796 goto out;
3797
3798 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3799 printk("sb umounted, discard end_io request for inode %lu\n",
3800 io_end->inode->i_ino);
3801 ext4_free_io_end(io_end);
3802 goto out;
3803 }
3804
3805 io_end->flag = EXT4_IO_UNWRITTEN;
3806 inode = io_end->inode;
3807
3808 /* Add the io_end to per-inode completed io list*/
3809 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3810 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3811 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3812
3813 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3814 /* queue the work to convert unwritten extents to written */
3815 queue_work(wq, &io_end->work);
3816out:
3817 bh->b_private = NULL;
3818 bh->b_end_io = NULL;
3819 clear_buffer_uninit(bh);
3820 end_buffer_async_write(bh, uptodate);
3821}
3822
3823static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3824{
3825 ext4_io_end_t *io_end;
3826 struct page *page = bh->b_page;
3827 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3828 size_t size = bh->b_size;
3829
3830retry:
3831 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3832 if (!io_end) {
3833 if (printk_ratelimit())
3834 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3835 schedule();
3836 goto retry;
3837 }
3838 io_end->offset = offset;
3839 io_end->size = size;
3840 /*
3841 * We need to hold a reference to the page to make sure it
3842 * doesn't get evicted before ext4_end_io_work() has a chance
3843 * to convert the extent from written to unwritten.
3844 */
3845 io_end->page = page;
3846 get_page(io_end->page);
3847
3848 bh->b_private = io_end;
3849 bh->b_end_io = ext4_end_io_buffer_write;
3850 return 0;
3851}
3852
3701/* 3853/*
3702 * For ext4 extent files, ext4 will do direct-io write to holes, 3854 * For ext4 extent files, ext4 will do direct-io write to holes,
3703 * preallocated extents, and those write extend the file, no need to 3855 * preallocated extents, and those write extend the file, no need to
@@ -3751,7 +3903,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3751 iocb->private = NULL; 3903 iocb->private = NULL;
3752 EXT4_I(inode)->cur_aio_dio = NULL; 3904 EXT4_I(inode)->cur_aio_dio = NULL;
3753 if (!is_sync_kiocb(iocb)) { 3905 if (!is_sync_kiocb(iocb)) {
3754 iocb->private = ext4_init_io_end(inode); 3906 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3755 if (!iocb->private) 3907 if (!iocb->private)
3756 return -ENOMEM; 3908 return -ENOMEM;
3757 /* 3909 /*
@@ -3767,7 +3919,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3767 ret = blockdev_direct_IO(rw, iocb, inode, 3919 ret = blockdev_direct_IO(rw, iocb, inode,
3768 inode->i_sb->s_bdev, iov, 3920 inode->i_sb->s_bdev, iov,
3769 offset, nr_segs, 3921 offset, nr_segs,
3770 ext4_get_block_dio_write, 3922 ext4_get_block_write,
3771 ext4_end_io_dio); 3923 ext4_end_io_dio);
3772 if (iocb->private) 3924 if (iocb->private)
3773 EXT4_I(inode)->cur_aio_dio = NULL; 3925 EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3788,8 +3940,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3788 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3940 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3789 ext4_free_io_end(iocb->private); 3941 ext4_free_io_end(iocb->private);
3790 iocb->private = NULL; 3942 iocb->private = NULL;
3791 } else if (ret > 0 && (EXT4_I(inode)->i_state & 3943 } else if (ret > 0 && ext4_test_inode_state(inode,
3792 EXT4_STATE_DIO_UNWRITTEN)) { 3944 EXT4_STATE_DIO_UNWRITTEN)) {
3793 int err; 3945 int err;
3794 /* 3946 /*
3795 * for non AIO case, since the IO is already 3947 * for non AIO case, since the IO is already
@@ -3799,7 +3951,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3799 offset, ret); 3951 offset, ret);
3800 if (err < 0) 3952 if (err < 0)
3801 ret = err; 3953 ret = err;
3802 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; 3954 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3803 } 3955 }
3804 return ret; 3956 return ret;
3805 } 3957 }
@@ -4130,18 +4282,27 @@ no_top:
4130 * We release `count' blocks on disk, but (last - first) may be greater 4282 * We release `count' blocks on disk, but (last - first) may be greater
4131 * than `count' because there can be holes in there. 4283 * than `count' because there can be holes in there.
4132 */ 4284 */
4133static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 4285static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4134 struct buffer_head *bh, 4286 struct buffer_head *bh,
4135 ext4_fsblk_t block_to_free, 4287 ext4_fsblk_t block_to_free,
4136 unsigned long count, __le32 *first, 4288 unsigned long count, __le32 *first,
4137 __le32 *last) 4289 __le32 *last)
4138{ 4290{
4139 __le32 *p; 4291 __le32 *p;
4140 int flags = EXT4_FREE_BLOCKS_FORGET; 4292 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4141 4293
4142 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4294 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4143 flags |= EXT4_FREE_BLOCKS_METADATA; 4295 flags |= EXT4_FREE_BLOCKS_METADATA;
4144 4296
4297 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4298 count)) {
4299 ext4_error(inode->i_sb, "inode #%lu: "
4300 "attempt to clear blocks %llu len %lu, invalid",
4301 inode->i_ino, (unsigned long long) block_to_free,
4302 count);
4303 return 1;
4304 }
4305
4145 if (try_to_extend_transaction(handle, inode)) { 4306 if (try_to_extend_transaction(handle, inode)) {
4146 if (bh) { 4307 if (bh) {
4147 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4308 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4160,6 +4321,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
4160 *p = 0; 4321 *p = 0;
4161 4322
4162 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4323 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4324 return 0;
4163} 4325}
4164 4326
4165/** 4327/**
@@ -4215,9 +4377,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4215 } else if (nr == block_to_free + count) { 4377 } else if (nr == block_to_free + count) {
4216 count++; 4378 count++;
4217 } else { 4379 } else {
4218 ext4_clear_blocks(handle, inode, this_bh, 4380 if (ext4_clear_blocks(handle, inode, this_bh,
4219 block_to_free, 4381 block_to_free, count,
4220 count, block_to_free_p, p); 4382 block_to_free_p, p))
4383 break;
4221 block_to_free = nr; 4384 block_to_free = nr;
4222 block_to_free_p = p; 4385 block_to_free_p = p;
4223 count = 1; 4386 count = 1;
@@ -4241,7 +4404,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4241 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4404 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4242 ext4_handle_dirty_metadata(handle, inode, this_bh); 4405 ext4_handle_dirty_metadata(handle, inode, this_bh);
4243 else 4406 else
4244 ext4_error(inode->i_sb, __func__, 4407 ext4_error(inode->i_sb,
4245 "circular indirect block detected, " 4408 "circular indirect block detected, "
4246 "inode=%lu, block=%llu", 4409 "inode=%lu, block=%llu",
4247 inode->i_ino, 4410 inode->i_ino,
@@ -4281,6 +4444,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4281 if (!nr) 4444 if (!nr)
4282 continue; /* A hole */ 4445 continue; /* A hole */
4283 4446
4447 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4448 nr, 1)) {
4449 ext4_error(inode->i_sb,
4450 "indirect mapped block in inode "
4451 "#%lu invalid (level %d, blk #%lu)",
4452 inode->i_ino, depth,
4453 (unsigned long) nr);
4454 break;
4455 }
4456
4284 /* Go read the buffer for the next level down */ 4457 /* Go read the buffer for the next level down */
4285 bh = sb_bread(inode->i_sb, nr); 4458 bh = sb_bread(inode->i_sb, nr);
4286 4459
@@ -4289,7 +4462,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4289 * (should be rare). 4462 * (should be rare).
4290 */ 4463 */
4291 if (!bh) { 4464 if (!bh) {
4292 ext4_error(inode->i_sb, "ext4_free_branches", 4465 ext4_error(inode->i_sb,
4293 "Read failure, inode=%lu, block=%llu", 4466 "Read failure, inode=%lu, block=%llu",
4294 inode->i_ino, nr); 4467 inode->i_ino, nr);
4295 continue; 4468 continue;
@@ -4433,8 +4606,10 @@ void ext4_truncate(struct inode *inode)
4433 if (!ext4_can_truncate(inode)) 4606 if (!ext4_can_truncate(inode))
4434 return; 4607 return;
4435 4608
4609 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
4610
4436 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4611 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4437 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4612 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4438 4613
4439 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4614 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4440 ext4_ext_truncate(inode); 4615 ext4_ext_truncate(inode);
@@ -4604,9 +4779,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4604 4779
4605 bh = sb_getblk(sb, block); 4780 bh = sb_getblk(sb, block);
4606 if (!bh) { 4781 if (!bh) {
4607 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4782 ext4_error(sb, "unable to read inode block - "
4608 "inode block - inode=%lu, block=%llu", 4783 "inode=%lu, block=%llu", inode->i_ino, block);
4609 inode->i_ino, block);
4610 return -EIO; 4784 return -EIO;
4611 } 4785 }
4612 if (!buffer_uptodate(bh)) { 4786 if (!buffer_uptodate(bh)) {
@@ -4704,9 +4878,8 @@ make_io:
4704 submit_bh(READ_META, bh); 4878 submit_bh(READ_META, bh);
4705 wait_on_buffer(bh); 4879 wait_on_buffer(bh);
4706 if (!buffer_uptodate(bh)) { 4880 if (!buffer_uptodate(bh)) {
4707 ext4_error(sb, __func__, 4881 ext4_error(sb, "unable to read inode block - inode=%lu,"
4708 "unable to read inode block - inode=%lu, " 4882 " block=%llu", inode->i_ino, block);
4709 "block=%llu", inode->i_ino, block);
4710 brelse(bh); 4883 brelse(bh);
4711 return -EIO; 4884 return -EIO;
4712 } 4885 }
@@ -4720,7 +4893,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4720{ 4893{
4721 /* We have all inode data except xattrs in memory here. */ 4894 /* We have all inode data except xattrs in memory here. */
4722 return __ext4_get_inode_loc(inode, iloc, 4895 return __ext4_get_inode_loc(inode, iloc,
4723 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4896 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4724} 4897}
4725 4898
4726void ext4_set_inode_flags(struct inode *inode) 4899void ext4_set_inode_flags(struct inode *inode)
@@ -4814,7 +4987,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4814 } 4987 }
4815 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4988 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4816 4989
4817 ei->i_state = 0; 4990 ei->i_state_flags = 0;
4818 ei->i_dir_start_lookup = 0; 4991 ei->i_dir_start_lookup = 0;
4819 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4992 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4820 /* We now have enough fields to check if the inode was active or not. 4993 /* We now have enough fields to check if the inode was active or not.
@@ -4897,7 +5070,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4897 EXT4_GOOD_OLD_INODE_SIZE + 5070 EXT4_GOOD_OLD_INODE_SIZE +
4898 ei->i_extra_isize; 5071 ei->i_extra_isize;
4899 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 5072 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4900 ei->i_state |= EXT4_STATE_XATTR; 5073 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4901 } 5074 }
4902 } else 5075 } else
4903 ei->i_extra_isize = 0; 5076 ei->i_extra_isize = 0;
@@ -4917,8 +5090,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4917 ret = 0; 5090 ret = 0;
4918 if (ei->i_file_acl && 5091 if (ei->i_file_acl &&
4919 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5092 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4920 ext4_error(sb, __func__, 5093 ext4_error(sb, "bad extended attribute block %llu inode #%lu",
4921 "bad extended attribute block %llu in inode #%lu",
4922 ei->i_file_acl, inode->i_ino); 5094 ei->i_file_acl, inode->i_ino);
4923 ret = -EIO; 5095 ret = -EIO;
4924 goto bad_inode; 5096 goto bad_inode;
@@ -4964,8 +5136,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4964 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5136 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4965 } else { 5137 } else {
4966 ret = -EIO; 5138 ret = -EIO;
4967 ext4_error(inode->i_sb, __func__, 5139 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
4968 "bogus i_mode (%o) for inode=%lu",
4969 inode->i_mode, inode->i_ino); 5140 inode->i_mode, inode->i_ino);
4970 goto bad_inode; 5141 goto bad_inode;
4971 } 5142 }
@@ -5037,7 +5208,7 @@ static int ext4_do_update_inode(handle_t *handle,
5037 5208
5038 /* For fields not not tracking in the in-memory inode, 5209 /* For fields not not tracking in the in-memory inode,
5039 * initialise them to zero for new inodes. */ 5210 * initialise them to zero for new inodes. */
5040 if (ei->i_state & EXT4_STATE_NEW) 5211 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5041 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5212 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5042 5213
5043 ext4_get_inode_flags(ei); 5214 ext4_get_inode_flags(ei);
@@ -5101,7 +5272,7 @@ static int ext4_do_update_inode(handle_t *handle,
5101 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5272 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5102 sb->s_dirt = 1; 5273 sb->s_dirt = 1;
5103 ext4_handle_sync(handle); 5274 ext4_handle_sync(handle);
5104 err = ext4_handle_dirty_metadata(handle, inode, 5275 err = ext4_handle_dirty_metadata(handle, NULL,
5105 EXT4_SB(sb)->s_sbh); 5276 EXT4_SB(sb)->s_sbh);
5106 } 5277 }
5107 } 5278 }
@@ -5130,10 +5301,10 @@ static int ext4_do_update_inode(handle_t *handle,
5130 } 5301 }
5131 5302
5132 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5303 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5133 rc = ext4_handle_dirty_metadata(handle, inode, bh); 5304 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5134 if (!err) 5305 if (!err)
5135 err = rc; 5306 err = rc;
5136 ei->i_state &= ~EXT4_STATE_NEW; 5307 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5137 5308
5138 ext4_update_inode_fsync_trans(handle, inode, 0); 5309 ext4_update_inode_fsync_trans(handle, inode, 0);
5139out_brelse: 5310out_brelse:
@@ -5177,7 +5348,7 @@ out_brelse:
5177 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5348 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5178 * will no longer be on the superblock's dirty inode list. 5349 * will no longer be on the superblock's dirty inode list.
5179 */ 5350 */
5180int ext4_write_inode(struct inode *inode, int wait) 5351int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5181{ 5352{
5182 int err; 5353 int err;
5183 5354
@@ -5191,7 +5362,7 @@ int ext4_write_inode(struct inode *inode, int wait)
5191 return -EIO; 5362 return -EIO;
5192 } 5363 }
5193 5364
5194 if (!wait) 5365 if (wbc->sync_mode != WB_SYNC_ALL)
5195 return 0; 5366 return 0;
5196 5367
5197 err = ext4_force_commit(inode->i_sb); 5368 err = ext4_force_commit(inode->i_sb);
@@ -5201,13 +5372,11 @@ int ext4_write_inode(struct inode *inode, int wait)
5201 err = ext4_get_inode_loc(inode, &iloc); 5372 err = ext4_get_inode_loc(inode, &iloc);
5202 if (err) 5373 if (err)
5203 return err; 5374 return err;
5204 if (wait) 5375 if (wbc->sync_mode == WB_SYNC_ALL)
5205 sync_dirty_buffer(iloc.bh); 5376 sync_dirty_buffer(iloc.bh);
5206 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5377 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5207 ext4_error(inode->i_sb, __func__, 5378 ext4_error(inode->i_sb, "IO error syncing inode, "
5208 "IO error syncing inode, " 5379 "inode=%lu, block=%llu", inode->i_ino,
5209 "inode=%lu, block=%llu",
5210 inode->i_ino,
5211 (unsigned long long)iloc.bh->b_blocknr); 5380 (unsigned long long)iloc.bh->b_blocknr);
5212 err = -EIO; 5381 err = -EIO;
5213 } 5382 }
@@ -5288,7 +5457,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5288 } 5457 }
5289 5458
5290 if (S_ISREG(inode->i_mode) && 5459 if (S_ISREG(inode->i_mode) &&
5291 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 5460 attr->ia_valid & ATTR_SIZE &&
5461 (attr->ia_size < inode->i_size ||
5462 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
5292 handle_t *handle; 5463 handle_t *handle;
5293 5464
5294 handle = ext4_journal_start(inode, 3); 5465 handle = ext4_journal_start(inode, 3);
@@ -5319,6 +5490,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5319 goto err_out; 5490 goto err_out;
5320 } 5491 }
5321 } 5492 }
5493 /* ext4_truncate will clear the flag */
5494 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
5495 ext4_truncate(inode);
5322 } 5496 }
5323 5497
5324 rc = inode_setattr(inode, attr); 5498 rc = inode_setattr(inode, attr);
@@ -5557,8 +5731,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
5557 entry = IFIRST(header); 5731 entry = IFIRST(header);
5558 5732
5559 /* No extended attributes present */ 5733 /* No extended attributes present */
5560 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 5734 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5561 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5735 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5562 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5736 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5563 new_extra_isize); 5737 new_extra_isize);
5564 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5738 EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5602,7 +5776,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5602 err = ext4_reserve_inode_write(handle, inode, &iloc); 5776 err = ext4_reserve_inode_write(handle, inode, &iloc);
5603 if (ext4_handle_valid(handle) && 5777 if (ext4_handle_valid(handle) &&
5604 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5778 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5605 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 5779 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5606 /* 5780 /*
5607 * We need extra buffer credits since we may write into EA block 5781 * We need extra buffer credits since we may write into EA block
5608 * with this same handle. If journal_extend fails, then it will 5782 * with this same handle. If journal_extend fails, then it will
@@ -5616,10 +5790,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5616 sbi->s_want_extra_isize, 5790 sbi->s_want_extra_isize,
5617 iloc, handle); 5791 iloc, handle);
5618 if (ret) { 5792 if (ret) {
5619 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5793 ext4_set_inode_state(inode,
5794 EXT4_STATE_NO_EXPAND);
5620 if (mnt_count != 5795 if (mnt_count !=
5621 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5796 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5622 ext4_warning(inode->i_sb, __func__, 5797 ext4_warning(inode->i_sb,
5623 "Unable to expand inode %lu. Delete" 5798 "Unable to expand inode %lu. Delete"
5624 " some EAs or run e2fsck.", 5799 " some EAs or run e2fsck.",
5625 inode->i_ino); 5800 inode->i_ino);
@@ -5683,7 +5858,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5683 err = jbd2_journal_get_write_access(handle, iloc.bh); 5858 err = jbd2_journal_get_write_access(handle, iloc.bh);
5684 if (!err) 5859 if (!err)
5685 err = ext4_handle_dirty_metadata(handle, 5860 err = ext4_handle_dirty_metadata(handle,
5686 inode, 5861 NULL,
5687 iloc.bh); 5862 iloc.bh);
5688 brelse(iloc.bh); 5863 brelse(iloc.bh);
5689 } 5864 }
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b63d193126db..016d0249294f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
92 flags &= ~EXT4_EXTENTS_FL; 92 flags &= ~EXT4_EXTENTS_FL;
93 } 93 }
94 94
95 if (flags & EXT4_EOFBLOCKS_FL) {
96 /* we don't support adding EOFBLOCKS flag */
97 if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
98 err = -EOPNOTSUPP;
99 goto flags_out;
100 }
101 } else if (oldflags & EXT4_EOFBLOCKS_FL)
102 ext4_truncate(inode);
103
95 handle = ext4_journal_start(inode, 1); 104 handle = ext4_journal_start(inode, 1);
96 if (IS_ERR(handle)) { 105 if (IS_ERR(handle)) {
97 err = PTR_ERR(handle); 106 err = PTR_ERR(handle);
@@ -249,7 +258,8 @@ setversion_out:
249 if (me.moved_len > 0) 258 if (me.moved_len > 0)
250 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
251 260
252 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me)))
253 err = -EFAULT; 263 err = -EFAULT;
254mext_out: 264mext_out:
255 fput(donor_filp); 265 fput(donor_filp);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d34afad3e137..abb11e328b65 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -441,10 +441,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
441 for (i = 0; i < count; i++) { 441 for (i = 0; i < count; i++) {
442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
443 ext4_fsblk_t blocknr; 443 ext4_fsblk_t blocknr;
444 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 444
445 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
445 blocknr += first + i; 446 blocknr += first + i;
446 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 ext4_grp_locked_error(sb, e4b->bd_group, 447 ext4_grp_locked_error(sb, e4b->bd_group,
449 __func__, "double-free of inode" 448 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %u)", 449 " %lu's block %llu(bit %u in group %u)",
@@ -1255,10 +1254,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1255 1254
1256 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1255 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1257 ext4_fsblk_t blocknr; 1256 ext4_fsblk_t blocknr;
1258 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 1257
1258 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1259 blocknr += block; 1259 blocknr += block;
1260 blocknr +=
1261 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1262 ext4_grp_locked_error(sb, e4b->bd_group, 1260 ext4_grp_locked_error(sb, e4b->bd_group,
1263 __func__, "double-free of inode" 1261 __func__, "double-free of inode"
1264 " %lu's block %llu(bit %u in group %u)", 1262 " %lu's block %llu(bit %u in group %u)",
@@ -1631,7 +1629,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1631 int max; 1629 int max;
1632 int err; 1630 int err;
1633 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1631 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1634 struct ext4_super_block *es = sbi->s_es;
1635 struct ext4_free_extent ex; 1632 struct ext4_free_extent ex;
1636 1633
1637 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1634 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1645,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1648 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1645 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1649 ext4_fsblk_t start; 1646 ext4_fsblk_t start;
1650 1647
1651 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + 1648 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1652 ex.fe_start + le32_to_cpu(es->s_first_data_block); 1649 ex.fe_start;
1653 /* use do_div to get remainder (would be 64-bit modulo) */ 1650 /* use do_div to get remainder (would be 64-bit modulo) */
1654 if (do_div(start, sbi->s_stripe) == 0) { 1651 if (do_div(start, sbi->s_stripe) == 0) {
1655 ac->ac_found++; 1652 ac->ac_found++;
@@ -1803,8 +1800,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1803 BUG_ON(sbi->s_stripe == 0); 1800 BUG_ON(sbi->s_stripe == 0);
1804 1801
1805 /* find first stripe-aligned block in group */ 1802 /* find first stripe-aligned block in group */
1806 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) 1803 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1807 + le32_to_cpu(sbi->s_es->s_first_data_block); 1804
1808 a = first_group_block + sbi->s_stripe - 1; 1805 a = first_group_block + sbi->s_stripe - 1;
1809 do_div(a, sbi->s_stripe); 1806 do_div(a, sbi->s_stripe);
1810 i = (a * sbi->s_stripe) - first_group_block; 1807 i = (a * sbi->s_stripe) - first_group_block;
@@ -2256,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2256 2253
2257 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2254 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2258 init_rwsem(&meta_group_info[i]->alloc_sem); 2255 init_rwsem(&meta_group_info[i]->alloc_sem);
2259 meta_group_info[i]->bb_free_root.rb_node = NULL; 2256 meta_group_info[i]->bb_free_root = RB_ROOT;
2260 2257
2261#ifdef DOUBLE_CHECK 2258#ifdef DOUBLE_CHECK
2262 { 2259 {
@@ -2560,12 +2557,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2560 ext4_unlock_group(sb, entry->group); 2557 ext4_unlock_group(sb, entry->group);
2561 if (test_opt(sb, DISCARD)) { 2558 if (test_opt(sb, DISCARD)) {
2562 ext4_fsblk_t discard_block; 2559 ext4_fsblk_t discard_block;
2563 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
2564 2560
2565 discard_block = (ext4_fsblk_t)entry->group * 2561 discard_block = entry->start_blk +
2566 EXT4_BLOCKS_PER_GROUP(sb) 2562 ext4_group_first_block_no(sb, entry->group);
2567 + entry->start_blk
2568 + le32_to_cpu(es->s_first_data_block);
2569 trace_ext4_discard_blocks(sb, 2563 trace_ext4_discard_blocks(sb,
2570 (unsigned long long)discard_block, 2564 (unsigned long long)discard_block,
2571 entry->count); 2565 entry->count);
@@ -2703,14 +2697,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2703 if (err) 2697 if (err)
2704 goto out_err; 2698 goto out_err;
2705 2699
2706 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) 2700 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2707 + ac->ac_b_ex.fe_start
2708 + le32_to_cpu(es->s_first_data_block);
2709 2701
2710 len = ac->ac_b_ex.fe_len; 2702 len = ac->ac_b_ex.fe_len;
2711 if (!ext4_data_block_valid(sbi, block, len)) { 2703 if (!ext4_data_block_valid(sbi, block, len)) {
2712 ext4_error(sb, __func__, 2704 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2713 "Allocating blocks %llu-%llu which overlap "
2714 "fs metadata\n", block, block+len); 2705 "fs metadata\n", block, block+len);
2715 /* File system mounted not to panic on error 2706 /* File system mounted not to panic on error
2716 * Fix the bitmap and repeat the block allocation 2707 * Fix the bitmap and repeat the block allocation
@@ -3161,9 +3152,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3161 /* The max size of hash table is PREALLOC_TB_SIZE */ 3152 /* The max size of hash table is PREALLOC_TB_SIZE */
3162 order = PREALLOC_TB_SIZE - 1; 3153 order = PREALLOC_TB_SIZE - 1;
3163 3154
3164 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + 3155 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3165 ac->ac_g_ex.fe_start +
3166 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
3167 /* 3156 /*
3168 * search for the prealloc space that is having 3157 * search for the prealloc space that is having
3169 * minimal distance from the goal block. 3158 * minimal distance from the goal block.
@@ -3526,8 +3515,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3526 if (bit >= end) 3515 if (bit >= end)
3527 break; 3516 break;
3528 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3517 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3529 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3518 start = ext4_group_first_block_no(sb, group) + bit;
3530 le32_to_cpu(sbi->s_es->s_first_data_block);
3531 mb_debug(1, " free preallocated %u/%u in group %u\n", 3519 mb_debug(1, " free preallocated %u/%u in group %u\n",
3532 (unsigned) start, (unsigned) next - bit, 3520 (unsigned) start, (unsigned) next - bit,
3533 (unsigned) group); 3521 (unsigned) group);
@@ -3623,15 +3611,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3623 3611
3624 bitmap_bh = ext4_read_block_bitmap(sb, group); 3612 bitmap_bh = ext4_read_block_bitmap(sb, group);
3625 if (bitmap_bh == NULL) { 3613 if (bitmap_bh == NULL) {
3626 ext4_error(sb, __func__, "Error in reading block " 3614 ext4_error(sb, "Error reading block bitmap for %u", group);
3627 "bitmap for %u", group);
3628 return 0; 3615 return 0;
3629 } 3616 }
3630 3617
3631 err = ext4_mb_load_buddy(sb, group, &e4b); 3618 err = ext4_mb_load_buddy(sb, group, &e4b);
3632 if (err) { 3619 if (err) {
3633 ext4_error(sb, __func__, "Error in loading buddy " 3620 ext4_error(sb, "Error loading buddy information for %u", group);
3634 "information for %u", group);
3635 put_bh(bitmap_bh); 3621 put_bh(bitmap_bh);
3636 return 0; 3622 return 0;
3637 } 3623 }
@@ -3804,15 +3790,15 @@ repeat:
3804 3790
3805 err = ext4_mb_load_buddy(sb, group, &e4b); 3791 err = ext4_mb_load_buddy(sb, group, &e4b);
3806 if (err) { 3792 if (err) {
3807 ext4_error(sb, __func__, "Error in loading buddy " 3793 ext4_error(sb, "Error loading buddy information for %u",
3808 "information for %u", group); 3794 group);
3809 continue; 3795 continue;
3810 } 3796 }
3811 3797
3812 bitmap_bh = ext4_read_block_bitmap(sb, group); 3798 bitmap_bh = ext4_read_block_bitmap(sb, group);
3813 if (bitmap_bh == NULL) { 3799 if (bitmap_bh == NULL) {
3814 ext4_error(sb, __func__, "Error in reading block " 3800 ext4_error(sb, "Error reading block bitmap for %u",
3815 "bitmap for %u", group); 3801 group);
3816 ext4_mb_release_desc(&e4b); 3802 ext4_mb_release_desc(&e4b);
3817 continue; 3803 continue;
3818 } 3804 }
@@ -3938,7 +3924,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3938 3924
3939 /* don't use group allocation for large files */ 3925 /* don't use group allocation for large files */
3940 size = max(size, isize); 3926 size = max(size, isize);
3941 if (size >= sbi->s_mb_stream_request) { 3927 if (size > sbi->s_mb_stream_request) {
3942 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3928 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
3943 return; 3929 return;
3944 } 3930 }
@@ -4077,8 +4063,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4077 4063
4078 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4064 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4079 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4065 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4080 ext4_error(sb, __func__, "Error in loading buddy " 4066 ext4_error(sb, "Error loading buddy information for %u",
4081 "information for %u", group); 4067 group);
4082 continue; 4068 continue;
4083 } 4069 }
4084 ext4_lock_group(sb, group); 4070 ext4_lock_group(sb, group);
@@ -4476,10 +4462,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4476 4462
4477 sbi = EXT4_SB(sb); 4463 sbi = EXT4_SB(sb);
4478 es = EXT4_SB(sb)->s_es; 4464 es = EXT4_SB(sb)->s_es;
4479 if (!ext4_data_block_valid(sbi, block, count)) { 4465 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4480 ext4_error(sb, __func__, 4466 !ext4_data_block_valid(sbi, block, count)) {
4481 "Freeing blocks not in datazone - " 4467 ext4_error(sb, "Freeing blocks not in datazone - "
4482 "block = %llu, count = %lu", block, count); 4468 "block = %llu, count = %lu", block, count);
4483 goto error_return; 4469 goto error_return;
4484 } 4470 }
4485 4471
@@ -4547,8 +4533,7 @@ do_more:
4547 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4533 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4548 EXT4_SB(sb)->s_itb_per_group)) { 4534 EXT4_SB(sb)->s_itb_per_group)) {
4549 4535
4550 ext4_error(sb, __func__, 4536 ext4_error(sb, "Freeing blocks in system zone - "
4551 "Freeing blocks in system zone - "
4552 "Block = %llu, count = %lu", block, count); 4537 "Block = %llu, count = %lu", block, count);
4553 /* err = 0. ext4_std_error should be a no op */ 4538 /* err = 0. ext4_std_error should be a no op */
4554 goto error_return; 4539 goto error_return;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 436521cae456..b619322c76f0 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -220,16 +220,9 @@ struct ext4_buddy {
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
222 222
223#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
224
225static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 223static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
226 struct ext4_free_extent *fex) 224 struct ext4_free_extent *fex)
227{ 225{
228 ext4_fsblk_t block; 226 return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
229
230 block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
231 + fex->fe_start
232 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
233 return block;
234} 227}
235#endif 228#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 81415814b00b..8b87bd0eac95 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
365 * happened after we started the migrate. We need to 365 * happened after we started the migrate. We need to
366 * fail the migrate 366 * fail the migrate
367 */ 367 */
368 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { 368 if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
369 retval = -EAGAIN; 369 retval = -EAGAIN;
370 up_write(&EXT4_I(inode)->i_data_sem); 370 up_write(&EXT4_I(inode)->i_data_sem);
371 goto err_out; 371 goto err_out;
372 } else 372 } else
373 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 373 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
374 /* 374 /*
375 * We have the extent map build with the tmp inode. 375 * We have the extent map build with the tmp inode.
376 * Now copy the i_data across 376 * Now copy the i_data across
@@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode)
503 } 503 }
504 i_size_write(tmp_inode, i_size_read(inode)); 504 i_size_write(tmp_inode, i_size_read(inode));
505 /* 505 /*
506 * We don't want the inode to be reclaimed 506 * Set the i_nlink to zero so it will be deleted later
507 * if we got interrupted in between. We have 507 * when we drop inode reference.
508 * this tmp inode carrying reference to the
509 * data blocks of the original file. We set
510 * the i_nlink to zero at the last stage after
511 * switching the original file to extent format
512 */ 508 */
513 tmp_inode->i_nlink = 1; 509 tmp_inode->i_nlink = 0;
514 510
515 ext4_ext_tree_init(handle, tmp_inode); 511 ext4_ext_tree_init(handle, tmp_inode);
516 ext4_orphan_add(handle, tmp_inode); 512 ext4_orphan_add(handle, tmp_inode);
@@ -533,10 +529,20 @@ int ext4_ext_migrate(struct inode *inode)
533 * allocation. 529 * allocation.
534 */ 530 */
535 down_read((&EXT4_I(inode)->i_data_sem)); 531 down_read((&EXT4_I(inode)->i_data_sem));
536 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; 532 ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
537 up_read((&EXT4_I(inode)->i_data_sem)); 533 up_read((&EXT4_I(inode)->i_data_sem));
538 534
539 handle = ext4_journal_start(inode, 1); 535 handle = ext4_journal_start(inode, 1);
536 if (IS_ERR(handle)) {
537 /*
538 * It is impossible to update on-disk structures without
539 * a handle, so just rollback in-core changes and live other
540 * work to orphan_list_cleanup()
541 */
542 ext4_orphan_del(NULL, tmp_inode);
543 retval = PTR_ERR(handle);
544 goto out;
545 }
540 546
541 ei = EXT4_I(inode); 547 ei = EXT4_I(inode);
542 i_data = ei->i_data; 548 i_data = ei->i_data;
@@ -618,15 +624,8 @@ err_out:
618 624
619 /* Reset the extent details */ 625 /* Reset the extent details */
620 ext4_ext_tree_init(handle, tmp_inode); 626 ext4_ext_tree_init(handle, tmp_inode);
621
622 /*
623 * Set the i_nlink to zero so that
624 * generic_drop_inode really deletes the
625 * inode
626 */
627 tmp_inode->i_nlink = 0;
628
629 ext4_journal_stop(handle); 627 ext4_journal_stop(handle);
628out:
630 unlock_new_inode(tmp_inode); 629 unlock_new_inode(tmp_inode);
631 iput(tmp_inode); 630 iput(tmp_inode);
632 631
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 82c415be87a4..aa5fe28d180f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -152,12 +152,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
152 int ret = 0; 152 int ret = 0;
153 153
154 if (inode1 == NULL) { 154 if (inode1 == NULL) {
155 ext4_error(inode2->i_sb, function, 155 __ext4_error(inode2->i_sb, function,
156 "Both inodes should not be NULL: " 156 "Both inodes should not be NULL: "
157 "inode1 NULL inode2 %lu", inode2->i_ino); 157 "inode1 NULL inode2 %lu", inode2->i_ino);
158 ret = -EIO; 158 ret = -EIO;
159 } else if (inode2 == NULL) { 159 } else if (inode2 == NULL) {
160 ext4_error(inode1->i_sb, function, 160 __ext4_error(inode1->i_sb, function,
161 "Both inodes should not be NULL: " 161 "Both inodes should not be NULL: "
162 "inode1 %lu inode2 NULL", inode1->i_ino); 162 "inode1 %lu inode2 NULL", inode1->i_ino);
163 ret = -EIO; 163 ret = -EIO;
@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
252 } 252 }
253 253
254 o_start->ee_len = start_ext->ee_len; 254 o_start->ee_len = start_ext->ee_len;
255 eblock = le32_to_cpu(start_ext->ee_block);
255 new_flag = 1; 256 new_flag = 1;
256 257
257 } else if (start_ext->ee_len && new_ext->ee_len && 258 } else if (start_ext->ee_len && new_ext->ee_len &&
@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
262 * orig |------------------------------| 263 * orig |------------------------------|
263 */ 264 */
264 o_start->ee_len = start_ext->ee_len; 265 o_start->ee_len = start_ext->ee_len;
266 eblock = le32_to_cpu(start_ext->ee_block);
265 new_flag = 1; 267 new_flag = 1;
266 268
267 } else if (!start_ext->ee_len && new_ext->ee_len && 269 } else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -475,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
475 struct ext4_extent *oext, *o_start, *o_end, *prev_ext; 477 struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
476 struct ext4_extent new_ext, start_ext, end_ext; 478 struct ext4_extent new_ext, start_ext, end_ext;
477 ext4_lblk_t new_ext_end; 479 ext4_lblk_t new_ext_end;
478 ext4_fsblk_t new_phys_end;
479 int oext_alen, new_ext_alen, end_ext_alen; 480 int oext_alen, new_ext_alen, end_ext_alen;
480 int depth = ext_depth(orig_inode); 481 int depth = ext_depth(orig_inode);
481 int ret; 482 int ret;
@@ -489,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
489 new_ext.ee_len = dext->ee_len; 490 new_ext.ee_len = dext->ee_len;
490 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 491 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
491 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 492 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
492 new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
493 493
494 /* 494 /*
495 * Case: original extent is first 495 * Case: original extent is first
@@ -502,6 +502,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
502 le32_to_cpu(oext->ee_block) + oext_alen) { 502 le32_to_cpu(oext->ee_block) + oext_alen) {
503 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - 503 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
504 le32_to_cpu(oext->ee_block)); 504 le32_to_cpu(oext->ee_block));
505 start_ext.ee_block = oext->ee_block;
505 copy_extent_status(oext, &start_ext); 506 copy_extent_status(oext, &start_ext);
506 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { 507 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
507 prev_ext = oext - 1; 508 prev_ext = oext - 1;
@@ -515,6 +516,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
515 start_ext.ee_len = cpu_to_le16( 516 start_ext.ee_len = cpu_to_le16(
516 ext4_ext_get_actual_len(prev_ext) + 517 ext4_ext_get_actual_len(prev_ext) +
517 new_ext_alen); 518 new_ext_alen);
519 start_ext.ee_block = oext->ee_block;
518 copy_extent_status(prev_ext, &start_ext); 520 copy_extent_status(prev_ext, &start_ext);
519 new_ext.ee_len = 0; 521 new_ext.ee_len = 0;
520 } 522 }
@@ -526,7 +528,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
526 * new_ext |-------| 528 * new_ext |-------|
527 */ 529 */
528 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 530 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
529 ext4_error(orig_inode->i_sb, __func__, 531 ext4_error(orig_inode->i_sb,
530 "new_ext_end(%u) should be less than or equal to " 532 "new_ext_end(%u) should be less than or equal to "
531 "oext->ee_block(%u) + oext_alen(%d) - 1", 533 "oext->ee_block(%u) + oext_alen(%d) - 1",
532 new_ext_end, le32_to_cpu(oext->ee_block), 534 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -689,12 +691,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
689 while (1) { 691 while (1) {
690 /* The extent for donor must be found. */ 692 /* The extent for donor must be found. */
691 if (!dext) { 693 if (!dext) {
692 ext4_error(donor_inode->i_sb, __func__, 694 ext4_error(donor_inode->i_sb,
693 "The extent for donor must be found"); 695 "The extent for donor must be found");
694 *err = -EIO; 696 *err = -EIO;
695 goto out; 697 goto out;
696 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 698 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
697 ext4_error(donor_inode->i_sb, __func__, 699 ext4_error(donor_inode->i_sb,
698 "Donor offset(%u) and the first block of donor " 700 "Donor offset(%u) and the first block of donor "
699 "extent(%u) should be equal", 701 "extent(%u) should be equal",
700 donor_off, 702 donor_off,
@@ -928,7 +930,7 @@ out2:
928} 930}
929 931
930/** 932/**
931 * mext_check_argumants - Check whether move extent can be done 933 * mext_check_arguments - Check whether move extent can be done
932 * 934 *
933 * @orig_inode: original inode 935 * @orig_inode: original inode
934 * @donor_inode: donor inode 936 * @donor_inode: donor inode
@@ -949,14 +951,6 @@ mext_check_arguments(struct inode *orig_inode,
949 unsigned int blkbits = orig_inode->i_blkbits; 951 unsigned int blkbits = orig_inode->i_blkbits;
950 unsigned int blocksize = 1 << blkbits; 952 unsigned int blocksize = 1 << blkbits;
951 953
952 /* Regular file check */
953 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
954 ext4_debug("ext4 move extent: The argument files should be "
955 "regular file [ino:orig %lu, donor %lu]\n",
956 orig_inode->i_ino, donor_inode->i_ino);
957 return -EINVAL;
958 }
959
960 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 954 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
961 ext4_debug("ext4 move extent: suid or sgid is set" 955 ext4_debug("ext4 move extent: suid or sgid is set"
962 " to donor file [ino:orig %lu, donor %lu]\n", 956 " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1204,6 +1198,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1204 return -EINVAL; 1198 return -EINVAL;
1205 } 1199 }
1206 1200
1201 /* Regular file check */
1202 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
1203 ext4_debug("ext4 move extent: The argument files should be "
1204 "regular file [ino:orig %lu, donor %lu]\n",
1205 orig_inode->i_ino, donor_inode->i_ino);
1206 return -EINVAL;
1207 }
1208
1207 /* Protect orig and donor inodes against a truncate */ 1209 /* Protect orig and donor inodes against a truncate */
1208 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1210 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1209 if (ret1 < 0) 1211 if (ret1 < 0)
@@ -1351,7 +1353,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1351 if (ret1 < 0) 1353 if (ret1 < 0)
1352 break; 1354 break;
1353 if (*moved_len > len) { 1355 if (*moved_len > len) {
1354 ext4_error(orig_inode->i_sb, __func__, 1356 ext4_error(orig_inode->i_sb,
1355 "We replaced blocks too much! " 1357 "We replaced blocks too much! "
1356 "sum of replaced: %llu requested: %llu", 1358 "sum of replaced: %llu requested: %llu",
1357 *moved_len, len); 1359 *moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 17a17e10dd60..608d21f873ec 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
383 if (root->info.hash_version != DX_HASH_TEA && 383 if (root->info.hash_version != DX_HASH_TEA &&
384 root->info.hash_version != DX_HASH_HALF_MD4 && 384 root->info.hash_version != DX_HASH_HALF_MD4 &&
385 root->info.hash_version != DX_HASH_LEGACY) { 385 root->info.hash_version != DX_HASH_LEGACY) {
386 ext4_warning(dir->i_sb, __func__, 386 ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
387 "Unrecognised inode hash code %d",
388 root->info.hash_version); 387 root->info.hash_version);
389 brelse(bh); 388 brelse(bh);
390 *err = ERR_BAD_DX_DIR; 389 *err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
399 hash = hinfo->hash; 398 hash = hinfo->hash;
400 399
401 if (root->info.unused_flags & 1) { 400 if (root->info.unused_flags & 1) {
402 ext4_warning(dir->i_sb, __func__, 401 ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
403 "Unimplemented inode hash flags: %#06x",
404 root->info.unused_flags); 402 root->info.unused_flags);
405 brelse(bh); 403 brelse(bh);
406 *err = ERR_BAD_DX_DIR; 404 *err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
408 } 406 }
409 407
410 if ((indirect = root->info.indirect_levels) > 1) { 408 if ((indirect = root->info.indirect_levels) > 1) {
411 ext4_warning(dir->i_sb, __func__, 409 ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
412 "Unimplemented inode hash depth: %#06x",
413 root->info.indirect_levels); 410 root->info.indirect_levels);
414 brelse(bh); 411 brelse(bh);
415 *err = ERR_BAD_DX_DIR; 412 *err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
421 418
422 if (dx_get_limit(entries) != dx_root_limit(dir, 419 if (dx_get_limit(entries) != dx_root_limit(dir,
423 root->info.info_length)) { 420 root->info.info_length)) {
424 ext4_warning(dir->i_sb, __func__, 421 ext4_warning(dir->i_sb, "dx entry: limit != root limit");
425 "dx entry: limit != root limit");
426 brelse(bh); 422 brelse(bh);
427 *err = ERR_BAD_DX_DIR; 423 *err = ERR_BAD_DX_DIR;
428 goto fail; 424 goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
433 { 429 {
434 count = dx_get_count(entries); 430 count = dx_get_count(entries);
435 if (!count || count > dx_get_limit(entries)) { 431 if (!count || count > dx_get_limit(entries)) {
436 ext4_warning(dir->i_sb, __func__, 432 ext4_warning(dir->i_sb,
437 "dx entry: no count or count > limit"); 433 "dx entry: no count or count > limit");
438 brelse(bh); 434 brelse(bh);
439 *err = ERR_BAD_DX_DIR; 435 *err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
478 goto fail2; 474 goto fail2;
479 at = entries = ((struct dx_node *) bh->b_data)->entries; 475 at = entries = ((struct dx_node *) bh->b_data)->entries;
480 if (dx_get_limit(entries) != dx_node_limit (dir)) { 476 if (dx_get_limit(entries) != dx_node_limit (dir)) {
481 ext4_warning(dir->i_sb, __func__, 477 ext4_warning(dir->i_sb,
482 "dx entry: limit != node limit"); 478 "dx entry: limit != node limit");
483 brelse(bh); 479 brelse(bh);
484 *err = ERR_BAD_DX_DIR; 480 *err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
494 } 490 }
495fail: 491fail:
496 if (*err == ERR_BAD_DX_DIR) 492 if (*err == ERR_BAD_DX_DIR)
497 ext4_warning(dir->i_sb, __func__, 493 ext4_warning(dir->i_sb,
498 "Corrupt dir inode %ld, running e2fsck is " 494 "Corrupt dir inode %ld, running e2fsck is "
499 "recommended.", dir->i_ino); 495 "recommended.", dir->i_ino);
500 return NULL; 496 return NULL;
@@ -947,9 +943,8 @@ restart:
947 wait_on_buffer(bh); 943 wait_on_buffer(bh);
948 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
949 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
950 ext4_error(sb, __func__, "reading directory #%lu " 946 ext4_error(sb, "reading directory #%lu offset %lu",
951 "offset %lu", dir->i_ino, 947 dir->i_ino, (unsigned long)block);
952 (unsigned long)block);
953 brelse(bh); 948 brelse(bh);
954 goto next; 949 goto next;
955 } 950 }
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1041 retval = ext4_htree_next_block(dir, hash, frame, 1036 retval = ext4_htree_next_block(dir, hash, frame,
1042 frames, NULL); 1037 frames, NULL);
1043 if (retval < 0) { 1038 if (retval < 0) {
1044 ext4_warning(sb, __func__, 1039 ext4_warning(sb,
1045 "error reading index page in directory #%lu", 1040 "error reading index page in directory #%lu",
1046 dir->i_ino); 1041 dir->i_ino);
1047 *err = retval; 1042 *err = retval;
@@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1071 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1072 brelse(bh); 1067 brelse(bh);
1073 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1074 ext4_error(dir->i_sb, "ext4_lookup", 1069 ext4_error(dir->i_sb, "bad inode number: %u", ino);
1075 "bad inode number: %u", ino);
1076 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1077 } 1071 }
1078 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1079 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1080 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1081 ext4_error(dir->i_sb, __func__, 1075 ext4_error(dir->i_sb,
1082 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1083 ino); 1077 ino);
1084 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
@@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1110 brelse(bh); 1104 brelse(bh);
1111 1105
1112 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1113 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1107 ext4_error(child->d_inode->i_sb,
1114 "bad inode number: %u", ino); 1108 "bad inode number: %u", ino);
1115 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1116 } 1110 }
@@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1410 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1411 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1412 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1413 ext4_error(dir->i_sb, __func__, 1407 ext4_error(dir->i_sb,
1414 "invalid rec_len for '..' in inode %lu", 1408 "invalid rec_len for '..' in inode %lu",
1415 dir->i_ino); 1409 dir->i_ino);
1416 brelse(bh); 1410 brelse(bh);
@@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1575 1569
1576 if (levels && (dx_get_count(frames->entries) == 1570 if (levels && (dx_get_count(frames->entries) ==
1577 dx_get_limit(frames->entries))) { 1571 dx_get_limit(frames->entries))) {
1578 ext4_warning(sb, __func__, 1572 ext4_warning(sb, "Directory index full!");
1579 "Directory index full!");
1580 err = -ENOSPC; 1573 err = -ENOSPC;
1581 goto cleanup; 1574 goto cleanup;
1582 } 1575 }
@@ -1916,11 +1909,11 @@ static int empty_dir(struct inode *inode)
1916 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1909 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1917 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1910 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1918 if (err) 1911 if (err)
1919 ext4_error(inode->i_sb, __func__, 1912 ext4_error(inode->i_sb,
1920 "error %d reading directory #%lu offset 0", 1913 "error %d reading directory #%lu offset 0",
1921 err, inode->i_ino); 1914 err, inode->i_ino);
1922 else 1915 else
1923 ext4_warning(inode->i_sb, __func__, 1916 ext4_warning(inode->i_sb,
1924 "bad directory (dir #%lu) - no data block", 1917 "bad directory (dir #%lu) - no data block",
1925 inode->i_ino); 1918 inode->i_ino);
1926 return 1; 1919 return 1;
@@ -1931,7 +1924,7 @@ static int empty_dir(struct inode *inode)
1931 !le32_to_cpu(de1->inode) || 1924 !le32_to_cpu(de1->inode) ||
1932 strcmp(".", de->name) || 1925 strcmp(".", de->name) ||
1933 strcmp("..", de1->name)) { 1926 strcmp("..", de1->name)) {
1934 ext4_warning(inode->i_sb, "empty_dir", 1927 ext4_warning(inode->i_sb,
1935 "bad directory (dir #%lu) - no `.' or `..'", 1928 "bad directory (dir #%lu) - no `.' or `..'",
1936 inode->i_ino); 1929 inode->i_ino);
1937 brelse(bh); 1930 brelse(bh);
@@ -1949,7 +1942,7 @@ static int empty_dir(struct inode *inode)
1949 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1942 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1950 if (!bh) { 1943 if (!bh) {
1951 if (err) 1944 if (err)
1952 ext4_error(sb, __func__, 1945 ext4_error(sb,
1953 "error %d reading directory" 1946 "error %d reading directory"
1954 " #%lu offset %u", 1947 " #%lu offset %u",
1955 err, inode->i_ino, offset); 1948 err, inode->i_ino, offset);
@@ -2020,11 +2013,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2020 err = ext4_reserve_inode_write(handle, inode, &iloc); 2013 err = ext4_reserve_inode_write(handle, inode, &iloc);
2021 if (err) 2014 if (err)
2022 goto out_unlock; 2015 goto out_unlock;
2016 /*
2017 * Due to previous errors inode may be already a part of on-disk
2018 * orphan list. If so skip on-disk list modification.
2019 */
2020 if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
2021 (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
2022 goto mem_insert;
2023 2023
2024 /* Insert this inode at the head of the on-disk orphan list... */ 2024 /* Insert this inode at the head of the on-disk orphan list... */
2025 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 2025 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
2026 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 2026 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2027 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); 2027 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
2028 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 2028 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
2029 if (!err) 2029 if (!err)
2030 err = rc; 2030 err = rc;
@@ -2037,6 +2037,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2037 * 2037 *
2038 * This is safe: on error we're going to ignore the orphan list 2038 * This is safe: on error we're going to ignore the orphan list
2039 * anyway on the next recovery. */ 2039 * anyway on the next recovery. */
2040mem_insert:
2040 if (!err) 2041 if (!err)
2041 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2042 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2042 2043
@@ -2096,7 +2097,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2096 if (err) 2097 if (err)
2097 goto out_brelse; 2098 goto out_brelse;
2098 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2099 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2099 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); 2100 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
2100 } else { 2101 } else {
2101 struct ext4_iloc iloc2; 2102 struct ext4_iloc iloc2;
2102 struct inode *i_prev = 2103 struct inode *i_prev =
@@ -2163,7 +2164,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2163 if (retval) 2164 if (retval)
2164 goto end_rmdir; 2165 goto end_rmdir;
2165 if (!EXT4_DIR_LINK_EMPTY(inode)) 2166 if (!EXT4_DIR_LINK_EMPTY(inode))
2166 ext4_warning(inode->i_sb, "ext4_rmdir", 2167 ext4_warning(inode->i_sb,
2167 "empty directory has too many links (%d)", 2168 "empty directory has too many links (%d)",
2168 inode->i_nlink); 2169 inode->i_nlink);
2169 inode->i_version++; 2170 inode->i_version++;
@@ -2215,7 +2216,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2215 goto end_unlink; 2216 goto end_unlink;
2216 2217
2217 if (!inode->i_nlink) { 2218 if (!inode->i_nlink) {
2218 ext4_warning(inode->i_sb, "ext4_unlink", 2219 ext4_warning(inode->i_sb,
2219 "Deleting nonexistent file (%lu), %d", 2220 "Deleting nonexistent file (%lu), %d",
2220 inode->i_ino, inode->i_nlink); 2221 inode->i_ino, inode->i_nlink);
2221 inode->i_nlink = 1; 2222 inode->i_nlink = 1;
@@ -2462,7 +2463,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2462 } 2463 }
2463 } 2464 }
2464 if (retval) { 2465 if (retval) {
2465 ext4_warning(old_dir->i_sb, "ext4_rename", 2466 ext4_warning(old_dir->i_sb,
2466 "Deleting old file (%lu), %d, error=%d", 2467 "Deleting old file (%lu), %d, error=%d",
2467 old_dir->i_ino, old_dir->i_nlink, retval); 2468 old_dir->i_ino, old_dir->i_nlink, retval);
2468 } 2469 }
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3b2c5541d8a6..5692c48754a0 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
48 48
49 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 49 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
50 if (group != sbi->s_groups_count) 50 if (group != sbi->s_groups_count)
51 ext4_warning(sb, __func__, 51 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
52 "Cannot add at group %u (only %u groups)",
53 input->group, sbi->s_groups_count); 52 input->group, sbi->s_groups_count);
54 else if (offset != 0) 53 else if (offset != 0)
55 ext4_warning(sb, __func__, "Last group not full"); 54 ext4_warning(sb, "Last group not full");
56 else if (input->reserved_blocks > input->blocks_count / 5) 55 else if (input->reserved_blocks > input->blocks_count / 5)
57 ext4_warning(sb, __func__, "Reserved blocks too high (%u)", 56 ext4_warning(sb, "Reserved blocks too high (%u)",
58 input->reserved_blocks); 57 input->reserved_blocks);
59 else if (free_blocks_count < 0) 58 else if (free_blocks_count < 0)
60 ext4_warning(sb, __func__, "Bad blocks count %u", 59 ext4_warning(sb, "Bad blocks count %u",
61 input->blocks_count); 60 input->blocks_count);
62 else if (!(bh = sb_bread(sb, end - 1))) 61 else if (!(bh = sb_bread(sb, end - 1)))
63 ext4_warning(sb, __func__, 62 ext4_warning(sb, "Cannot read last block (%llu)",
64 "Cannot read last block (%llu)",
65 end - 1); 63 end - 1);
66 else if (outside(input->block_bitmap, start, end)) 64 else if (outside(input->block_bitmap, start, end))
67 ext4_warning(sb, __func__, 65 ext4_warning(sb, "Block bitmap not in group (block %llu)",
68 "Block bitmap not in group (block %llu)",
69 (unsigned long long)input->block_bitmap); 66 (unsigned long long)input->block_bitmap);
70 else if (outside(input->inode_bitmap, start, end)) 67 else if (outside(input->inode_bitmap, start, end))
71 ext4_warning(sb, __func__, 68 ext4_warning(sb, "Inode bitmap not in group (block %llu)",
72 "Inode bitmap not in group (block %llu)",
73 (unsigned long long)input->inode_bitmap); 69 (unsigned long long)input->inode_bitmap);
74 else if (outside(input->inode_table, start, end) || 70 else if (outside(input->inode_table, start, end) ||
75 outside(itend - 1, start, end)) 71 outside(itend - 1, start, end))
76 ext4_warning(sb, __func__, 72 ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
77 "Inode table not in group (blocks %llu-%llu)",
78 (unsigned long long)input->inode_table, itend - 1); 73 (unsigned long long)input->inode_table, itend - 1);
79 else if (input->inode_bitmap == input->block_bitmap) 74 else if (input->inode_bitmap == input->block_bitmap)
80 ext4_warning(sb, __func__, 75 ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
81 "Block bitmap same as inode bitmap (%llu)",
82 (unsigned long long)input->block_bitmap); 76 (unsigned long long)input->block_bitmap);
83 else if (inside(input->block_bitmap, input->inode_table, itend)) 77 else if (inside(input->block_bitmap, input->inode_table, itend))
84 ext4_warning(sb, __func__, 78 ext4_warning(sb, "Block bitmap (%llu) in inode table "
85 "Block bitmap (%llu) in inode table (%llu-%llu)", 79 "(%llu-%llu)",
86 (unsigned long long)input->block_bitmap, 80 (unsigned long long)input->block_bitmap,
87 (unsigned long long)input->inode_table, itend - 1); 81 (unsigned long long)input->inode_table, itend - 1);
88 else if (inside(input->inode_bitmap, input->inode_table, itend)) 82 else if (inside(input->inode_bitmap, input->inode_table, itend))
89 ext4_warning(sb, __func__, 83 ext4_warning(sb, "Inode bitmap (%llu) in inode table "
90 "Inode bitmap (%llu) in inode table (%llu-%llu)", 84 "(%llu-%llu)",
91 (unsigned long long)input->inode_bitmap, 85 (unsigned long long)input->inode_bitmap,
92 (unsigned long long)input->inode_table, itend - 1); 86 (unsigned long long)input->inode_table, itend - 1);
93 else if (inside(input->block_bitmap, start, metaend)) 87 else if (inside(input->block_bitmap, start, metaend))
94 ext4_warning(sb, __func__, 88 ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
95 "Block bitmap (%llu) in GDT table"
96 " (%llu-%llu)",
97 (unsigned long long)input->block_bitmap, 89 (unsigned long long)input->block_bitmap,
98 start, metaend - 1); 90 start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend)) 91 else if (inside(input->inode_bitmap, start, metaend))
100 ext4_warning(sb, __func__, 92 ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
101 "Inode bitmap (%llu) in GDT table"
102 " (%llu-%llu)",
103 (unsigned long long)input->inode_bitmap, 93 (unsigned long long)input->inode_bitmap,
104 start, metaend - 1); 94 start, metaend - 1);
105 else if (inside(input->inode_table, start, metaend) || 95 else if (inside(input->inode_table, start, metaend) ||
106 inside(itend - 1, start, metaend)) 96 inside(itend - 1, start, metaend))
107 ext4_warning(sb, __func__, 97 ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
108 "Inode table (%llu-%llu) overlaps" 98 "(%llu-%llu)",
109 "GDT table (%llu-%llu)",
110 (unsigned long long)input->inode_table, 99 (unsigned long long)input->inode_table,
111 itend - 1, start, metaend - 1); 100 itend - 1, start, metaend - 1);
112 else 101 else
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
364 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { 353 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
365 if (le32_to_cpu(*p++) != 354 if (le32_to_cpu(*p++) !=
366 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ 355 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
367 ext4_warning(sb, __func__, 356 ext4_warning(sb, "reserved GDT %llu"
368 "reserved GDT %llu"
369 " missing grp %d (%llu)", 357 " missing grp %d (%llu)",
370 blk, grp, 358 blk, grp,
371 grp * 359 grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
420 */ 408 */
421 if (EXT4_SB(sb)->s_sbh->b_blocknr != 409 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
422 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { 410 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
423 ext4_warning(sb, __func__, 411 ext4_warning(sb, "won't resize using backup superblock at %llu",
424 "won't resize using backup superblock at %llu",
425 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); 412 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
426 return -EPERM; 413 return -EPERM;
427 } 414 }
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
444 431
445 data = (__le32 *)dind->b_data; 432 data = (__le32 *)dind->b_data;
446 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { 433 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
447 ext4_warning(sb, __func__, 434 ext4_warning(sb, "new group %u GDT block %llu not reserved",
448 "new group %u GDT block %llu not reserved",
449 input->group, gdblock); 435 input->group, gdblock);
450 err = -EINVAL; 436 err = -EINVAL;
451 goto exit_dind; 437 goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
468 GFP_NOFS); 454 GFP_NOFS);
469 if (!n_group_desc) { 455 if (!n_group_desc) {
470 err = -ENOMEM; 456 err = -ENOMEM;
471 ext4_warning(sb, __func__, 457 ext4_warning(sb,
472 "not enough memory for %lu groups", gdb_num + 1); 458 "not enough memory for %lu groups", gdb_num + 1);
473 goto exit_inode; 459 goto exit_inode;
474 } 460 }
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
567 /* Get each reserved primary GDT block and verify it holds backups */ 553 /* Get each reserved primary GDT block and verify it holds backups */
568 for (res = 0; res < reserved_gdb; res++, blk++) { 554 for (res = 0; res < reserved_gdb; res++, blk++) {
569 if (le32_to_cpu(*data) != blk) { 555 if (le32_to_cpu(*data) != blk) {
570 ext4_warning(sb, __func__, 556 ext4_warning(sb, "reserved block %llu"
571 "reserved block %llu"
572 " not at offset %ld", 557 " not at offset %ld",
573 blk, 558 blk,
574 (long)(data - (__le32 *)dind->b_data)); 559 (long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
713 */ 698 */
714exit_err: 699exit_err:
715 if (err) { 700 if (err) {
716 ext4_warning(sb, __func__, 701 ext4_warning(sb, "can't update backup for group %u (err %d), "
717 "can't update backup for group %u (err %d), "
718 "forcing fsck on next reboot", group, err); 702 "forcing fsck on next reboot", group, err);
719 sbi->s_mount_state &= ~EXT4_VALID_FS; 703 sbi->s_mount_state &= ~EXT4_VALID_FS;
720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 704 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
753 737
754 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 738 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
755 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { 739 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
756 ext4_warning(sb, __func__, 740 ext4_warning(sb, "Can't resize non-sparse filesystem further");
757 "Can't resize non-sparse filesystem further");
758 return -EPERM; 741 return -EPERM;
759 } 742 }
760 743
761 if (ext4_blocks_count(es) + input->blocks_count < 744 if (ext4_blocks_count(es) + input->blocks_count <
762 ext4_blocks_count(es)) { 745 ext4_blocks_count(es)) {
763 ext4_warning(sb, __func__, "blocks_count overflow"); 746 ext4_warning(sb, "blocks_count overflow");
764 return -EINVAL; 747 return -EINVAL;
765 } 748 }
766 749
767 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 750 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
768 le32_to_cpu(es->s_inodes_count)) { 751 le32_to_cpu(es->s_inodes_count)) {
769 ext4_warning(sb, __func__, "inodes_count overflow"); 752 ext4_warning(sb, "inodes_count overflow");
770 return -EINVAL; 753 return -EINVAL;
771 } 754 }
772 755
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
774 if (!EXT4_HAS_COMPAT_FEATURE(sb, 757 if (!EXT4_HAS_COMPAT_FEATURE(sb,
775 EXT4_FEATURE_COMPAT_RESIZE_INODE) 758 EXT4_FEATURE_COMPAT_RESIZE_INODE)
776 || !le16_to_cpu(es->s_reserved_gdt_blocks)) { 759 || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
777 ext4_warning(sb, __func__, 760 ext4_warning(sb,
778 "No reserved GDT blocks, can't resize"); 761 "No reserved GDT blocks, can't resize");
779 return -EPERM; 762 return -EPERM;
780 } 763 }
781 inode = ext4_iget(sb, EXT4_RESIZE_INO); 764 inode = ext4_iget(sb, EXT4_RESIZE_INO);
782 if (IS_ERR(inode)) { 765 if (IS_ERR(inode)) {
783 ext4_warning(sb, __func__, 766 ext4_warning(sb, "Error opening resize inode");
784 "Error opening resize inode");
785 return PTR_ERR(inode); 767 return PTR_ERR(inode);
786 } 768 }
787 } 769 }
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
810 792
811 mutex_lock(&sbi->s_resize_lock); 793 mutex_lock(&sbi->s_resize_lock);
812 if (input->group != sbi->s_groups_count) { 794 if (input->group != sbi->s_groups_count) {
813 ext4_warning(sb, __func__, 795 ext4_warning(sb, "multiple resizers run on filesystem!");
814 "multiple resizers run on filesystem!");
815 err = -EBUSY; 796 err = -EBUSY;
816 goto exit_journal; 797 goto exit_journal;
817 } 798 }
@@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 978 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 979 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 980 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); 981 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1001 return -EINVAL; 982 return -EINVAL;
1002 } 983 }
1003 984
1004 if (n_blocks_count < o_blocks_count) { 985 if (n_blocks_count < o_blocks_count) {
1005 ext4_warning(sb, __func__, 986 ext4_warning(sb, "can't shrink FS - resize aborted");
1006 "can't shrink FS - resize aborted");
1007 return -EBUSY; 987 return -EBUSY;
1008 } 988 }
1009 989
@@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1011 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); 991 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1012 992
1013 if (last == 0) { 993 if (last == 0) {
1014 ext4_warning(sb, __func__, 994 ext4_warning(sb, "need to use ext2online to resize further");
1015 "need to use ext2online to resize further");
1016 return -EPERM; 995 return -EPERM;
1017 } 996 }
1018 997
1019 add = EXT4_BLOCKS_PER_GROUP(sb) - last; 998 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
1020 999
1021 if (o_blocks_count + add < o_blocks_count) { 1000 if (o_blocks_count + add < o_blocks_count) {
1022 ext4_warning(sb, __func__, "blocks_count overflow"); 1001 ext4_warning(sb, "blocks_count overflow");
1023 return -EINVAL; 1002 return -EINVAL;
1024 } 1003 }
1025 1004
@@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1027 add = n_blocks_count - o_blocks_count; 1006 add = n_blocks_count - o_blocks_count;
1028 1007
1029 if (o_blocks_count + add < n_blocks_count) 1008 if (o_blocks_count + add < n_blocks_count)
1030 ext4_warning(sb, __func__, 1009 ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
1031 "will only finish group (%llu"
1032 " blocks, %u new)",
1033 o_blocks_count + add, add); 1010 o_blocks_count + add, add);
1034 1011
1035 /* See if the device is actually as big as what was requested */ 1012 /* See if the device is actually as big as what was requested */
1036 bh = sb_bread(sb, o_blocks_count + add - 1); 1013 bh = sb_bread(sb, o_blocks_count + add - 1);
1037 if (!bh) { 1014 if (!bh) {
1038 ext4_warning(sb, __func__, 1015 ext4_warning(sb, "can't read last block, resize aborted");
1039 "can't read last block, resize aborted");
1040 return -ENOSPC; 1016 return -ENOSPC;
1041 } 1017 }
1042 brelse(bh); 1018 brelse(bh);
@@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1047 handle = ext4_journal_start_sb(sb, 3); 1023 handle = ext4_journal_start_sb(sb, 3);
1048 if (IS_ERR(handle)) { 1024 if (IS_ERR(handle)) {
1049 err = PTR_ERR(handle); 1025 err = PTR_ERR(handle);
1050 ext4_warning(sb, __func__, "error %d on journal start", err); 1026 ext4_warning(sb, "error %d on journal start", err);
1051 goto exit_put; 1027 goto exit_put;
1052 } 1028 }
1053 1029
1054 mutex_lock(&EXT4_SB(sb)->s_resize_lock); 1030 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1055 if (o_blocks_count != ext4_blocks_count(es)) { 1031 if (o_blocks_count != ext4_blocks_count(es)) {
1056 ext4_warning(sb, __func__, 1032 ext4_warning(sb, "multiple resizers run on filesystem!");
1057 "multiple resizers run on filesystem!");
1058 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1033 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1059 ext4_journal_stop(handle); 1034 ext4_journal_stop(handle);
1060 err = -EBUSY; 1035 err = -EBUSY;
@@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1063 1038
1064 if ((err = ext4_journal_get_write_access(handle, 1039 if ((err = ext4_journal_get_write_access(handle,
1065 EXT4_SB(sb)->s_sbh))) { 1040 EXT4_SB(sb)->s_sbh))) {
1066 ext4_warning(sb, __func__, 1041 ext4_warning(sb, "error %d on journal write access", err);
1067 "error %d on journal write access", err);
1068 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1042 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1069 ext4_journal_stop(handle); 1043 ext4_journal_stop(handle);
1070 goto exit_put; 1044 goto exit_put;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 735c20d5fd56..ad1ee5f21bab 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -333,7 +333,7 @@ static void ext4_handle_error(struct super_block *sb)
333 sb->s_id); 333 sb->s_id);
334} 334}
335 335
336void ext4_error(struct super_block *sb, const char *function, 336void __ext4_error(struct super_block *sb, const char *function,
337 const char *fmt, ...) 337 const char *fmt, ...)
338{ 338{
339 va_list args; 339 va_list args;
@@ -347,6 +347,42 @@ void ext4_error(struct super_block *sb, const char *function,
347 ext4_handle_error(sb); 347 ext4_handle_error(sb);
348} 348}
349 349
350void ext4_error_inode(const char *function, struct inode *inode,
351 const char *fmt, ...)
352{
353 va_list args;
354
355 va_start(args, fmt);
356 printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
357 inode->i_sb->s_id, function, inode->i_ino, current->comm);
358 vprintk(fmt, args);
359 printk("\n");
360 va_end(args);
361
362 ext4_handle_error(inode->i_sb);
363}
364
365void ext4_error_file(const char *function, struct file *file,
366 const char *fmt, ...)
367{
368 va_list args;
369 struct inode *inode = file->f_dentry->d_inode;
370 char pathname[80], *path;
371
372 va_start(args, fmt);
373 path = d_path(&(file->f_path), pathname, sizeof(pathname));
374 if (!path)
375 path = "(unknown)";
376 printk(KERN_CRIT
377 "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
378 inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
379 vprintk(fmt, args);
380 printk("\n");
381 va_end(args);
382
383 ext4_handle_error(inode->i_sb);
384}
385
350static const char *ext4_decode_error(struct super_block *sb, int errno, 386static const char *ext4_decode_error(struct super_block *sb, int errno,
351 char nbuf[16]) 387 char nbuf[16])
352{ 388{
@@ -450,7 +486,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
450 va_end(args); 486 va_end(args);
451} 487}
452 488
453void ext4_warning(struct super_block *sb, const char *function, 489void __ext4_warning(struct super_block *sb, const char *function,
454 const char *fmt, ...) 490 const char *fmt, ...)
455{ 491{
456 va_list args; 492 va_list args;
@@ -507,7 +543,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
507 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) 543 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
508 return; 544 return;
509 545
510 ext4_warning(sb, __func__, 546 ext4_warning(sb,
511 "updating to rev %d because of new feature flag, " 547 "updating to rev %d because of new feature flag, "
512 "running e2fsck is recommended", 548 "running e2fsck is recommended",
513 EXT4_DYNAMIC_REV); 549 EXT4_DYNAMIC_REV);
@@ -708,7 +744,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
708#ifdef CONFIG_QUOTA 744#ifdef CONFIG_QUOTA
709 ei->i_reserved_quota = 0; 745 ei->i_reserved_quota = 0;
710#endif 746#endif
711 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); 747 INIT_LIST_HEAD(&ei->i_completed_io_list);
748 spin_lock_init(&ei->i_completed_io_lock);
712 ei->cur_aio_dio = NULL; 749 ei->cur_aio_dio = NULL;
713 ei->i_sync_tid = 0; 750 ei->i_sync_tid = 0;
714 ei->i_datasync_tid = 0; 751 ei->i_datasync_tid = 0;
@@ -796,10 +833,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
796 if (sbi->s_qf_names[GRPQUOTA]) 833 if (sbi->s_qf_names[GRPQUOTA])
797 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 834 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
798 835
799 if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) 836 if (test_opt(sb, USRQUOTA))
800 seq_puts(seq, ",usrquota"); 837 seq_puts(seq, ",usrquota");
801 838
802 if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) 839 if (test_opt(sb, GRPQUOTA))
803 seq_puts(seq, ",grpquota"); 840 seq_puts(seq, ",grpquota");
804#endif 841#endif
805} 842}
@@ -926,6 +963,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
926 if (test_opt(sb, NOLOAD)) 963 if (test_opt(sb, NOLOAD))
927 seq_puts(seq, ",norecovery"); 964 seq_puts(seq, ",norecovery");
928 965
966 if (test_opt(sb, DIOREAD_NOLOCK))
967 seq_puts(seq, ",dioread_nolock");
968
929 ext4_show_quota_options(seq, sb); 969 ext4_show_quota_options(seq, sb);
930 970
931 return 0; 971 return 0;
@@ -1109,6 +1149,7 @@ enum {
1109 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1149 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1110 Opt_block_validity, Opt_noblock_validity, 1150 Opt_block_validity, Opt_noblock_validity,
1111 Opt_inode_readahead_blks, Opt_journal_ioprio, 1151 Opt_inode_readahead_blks, Opt_journal_ioprio,
1152 Opt_dioread_nolock, Opt_dioread_lock,
1112 Opt_discard, Opt_nodiscard, 1153 Opt_discard, Opt_nodiscard,
1113}; 1154};
1114 1155
@@ -1176,6 +1217,8 @@ static const match_table_t tokens = {
1176 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1217 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1177 {Opt_auto_da_alloc, "auto_da_alloc"}, 1218 {Opt_auto_da_alloc, "auto_da_alloc"},
1178 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1219 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1220 {Opt_dioread_nolock, "dioread_nolock"},
1221 {Opt_dioread_lock, "dioread_lock"},
1179 {Opt_discard, "discard"}, 1222 {Opt_discard, "discard"},
1180 {Opt_nodiscard, "nodiscard"}, 1223 {Opt_nodiscard, "nodiscard"},
1181 {Opt_err, NULL}, 1224 {Opt_err, NULL},
@@ -1205,6 +1248,66 @@ static ext4_fsblk_t get_sb_block(void **data)
1205} 1248}
1206 1249
1207#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) 1250#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1251static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
1252 "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1253
1254#ifdef CONFIG_QUOTA
1255static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1256{
1257 struct ext4_sb_info *sbi = EXT4_SB(sb);
1258 char *qname;
1259
1260 if (sb_any_quota_loaded(sb) &&
1261 !sbi->s_qf_names[qtype]) {
1262 ext4_msg(sb, KERN_ERR,
1263 "Cannot change journaled "
1264 "quota options when quota turned on");
1265 return 0;
1266 }
1267 qname = match_strdup(args);
1268 if (!qname) {
1269 ext4_msg(sb, KERN_ERR,
1270 "Not enough memory for storing quotafile name");
1271 return 0;
1272 }
1273 if (sbi->s_qf_names[qtype] &&
1274 strcmp(sbi->s_qf_names[qtype], qname)) {
1275 ext4_msg(sb, KERN_ERR,
1276 "%s quota file already specified", QTYPE2NAME(qtype));
1277 kfree(qname);
1278 return 0;
1279 }
1280 sbi->s_qf_names[qtype] = qname;
1281 if (strchr(sbi->s_qf_names[qtype], '/')) {
1282 ext4_msg(sb, KERN_ERR,
1283 "quotafile must be on filesystem root");
1284 kfree(sbi->s_qf_names[qtype]);
1285 sbi->s_qf_names[qtype] = NULL;
1286 return 0;
1287 }
1288 set_opt(sbi->s_mount_opt, QUOTA);
1289 return 1;
1290}
1291
1292static int clear_qf_name(struct super_block *sb, int qtype)
1293{
1294
1295 struct ext4_sb_info *sbi = EXT4_SB(sb);
1296
1297 if (sb_any_quota_loaded(sb) &&
1298 sbi->s_qf_names[qtype]) {
1299 ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1300 " when quota turned on");
1301 return 0;
1302 }
1303 /*
1304 * The space will be released later when all options are confirmed
1305 * to be correct
1306 */
1307 sbi->s_qf_names[qtype] = NULL;
1308 return 1;
1309}
1310#endif
1208 1311
1209static int parse_options(char *options, struct super_block *sb, 1312static int parse_options(char *options, struct super_block *sb,
1210 unsigned long *journal_devnum, 1313 unsigned long *journal_devnum,
@@ -1217,8 +1320,7 @@ static int parse_options(char *options, struct super_block *sb,
1217 int data_opt = 0; 1320 int data_opt = 0;
1218 int option; 1321 int option;
1219#ifdef CONFIG_QUOTA 1322#ifdef CONFIG_QUOTA
1220 int qtype, qfmt; 1323 int qfmt;
1221 char *qname;
1222#endif 1324#endif
1223 1325
1224 if (!options) 1326 if (!options)
@@ -1229,19 +1331,31 @@ static int parse_options(char *options, struct super_block *sb,
1229 if (!*p) 1331 if (!*p)
1230 continue; 1332 continue;
1231 1333
1334 /*
1335 * Initialize args struct so we know whether arg was
1336 * found; some options take optional arguments.
1337 */
1338 args[0].to = args[0].from = 0;
1232 token = match_token(p, tokens, args); 1339 token = match_token(p, tokens, args);
1233 switch (token) { 1340 switch (token) {
1234 case Opt_bsd_df: 1341 case Opt_bsd_df:
1342 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1235 clear_opt(sbi->s_mount_opt, MINIX_DF); 1343 clear_opt(sbi->s_mount_opt, MINIX_DF);
1236 break; 1344 break;
1237 case Opt_minix_df: 1345 case Opt_minix_df:
1346 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1238 set_opt(sbi->s_mount_opt, MINIX_DF); 1347 set_opt(sbi->s_mount_opt, MINIX_DF);
1348
1239 break; 1349 break;
1240 case Opt_grpid: 1350 case Opt_grpid:
1351 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1241 set_opt(sbi->s_mount_opt, GRPID); 1352 set_opt(sbi->s_mount_opt, GRPID);
1353
1242 break; 1354 break;
1243 case Opt_nogrpid: 1355 case Opt_nogrpid:
1356 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1244 clear_opt(sbi->s_mount_opt, GRPID); 1357 clear_opt(sbi->s_mount_opt, GRPID);
1358
1245 break; 1359 break;
1246 case Opt_resuid: 1360 case Opt_resuid:
1247 if (match_int(&args[0], &option)) 1361 if (match_int(&args[0], &option))
@@ -1378,14 +1492,13 @@ static int parse_options(char *options, struct super_block *sb,
1378 data_opt = EXT4_MOUNT_WRITEBACK_DATA; 1492 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1379 datacheck: 1493 datacheck:
1380 if (is_remount) { 1494 if (is_remount) {
1381 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) 1495 if (test_opt(sb, DATA_FLAGS) != data_opt) {
1382 != data_opt) {
1383 ext4_msg(sb, KERN_ERR, 1496 ext4_msg(sb, KERN_ERR,
1384 "Cannot change data mode on remount"); 1497 "Cannot change data mode on remount");
1385 return 0; 1498 return 0;
1386 } 1499 }
1387 } else { 1500 } else {
1388 sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; 1501 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1389 sbi->s_mount_opt |= data_opt; 1502 sbi->s_mount_opt |= data_opt;
1390 } 1503 }
1391 break; 1504 break;
@@ -1397,63 +1510,22 @@ static int parse_options(char *options, struct super_block *sb,
1397 break; 1510 break;
1398#ifdef CONFIG_QUOTA 1511#ifdef CONFIG_QUOTA
1399 case Opt_usrjquota: 1512 case Opt_usrjquota:
1400 qtype = USRQUOTA; 1513 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1401 goto set_qf_name;
1402 case Opt_grpjquota:
1403 qtype = GRPQUOTA;
1404set_qf_name:
1405 if (sb_any_quota_loaded(sb) &&
1406 !sbi->s_qf_names[qtype]) {
1407 ext4_msg(sb, KERN_ERR,
1408 "Cannot change journaled "
1409 "quota options when quota turned on");
1410 return 0; 1514 return 0;
1411 } 1515 break;
1412 qname = match_strdup(&args[0]); 1516 case Opt_grpjquota:
1413 if (!qname) { 1517 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1414 ext4_msg(sb, KERN_ERR,
1415 "Not enough memory for "
1416 "storing quotafile name");
1417 return 0;
1418 }
1419 if (sbi->s_qf_names[qtype] &&
1420 strcmp(sbi->s_qf_names[qtype], qname)) {
1421 ext4_msg(sb, KERN_ERR,
1422 "%s quota file already "
1423 "specified", QTYPE2NAME(qtype));
1424 kfree(qname);
1425 return 0;
1426 }
1427 sbi->s_qf_names[qtype] = qname;
1428 if (strchr(sbi->s_qf_names[qtype], '/')) {
1429 ext4_msg(sb, KERN_ERR,
1430 "quotafile must be on "
1431 "filesystem root");
1432 kfree(sbi->s_qf_names[qtype]);
1433 sbi->s_qf_names[qtype] = NULL;
1434 return 0; 1518 return 0;
1435 }
1436 set_opt(sbi->s_mount_opt, QUOTA);
1437 break; 1519 break;
1438 case Opt_offusrjquota: 1520 case Opt_offusrjquota:
1439 qtype = USRQUOTA; 1521 if (!clear_qf_name(sb, USRQUOTA))
1440 goto clear_qf_name; 1522 return 0;
1523 break;
1441 case Opt_offgrpjquota: 1524 case Opt_offgrpjquota:
1442 qtype = GRPQUOTA; 1525 if (!clear_qf_name(sb, GRPQUOTA))
1443clear_qf_name:
1444 if (sb_any_quota_loaded(sb) &&
1445 sbi->s_qf_names[qtype]) {
1446 ext4_msg(sb, KERN_ERR, "Cannot change "
1447 "journaled quota options when "
1448 "quota turned on");
1449 return 0; 1526 return 0;
1450 }
1451 /*
1452 * The space will be released later when all options
1453 * are confirmed to be correct
1454 */
1455 sbi->s_qf_names[qtype] = NULL;
1456 break; 1527 break;
1528
1457 case Opt_jqfmt_vfsold: 1529 case Opt_jqfmt_vfsold:
1458 qfmt = QFMT_VFS_OLD; 1530 qfmt = QFMT_VFS_OLD;
1459 goto set_qf_format; 1531 goto set_qf_format;
@@ -1518,10 +1590,11 @@ set_qf_format:
1518 clear_opt(sbi->s_mount_opt, BARRIER); 1590 clear_opt(sbi->s_mount_opt, BARRIER);
1519 break; 1591 break;
1520 case Opt_barrier: 1592 case Opt_barrier:
1521 if (match_int(&args[0], &option)) { 1593 if (args[0].from) {
1522 set_opt(sbi->s_mount_opt, BARRIER); 1594 if (match_int(&args[0], &option))
1523 break; 1595 return 0;
1524 } 1596 } else
1597 option = 1; /* No argument, default to 1 */
1525 if (option) 1598 if (option)
1526 set_opt(sbi->s_mount_opt, BARRIER); 1599 set_opt(sbi->s_mount_opt, BARRIER);
1527 else 1600 else
@@ -1594,10 +1667,11 @@ set_qf_format:
1594 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1667 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
1595 break; 1668 break;
1596 case Opt_auto_da_alloc: 1669 case Opt_auto_da_alloc:
1597 if (match_int(&args[0], &option)) { 1670 if (args[0].from) {
1598 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1671 if (match_int(&args[0], &option))
1599 break; 1672 return 0;
1600 } 1673 } else
1674 option = 1; /* No argument, default to 1 */
1601 if (option) 1675 if (option)
1602 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1676 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1603 else 1677 else
@@ -1609,6 +1683,12 @@ set_qf_format:
1609 case Opt_nodiscard: 1683 case Opt_nodiscard:
1610 clear_opt(sbi->s_mount_opt, DISCARD); 1684 clear_opt(sbi->s_mount_opt, DISCARD);
1611 break; 1685 break;
1686 case Opt_dioread_nolock:
1687 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1688 break;
1689 case Opt_dioread_lock:
1690 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1691 break;
1612 default: 1692 default:
1613 ext4_msg(sb, KERN_ERR, 1693 ext4_msg(sb, KERN_ERR,
1614 "Unrecognized mount option \"%s\" " 1694 "Unrecognized mount option \"%s\" "
@@ -1618,18 +1698,13 @@ set_qf_format:
1618 } 1698 }
1619#ifdef CONFIG_QUOTA 1699#ifdef CONFIG_QUOTA
1620 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1700 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1621 if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && 1701 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1622 sbi->s_qf_names[USRQUOTA])
1623 clear_opt(sbi->s_mount_opt, USRQUOTA); 1702 clear_opt(sbi->s_mount_opt, USRQUOTA);
1624 1703
1625 if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && 1704 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1626 sbi->s_qf_names[GRPQUOTA])
1627 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1705 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1628 1706
1629 if ((sbi->s_qf_names[USRQUOTA] && 1707 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1630 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1631 (sbi->s_qf_names[GRPQUOTA] &&
1632 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1633 ext4_msg(sb, KERN_ERR, "old and new quota " 1708 ext4_msg(sb, KERN_ERR, "old and new quota "
1634 "format mixing"); 1709 "format mixing");
1635 return 0; 1710 return 0;
@@ -2432,8 +2507,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2432 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2507 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
2433 if (def_mount_opts & EXT4_DEFM_DEBUG) 2508 if (def_mount_opts & EXT4_DEFM_DEBUG)
2434 set_opt(sbi->s_mount_opt, DEBUG); 2509 set_opt(sbi->s_mount_opt, DEBUG);
2435 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) 2510 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
2511 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
2512 "2.6.38");
2436 set_opt(sbi->s_mount_opt, GRPID); 2513 set_opt(sbi->s_mount_opt, GRPID);
2514 }
2437 if (def_mount_opts & EXT4_DEFM_UID16) 2515 if (def_mount_opts & EXT4_DEFM_UID16)
2438 set_opt(sbi->s_mount_opt, NO_UID32); 2516 set_opt(sbi->s_mount_opt, NO_UID32);
2439#ifdef CONFIG_EXT4_FS_XATTR 2517#ifdef CONFIG_EXT4_FS_XATTR
@@ -2445,11 +2523,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2445 set_opt(sbi->s_mount_opt, POSIX_ACL); 2523 set_opt(sbi->s_mount_opt, POSIX_ACL);
2446#endif 2524#endif
2447 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 2525 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2448 sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; 2526 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
2449 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 2527 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2450 sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; 2528 set_opt(sbi->s_mount_opt, ORDERED_DATA);
2451 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 2529 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2452 sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; 2530 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2453 2531
2454 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 2532 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2455 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 2533 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2477,7 +2555,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2477 goto failed_mount; 2555 goto failed_mount;
2478 2556
2479 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2557 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2480 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 2558 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
2481 2559
2482 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && 2560 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
2483 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || 2561 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2766,7 +2844,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2766 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2844 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2767 ext4_msg(sb, KERN_ERR, "required journal recovery " 2845 ext4_msg(sb, KERN_ERR, "required journal recovery "
2768 "suppressed and not mounted read-only"); 2846 "suppressed and not mounted read-only");
2769 goto failed_mount4; 2847 goto failed_mount_wq;
2770 } else { 2848 } else {
2771 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 2849 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2772 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 2850 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2779,7 +2857,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2779 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2857 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2780 JBD2_FEATURE_INCOMPAT_64BIT)) { 2858 JBD2_FEATURE_INCOMPAT_64BIT)) {
2781 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); 2859 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
2782 goto failed_mount4; 2860 goto failed_mount_wq;
2783 } 2861 }
2784 2862
2785 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2863 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2818,7 +2896,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2818 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 2896 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2819 ext4_msg(sb, KERN_ERR, "Journal does not support " 2897 ext4_msg(sb, KERN_ERR, "Journal does not support "
2820 "requested data journaling mode"); 2898 "requested data journaling mode");
2821 goto failed_mount4; 2899 goto failed_mount_wq;
2822 } 2900 }
2823 default: 2901 default:
2824 break; 2902 break;
@@ -2826,13 +2904,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2826 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2904 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2827 2905
2828no_journal: 2906no_journal:
2829
2830 if (test_opt(sb, NOBH)) { 2907 if (test_opt(sb, NOBH)) {
2831 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2908 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2832 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2909 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2833 "its supported only with writeback mode"); 2910 "its supported only with writeback mode");
2834 clear_opt(sbi->s_mount_opt, NOBH); 2911 clear_opt(sbi->s_mount_opt, NOBH);
2835 } 2912 }
2913 if (test_opt(sb, DIOREAD_NOLOCK)) {
2914 ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
2915 "not supported with nobh mode");
2916 goto failed_mount_wq;
2917 }
2836 } 2918 }
2837 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 2919 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2838 if (!EXT4_SB(sb)->dio_unwritten_wq) { 2920 if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2897,6 +2979,18 @@ no_journal:
2897 "requested data journaling mode"); 2979 "requested data journaling mode");
2898 clear_opt(sbi->s_mount_opt, DELALLOC); 2980 clear_opt(sbi->s_mount_opt, DELALLOC);
2899 } 2981 }
2982 if (test_opt(sb, DIOREAD_NOLOCK)) {
2983 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2984 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2985 "option - requested data journaling mode");
2986 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2987 }
2988 if (sb->s_blocksize < PAGE_SIZE) {
2989 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2990 "option - block size is too small");
2991 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2992 }
2993 }
2900 2994
2901 err = ext4_setup_system_zone(sb); 2995 err = ext4_setup_system_zone(sb);
2902 if (err) { 2996 if (err) {
@@ -3360,10 +3454,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
3360 char nbuf[16]; 3454 char nbuf[16];
3361 3455
3362 errstr = ext4_decode_error(sb, j_errno, nbuf); 3456 errstr = ext4_decode_error(sb, j_errno, nbuf);
3363 ext4_warning(sb, __func__, "Filesystem error recorded " 3457 ext4_warning(sb, "Filesystem error recorded "
3364 "from previous mount: %s", errstr); 3458 "from previous mount: %s", errstr);
3365 ext4_warning(sb, __func__, "Marking fs in need of " 3459 ext4_warning(sb, "Marking fs in need of filesystem check.");
3366 "filesystem check.");
3367 3460
3368 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 3461 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
3369 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 3462 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -3514,7 +3607,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3514 ext4_abort(sb, __func__, "Abort forced by user"); 3607 ext4_abort(sb, __func__, "Abort forced by user");
3515 3608
3516 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3609 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3517 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 3610 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3518 3611
3519 es = sbi->s_es; 3612 es = sbi->s_es;
3520 3613
@@ -3917,9 +4010,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3917 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 4010 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
3918 int err = 0; 4011 int err = 0;
3919 int offset = off & (sb->s_blocksize - 1); 4012 int offset = off & (sb->s_blocksize - 1);
3920 int tocopy;
3921 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; 4013 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
3922 size_t towrite = len;
3923 struct buffer_head *bh; 4014 struct buffer_head *bh;
3924 handle_t *handle = journal_current_handle(); 4015 handle_t *handle = journal_current_handle();
3925 4016
@@ -3929,52 +4020,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3929 (unsigned long long)off, (unsigned long long)len); 4020 (unsigned long long)off, (unsigned long long)len);
3930 return -EIO; 4021 return -EIO;
3931 } 4022 }
4023 /*
4024 * Since we account only one data block in transaction credits,
4025 * then it is impossible to cross a block boundary.
4026 */
4027 if (sb->s_blocksize - offset < len) {
4028 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
4029 " cancelled because not block aligned",
4030 (unsigned long long)off, (unsigned long long)len);
4031 return -EIO;
4032 }
4033
3932 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 4034 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
3933 while (towrite > 0) { 4035 bh = ext4_bread(handle, inode, blk, 1, &err);
3934 tocopy = sb->s_blocksize - offset < towrite ? 4036 if (!bh)
3935 sb->s_blocksize - offset : towrite; 4037 goto out;
3936 bh = ext4_bread(handle, inode, blk, 1, &err); 4038 if (journal_quota) {
3937 if (!bh) 4039 err = ext4_journal_get_write_access(handle, bh);
4040 if (err) {
4041 brelse(bh);
3938 goto out; 4042 goto out;
3939 if (journal_quota) {
3940 err = ext4_journal_get_write_access(handle, bh);
3941 if (err) {
3942 brelse(bh);
3943 goto out;
3944 }
3945 } 4043 }
3946 lock_buffer(bh);
3947 memcpy(bh->b_data+offset, data, tocopy);
3948 flush_dcache_page(bh->b_page);
3949 unlock_buffer(bh);
3950 if (journal_quota)
3951 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3952 else {
3953 /* Always do at least ordered writes for quotas */
3954 err = ext4_jbd2_file_inode(handle, inode);
3955 mark_buffer_dirty(bh);
3956 }
3957 brelse(bh);
3958 if (err)
3959 goto out;
3960 offset = 0;
3961 towrite -= tocopy;
3962 data += tocopy;
3963 blk++;
3964 } 4044 }
4045 lock_buffer(bh);
4046 memcpy(bh->b_data+offset, data, len);
4047 flush_dcache_page(bh->b_page);
4048 unlock_buffer(bh);
4049 if (journal_quota)
4050 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4051 else {
4052 /* Always do at least ordered writes for quotas */
4053 err = ext4_jbd2_file_inode(handle, inode);
4054 mark_buffer_dirty(bh);
4055 }
4056 brelse(bh);
3965out: 4057out:
3966 if (len == towrite) { 4058 if (err) {
3967 mutex_unlock(&inode->i_mutex); 4059 mutex_unlock(&inode->i_mutex);
3968 return err; 4060 return err;
3969 } 4061 }
3970 if (inode->i_size < off+len-towrite) { 4062 if (inode->i_size < off + len) {
3971 i_size_write(inode, off+len-towrite); 4063 i_size_write(inode, off + len);
3972 EXT4_I(inode)->i_disksize = inode->i_size; 4064 EXT4_I(inode)->i_disksize = inode->i_size;
3973 } 4065 }
3974 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 4066 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3975 ext4_mark_inode_dirty(handle, inode); 4067 ext4_mark_inode_dirty(handle, inode);
3976 mutex_unlock(&inode->i_mutex); 4068 mutex_unlock(&inode->i_mutex);
3977 return len - towrite; 4069 return len;
3978} 4070}
3979 4071
3980#endif 4072#endif
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f3a2f7ed45aa..efc16a4b7ceb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
227 ea_bdebug(bh, "b_count=%d, refcount=%d", 227 ea_bdebug(bh, "b_count=%d, refcount=%d",
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: ext4_error(inode->i_sb, __func__, 230bad_block:
231 ext4_error(inode->i_sb,
231 "inode %lu: bad block %llu", inode->i_ino, 232 "inode %lu: bad block %llu", inode->i_ino,
232 EXT4_I(inode)->i_file_acl); 233 EXT4_I(inode)->i_file_acl);
233 error = -EIO; 234 error = -EIO;
@@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
267 void *end; 268 void *end;
268 int error; 269 int error;
269 270
270 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 271 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
271 return -ENODATA; 272 return -ENODATA;
272 error = ext4_get_inode_loc(inode, &iloc); 273 error = ext4_get_inode_loc(inode, &iloc);
273 if (error) 274 if (error)
@@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
371 ea_bdebug(bh, "b_count=%d, refcount=%d", 372 ea_bdebug(bh, "b_count=%d, refcount=%d",
372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
373 if (ext4_xattr_check_block(bh)) { 374 if (ext4_xattr_check_block(bh)) {
374 ext4_error(inode->i_sb, __func__, 375 ext4_error(inode->i_sb,
375 "inode %lu: bad block %llu", inode->i_ino, 376 "inode %lu: bad block %llu", inode->i_ino,
376 EXT4_I(inode)->i_file_acl); 377 EXT4_I(inode)->i_file_acl);
377 error = -EIO; 378 error = -EIO;
@@ -396,7 +397,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
396 void *end; 397 void *end;
397 int error; 398 int error;
398 399
399 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 400 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
400 return 0; 401 return 0;
401 error = ext4_get_inode_loc(inode, &iloc); 402 error = ext4_get_inode_loc(inode, &iloc);
402 if (error) 403 if (error)
@@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
665 atomic_read(&(bs->bh->b_count)), 666 atomic_read(&(bs->bh->b_count)),
666 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 667 le32_to_cpu(BHDR(bs->bh)->h_refcount));
667 if (ext4_xattr_check_block(bs->bh)) { 668 if (ext4_xattr_check_block(bs->bh)) {
668 ext4_error(sb, __func__, 669 ext4_error(sb, "inode %lu: bad block %llu",
669 "inode %lu: bad block %llu", inode->i_ino, 670 inode->i_ino, EXT4_I(inode)->i_file_acl);
670 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 671 error = -EIO;
672 goto cleanup; 672 goto cleanup;
673 } 673 }
@@ -880,9 +880,8 @@ cleanup_dquot:
880 goto cleanup; 880 goto cleanup;
881 881
882bad_block: 882bad_block:
883 ext4_error(inode->i_sb, __func__, 883 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
884 "inode %lu: bad block %llu", inode->i_ino, 884 inode->i_ino, EXT4_I(inode)->i_file_acl);
885 EXT4_I(inode)->i_file_acl);
886 goto cleanup; 885 goto cleanup;
887 886
888#undef header 887#undef header
@@ -908,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
908 is->s.base = is->s.first = IFIRST(header); 907 is->s.base = is->s.first = IFIRST(header);
909 is->s.here = is->s.first; 908 is->s.here = is->s.first;
910 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 909 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
911 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 910 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
912 error = ext4_xattr_check_names(IFIRST(header), is->s.end); 911 error = ext4_xattr_check_names(IFIRST(header), is->s.end);
913 if (error) 912 if (error)
914 return error; 913 return error;
@@ -940,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
940 header = IHDR(inode, ext4_raw_inode(&is->iloc)); 939 header = IHDR(inode, ext4_raw_inode(&is->iloc));
941 if (!IS_LAST_ENTRY(s->first)) { 940 if (!IS_LAST_ENTRY(s->first)) {
942 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); 941 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
943 EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; 942 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
944 } else { 943 } else {
945 header->h_magic = cpu_to_le32(0); 944 header->h_magic = cpu_to_le32(0);
946 EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; 945 ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
947 } 946 }
948 return 0; 947 return 0;
949} 948}
@@ -986,8 +985,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
986 if (strlen(name) > 255) 985 if (strlen(name) > 255)
987 return -ERANGE; 986 return -ERANGE;
988 down_write(&EXT4_I(inode)->xattr_sem); 987 down_write(&EXT4_I(inode)->xattr_sem);
989 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; 988 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
990 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 989 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
991 990
992 error = ext4_get_inode_loc(inode, &is.iloc); 991 error = ext4_get_inode_loc(inode, &is.iloc);
993 if (error) 992 if (error)
@@ -997,10 +996,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
997 if (error) 996 if (error)
998 goto cleanup; 997 goto cleanup;
999 998
1000 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { 999 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
1001 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); 1000 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
1002 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 1001 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
1003 EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; 1002 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
1004 } 1003 }
1005 1004
1006 error = ext4_xattr_ibody_find(inode, &i, &is); 1005 error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1052,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1052 ext4_xattr_update_super_block(handle, inode->i_sb); 1051 ext4_xattr_update_super_block(handle, inode->i_sb);
1053 inode->i_ctime = ext4_current_time(inode); 1052 inode->i_ctime = ext4_current_time(inode);
1054 if (!value) 1053 if (!value)
1055 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1054 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1056 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 1055 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1057 /* 1056 /*
1058 * The bh is consumed by ext4_mark_iloc_dirty, even with 1057 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,7 +1066,7 @@ cleanup:
1067 brelse(is.iloc.bh); 1066 brelse(is.iloc.bh);
1068 brelse(bs.bh); 1067 brelse(bs.bh);
1069 if (no_expand == 0) 1068 if (no_expand == 0)
1070 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1069 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1071 up_write(&EXT4_I(inode)->xattr_sem); 1070 up_write(&EXT4_I(inode)->xattr_sem);
1072 return error; 1071 return error;
1073} 1072}
@@ -1195,9 +1194,8 @@ retry:
1195 if (!bh) 1194 if (!bh)
1196 goto cleanup; 1195 goto cleanup;
1197 if (ext4_xattr_check_block(bh)) { 1196 if (ext4_xattr_check_block(bh)) {
1198 ext4_error(inode->i_sb, __func__, 1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1199 "inode %lu: bad block %llu", inode->i_ino, 1198 inode->i_ino, EXT4_I(inode)->i_file_acl);
1200 EXT4_I(inode)->i_file_acl);
1201 error = -EIO; 1199 error = -EIO;
1202 goto cleanup; 1200 goto cleanup;
1203 } 1201 }
@@ -1302,6 +1300,8 @@ retry:
1302 1300
1303 /* Remove the chosen entry from the inode */ 1301 /* Remove the chosen entry from the inode */
1304 error = ext4_xattr_ibody_set(handle, inode, &i, is); 1302 error = ext4_xattr_ibody_set(handle, inode, &i, is);
1303 if (error)
1304 goto cleanup;
1305 1305
1306 entry = IFIRST(header); 1306 entry = IFIRST(header);
1307 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) 1307 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1372,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1372 goto cleanup; 1372 goto cleanup;
1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1374 if (!bh) { 1374 if (!bh) {
1375 ext4_error(inode->i_sb, __func__, 1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error",
1376 "inode %lu: block %llu read error", inode->i_ino, 1376 inode->i_ino, EXT4_I(inode)->i_file_acl);
1377 EXT4_I(inode)->i_file_acl);
1378 goto cleanup; 1377 goto cleanup;
1379 } 1378 }
1380 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1381 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1382 ext4_error(inode->i_sb, __func__, 1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1383 "inode %lu: bad block %llu", inode->i_ino, 1382 inode->i_ino, EXT4_I(inode)->i_file_acl);
1384 EXT4_I(inode)->i_file_acl);
1385 goto cleanup; 1383 goto cleanup;
1386 } 1384 }
1387 ext4_xattr_release_block(handle, inode, bh); 1385 ext4_xattr_release_block(handle, inode, bh);
@@ -1506,7 +1504,7 @@ again:
1506 } 1504 }
1507 bh = sb_bread(inode->i_sb, ce->e_block); 1505 bh = sb_bread(inode->i_sb, ce->e_block);
1508 if (!bh) { 1506 if (!bh) {
1509 ext4_error(inode->i_sb, __func__, 1507 ext4_error(inode->i_sb,
1510 "inode %lu: block %lu read error", 1508 "inode %lu: block %lu read error",
1511 inode->i_ino, (unsigned long) ce->e_block); 1509 inode->i_ino, (unsigned long) ce->e_block);
1512 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 14da530b05ca..fbeecdc194dc 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
577 return i_pos; 577 return i_pos;
578} 578}
579 579
580static int fat_write_inode(struct inode *inode, int wait) 580static int __fat_write_inode(struct inode *inode, int wait)
581{ 581{
582 struct super_block *sb = inode->i_sb; 582 struct super_block *sb = inode->i_sb;
583 struct msdos_sb_info *sbi = MSDOS_SB(sb); 583 struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -634,9 +634,14 @@ retry:
634 return err; 634 return err;
635} 635}
636 636
637static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
638{
639 return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
640}
641
637int fat_sync_inode(struct inode *inode) 642int fat_sync_inode(struct inode *inode)
638{ 643{
639 return fat_write_inode(inode, 1); 644 return __fat_write_inode(inode, 1);
640} 645}
641 646
642EXPORT_SYMBOL_GPL(fat_sync_inode); 647EXPORT_SYMBOL_GPL(fat_sync_inode);
diff --git a/fs/file.c b/fs/file.c
index 87e129030ab1..38039af67663 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -478,7 +478,7 @@ repeat:
478 error = fd; 478 error = fd;
479#if 1 479#if 1
480 /* Sanity check */ 480 /* Sanity check */
481 if (rcu_dereference(fdt->fd[fd]) != NULL) { 481 if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
482 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); 482 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
483 rcu_assign_pointer(fdt->fd[fd], NULL); 483 rcu_assign_pointer(fdt->fd[fd], NULL);
484 } 484 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a7c42c64ff4..76fc4d594acb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -381,10 +381,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
381 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 381 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
382} 382}
383 383
384static int write_inode(struct inode *inode, int sync) 384static int write_inode(struct inode *inode, struct writeback_control *wbc)
385{ 385{
386 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 386 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
387 return inode->i_sb->s_op->write_inode(inode, sync); 387 return inode->i_sb->s_op->write_inode(inode, wbc);
388 return 0; 388 return 0;
389} 389}
390 390
@@ -421,7 +421,6 @@ static int
421writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 421writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
422{ 422{
423 struct address_space *mapping = inode->i_mapping; 423 struct address_space *mapping = inode->i_mapping;
424 int wait = wbc->sync_mode == WB_SYNC_ALL;
425 unsigned dirty; 424 unsigned dirty;
426 int ret; 425 int ret;
427 426
@@ -439,7 +438,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
439 * We'll have another go at writing back this inode when we 438 * We'll have another go at writing back this inode when we
440 * completed a full scan of b_io. 439 * completed a full scan of b_io.
441 */ 440 */
442 if (!wait) { 441 if (wbc->sync_mode != WB_SYNC_ALL) {
443 requeue_io(inode); 442 requeue_io(inode);
444 return 0; 443 return 0;
445 } 444 }
@@ -461,15 +460,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
461 460
462 ret = do_writepages(mapping, wbc); 461 ret = do_writepages(mapping, wbc);
463 462
464 /* Don't write the inode if only I_DIRTY_PAGES was set */ 463 /*
465 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 464 * Make sure to wait on the data before writing out the metadata.
466 int err = write_inode(inode, wait); 465 * This is important for filesystems that modify metadata on data
466 * I/O completion.
467 */
468 if (wbc->sync_mode == WB_SYNC_ALL) {
469 int err = filemap_fdatawait(mapping);
467 if (ret == 0) 470 if (ret == 0)
468 ret = err; 471 ret = err;
469 } 472 }
470 473
471 if (wait) { 474 /* Don't write the inode if only I_DIRTY_PAGES was set */
472 int err = filemap_fdatawait(mapping); 475 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
476 int err = write_inode(inode, wbc);
473 if (ret == 0) 477 if (ret == 0)
474 ret = err; 478 ret = err;
475 } 479 }
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51d9e33d634f..eb7e9423691f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -865,13 +865,10 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
865 865
866 down_read(&fc->killsb); 866 down_read(&fc->killsb);
867 err = -ENOENT; 867 err = -ENOENT;
868 if (!fc->sb) 868 if (fc->sb) {
869 goto err_unlock; 869 err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
870 870 outarg.off, outarg.len);
871 err = fuse_reverse_inval_inode(fc->sb, outarg.ino, 871 }
872 outarg.off, outarg.len);
873
874err_unlock:
875 up_read(&fc->killsb); 872 up_read(&fc->killsb);
876 return err; 873 return err;
877 874
@@ -884,10 +881,15 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
884 struct fuse_copy_state *cs) 881 struct fuse_copy_state *cs)
885{ 882{
886 struct fuse_notify_inval_entry_out outarg; 883 struct fuse_notify_inval_entry_out outarg;
887 int err = -EINVAL; 884 int err = -ENOMEM;
888 char buf[FUSE_NAME_MAX+1]; 885 char *buf;
889 struct qstr name; 886 struct qstr name;
890 887
888 buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
889 if (!buf)
890 goto err;
891
892 err = -EINVAL;
891 if (size < sizeof(outarg)) 893 if (size < sizeof(outarg))
892 goto err; 894 goto err;
893 895
@@ -910,16 +912,14 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
910 912
911 down_read(&fc->killsb); 913 down_read(&fc->killsb);
912 err = -ENOENT; 914 err = -ENOENT;
913 if (!fc->sb) 915 if (fc->sb)
914 goto err_unlock; 916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
915
916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
917
918err_unlock:
919 up_read(&fc->killsb); 917 up_read(&fc->killsb);
918 kfree(buf);
920 return err; 919 return err;
921 920
922err: 921err:
922 kfree(buf);
923 fuse_copy_finish(cs); 923 fuse_copy_finish(cs);
924 return err; 924 return err;
925} 925}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7b8da9415267..0c1d0b82dcf1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1061,8 +1061,8 @@ out:
1061 1061
1062int gfs2_releasepage(struct page *page, gfp_t gfp_mask) 1062int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1063{ 1063{
1064 struct inode *aspace = page->mapping->host; 1064 struct address_space *mapping = page->mapping;
1065 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; 1065 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
1066 struct buffer_head *bh, *head; 1066 struct buffer_head *bh, *head;
1067 struct gfs2_bufdata *bd; 1067 struct gfs2_bufdata *bd;
1068 1068
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d47379e794b..583e823307ae 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -541,7 +541,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
541 *ptr++ = cpu_to_be64(bn++); 541 *ptr++ = cpu_to_be64(bn++);
542 break; 542 break;
543 } 543 }
544 } while (state != ALLOC_DATA); 544 } while ((state != ALLOC_DATA) || !dblock);
545 545
546 ip->i_height = height; 546 ip->i_height = height;
547 gfs2_add_inode_blocks(&ip->i_inode, alloced); 547 gfs2_add_inode_blocks(&ip->i_inode, alloced);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f42663325931..454d4b4eb36b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,7 +19,6 @@
19#include <linux/list.h> 19#include <linux/list.h>
20#include <linux/wait.h> 20#include <linux/wait.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/rwsem.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
24#include <linux/seq_file.h> 23#include <linux/seq_file.h>
25#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -60,7 +59,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
60#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
61static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
62 61
63static DECLARE_RWSEM(gfs2_umount_flush_sem);
64static struct dentry *gfs2_root; 62static struct dentry *gfs2_root;
65static struct workqueue_struct *glock_workqueue; 63static struct workqueue_struct *glock_workqueue;
66struct workqueue_struct *gfs2_delete_workqueue; 64struct workqueue_struct *gfs2_delete_workqueue;
@@ -154,12 +152,14 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
154static void glock_free(struct gfs2_glock *gl) 152static void glock_free(struct gfs2_glock *gl)
155{ 153{
156 struct gfs2_sbd *sdp = gl->gl_sbd; 154 struct gfs2_sbd *sdp = gl->gl_sbd;
157 struct inode *aspace = gl->gl_aspace; 155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
158 157
159 if (aspace) 158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
160 gfs2_aspace_put(aspace);
161 trace_gfs2_glock_put(gl); 159 trace_gfs2_glock_put(gl);
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); 160 if (mapping)
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 163}
164 164
165/** 165/**
@@ -712,7 +712,6 @@ static void glock_work_func(struct work_struct *work)
712 finish_xmote(gl, gl->gl_reply); 712 finish_xmote(gl, gl->gl_reply);
713 drop_ref = 1; 713 drop_ref = 1;
714 } 714 }
715 down_read(&gfs2_umount_flush_sem);
716 spin_lock(&gl->gl_spin); 715 spin_lock(&gl->gl_spin);
717 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 716 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
718 gl->gl_state != LM_ST_UNLOCKED && 717 gl->gl_state != LM_ST_UNLOCKED &&
@@ -725,7 +724,6 @@ static void glock_work_func(struct work_struct *work)
725 } 724 }
726 run_queue(gl, 0); 725 run_queue(gl, 0);
727 spin_unlock(&gl->gl_spin); 726 spin_unlock(&gl->gl_spin);
728 up_read(&gfs2_umount_flush_sem);
729 if (!delay || 727 if (!delay ||
730 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 728 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
731 gfs2_glock_put(gl); 729 gfs2_glock_put(gl);
@@ -750,10 +748,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
750 const struct gfs2_glock_operations *glops, int create, 748 const struct gfs2_glock_operations *glops, int create,
751 struct gfs2_glock **glp) 749 struct gfs2_glock **glp)
752{ 750{
751 struct super_block *s = sdp->sd_vfs;
753 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; 752 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
754 struct gfs2_glock *gl, *tmp; 753 struct gfs2_glock *gl, *tmp;
755 unsigned int hash = gl_hash(sdp, &name); 754 unsigned int hash = gl_hash(sdp, &name);
756 int error; 755 struct address_space *mapping;
757 756
758 read_lock(gl_lock_addr(hash)); 757 read_lock(gl_lock_addr(hash));
759 gl = search_bucket(hash, sdp, &name); 758 gl = search_bucket(hash, sdp, &name);
@@ -765,7 +764,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
765 if (!create) 764 if (!create)
766 return -ENOENT; 765 return -ENOENT;
767 766
768 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 767 if (glops->go_flags & GLOF_ASPACE)
768 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
769 else
770 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
769 if (!gl) 771 if (!gl)
770 return -ENOMEM; 772 return -ENOMEM;
771 773
@@ -784,18 +786,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
784 gl->gl_tchange = jiffies; 786 gl->gl_tchange = jiffies;
785 gl->gl_object = NULL; 787 gl->gl_object = NULL;
786 gl->gl_sbd = sdp; 788 gl->gl_sbd = sdp;
787 gl->gl_aspace = NULL;
788 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 789 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
789 INIT_WORK(&gl->gl_delete, delete_work_func); 790 INIT_WORK(&gl->gl_delete, delete_work_func);
790 791
791 /* If this glock protects actual on-disk data or metadata blocks, 792 mapping = gfs2_glock2aspace(gl);
792 create a VFS inode to manage the pages/buffers holding them. */ 793 if (mapping) {
793 if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) { 794 mapping->a_ops = &gfs2_meta_aops;
794 gl->gl_aspace = gfs2_aspace_get(sdp); 795 mapping->host = s->s_bdev->bd_inode;
795 if (!gl->gl_aspace) { 796 mapping->flags = 0;
796 error = -ENOMEM; 797 mapping_set_gfp_mask(mapping, GFP_NOFS);
797 goto fail; 798 mapping->assoc_mapping = NULL;
798 } 799 mapping->backing_dev_info = s->s_bdi;
800 mapping->writeback_index = 0;
799 } 801 }
800 802
801 write_lock(gl_lock_addr(hash)); 803 write_lock(gl_lock_addr(hash));
@@ -812,10 +814,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
812 *glp = gl; 814 *glp = gl;
813 815
814 return 0; 816 return 0;
815
816fail:
817 kmem_cache_free(gfs2_glock_cachep, gl);
818 return error;
819} 817}
820 818
821/** 819/**
@@ -1510,35 +1508,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1510 1508
1511void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1509void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1512{ 1510{
1513 unsigned long t;
1514 unsigned int x; 1511 unsigned int x;
1515 int cont;
1516 1512
1517 t = jiffies; 1513 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1518 1514 examine_bucket(clear_glock, sdp, x);
1519 for (;;) {
1520 cont = 0;
1521 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
1522 if (examine_bucket(clear_glock, sdp, x))
1523 cont = 1;
1524 }
1525
1526 if (!cont)
1527 break;
1528
1529 if (time_after_eq(jiffies,
1530 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
1531 fs_warn(sdp, "Unmount seems to be stalled. "
1532 "Dumping lock state...\n");
1533 gfs2_dump_lockstate(sdp);
1534 t = jiffies;
1535 }
1536
1537 down_write(&gfs2_umount_flush_sem);
1538 invalidate_inodes(sdp->sd_vfs);
1539 up_write(&gfs2_umount_flush_sem);
1540 msleep(10);
1541 }
1542 flush_workqueue(glock_workqueue); 1515 flush_workqueue(glock_workqueue);
1543 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); 1516 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1544 gfs2_dump_lockstate(sdp); 1517 gfs2_dump_lockstate(sdp);
@@ -1685,7 +1658,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1685 dtime *= 1000000/HZ; /* demote time in uSec */ 1658 dtime *= 1000000/HZ; /* demote time in uSec */
1686 if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) 1659 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1687 dtime = 0; 1660 dtime = 0;
1688 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n", 1661 gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
1689 state2str(gl->gl_state), 1662 state2str(gl->gl_state),
1690 gl->gl_name.ln_type, 1663 gl->gl_name.ln_type,
1691 (unsigned long long)gl->gl_name.ln_number, 1664 (unsigned long long)gl->gl_name.ln_number,
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c0262faf4725..2bda1911b156 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -180,6 +180,13 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
180 return gl->gl_state == LM_ST_SHARED; 180 return gl->gl_state == LM_ST_SHARED;
181} 181}
182 182
183static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
184{
185 if (gl->gl_ops->go_flags & GLOF_ASPACE)
186 return (struct address_space *)(gl + 1);
187 return NULL;
188}
189
183int gfs2_glock_get(struct gfs2_sbd *sdp, 190int gfs2_glock_get(struct gfs2_sbd *sdp,
184 u64 number, const struct gfs2_glock_operations *glops, 191 u64 number, const struct gfs2_glock_operations *glops,
185 int create, struct gfs2_glock **glp); 192 int create, struct gfs2_glock **glp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78554acc0605..38e3749d476c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -87,7 +87,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
87 87
88static void rgrp_go_sync(struct gfs2_glock *gl) 88static void rgrp_go_sync(struct gfs2_glock *gl)
89{ 89{
90 struct address_space *metamapping = gl->gl_aspace->i_mapping; 90 struct address_space *metamapping = gfs2_glock2aspace(gl);
91 int error; 91 int error;
92 92
93 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 93 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -113,7 +113,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
113 113
114static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 114static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
115{ 115{
116 struct address_space *mapping = gl->gl_aspace->i_mapping; 116 struct address_space *mapping = gfs2_glock2aspace(gl);
117 117
118 BUG_ON(!(flags & DIO_METADATA)); 118 BUG_ON(!(flags & DIO_METADATA));
119 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 119 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
@@ -134,7 +134,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
134static void inode_go_sync(struct gfs2_glock *gl) 134static void inode_go_sync(struct gfs2_glock *gl)
135{ 135{
136 struct gfs2_inode *ip = gl->gl_object; 136 struct gfs2_inode *ip = gl->gl_object;
137 struct address_space *metamapping = gl->gl_aspace->i_mapping; 137 struct address_space *metamapping = gfs2_glock2aspace(gl);
138 int error; 138 int error;
139 139
140 if (ip && !S_ISREG(ip->i_inode.i_mode)) 140 if (ip && !S_ISREG(ip->i_inode.i_mode))
@@ -183,7 +183,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
183 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 183 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
184 184
185 if (flags & DIO_METADATA) { 185 if (flags & DIO_METADATA) {
186 struct address_space *mapping = gl->gl_aspace->i_mapping; 186 struct address_space *mapping = gfs2_glock2aspace(gl);
187 truncate_inode_pages(mapping, 0); 187 truncate_inode_pages(mapping, 0);
188 if (ip) { 188 if (ip) {
189 set_bit(GIF_INVALID, &ip->i_flags); 189 set_bit(GIF_INVALID, &ip->i_flags);
@@ -282,7 +282,8 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
282 282
283static int rgrp_go_demote_ok(const struct gfs2_glock *gl) 283static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
284{ 284{
285 return !gl->gl_aspace->i_mapping->nrpages; 285 const struct address_space *mapping = (const struct address_space *)(gl + 1);
286 return !mapping->nrpages;
286} 287}
287 288
288/** 289/**
@@ -387,8 +388,7 @@ static void iopen_go_callback(struct gfs2_glock *gl)
387 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; 388 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
388 389
389 if (gl->gl_demote_state == LM_ST_UNLOCKED && 390 if (gl->gl_demote_state == LM_ST_UNLOCKED &&
390 gl->gl_state == LM_ST_SHARED && 391 gl->gl_state == LM_ST_SHARED && ip) {
391 ip && test_bit(GIF_USER, &ip->i_flags)) {
392 gfs2_glock_hold(gl); 392 gfs2_glock_hold(gl);
393 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) 393 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
394 gfs2_glock_put_nolock(gl); 394 gfs2_glock_put_nolock(gl);
@@ -407,6 +407,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
407 .go_dump = inode_go_dump, 407 .go_dump = inode_go_dump,
408 .go_type = LM_TYPE_INODE, 408 .go_type = LM_TYPE_INODE,
409 .go_min_hold_time = HZ / 5, 409 .go_min_hold_time = HZ / 5,
410 .go_flags = GLOF_ASPACE,
410}; 411};
411 412
412const struct gfs2_glock_operations gfs2_rgrp_glops = { 413const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -418,6 +419,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
418 .go_dump = gfs2_rgrp_dump, 419 .go_dump = gfs2_rgrp_dump,
419 .go_type = LM_TYPE_RGRP, 420 .go_type = LM_TYPE_RGRP,
420 .go_min_hold_time = HZ / 5, 421 .go_min_hold_time = HZ / 5,
422 .go_flags = GLOF_ASPACE,
421}; 423};
422 424
423const struct gfs2_glock_operations gfs2_trans_glops = { 425const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index bc0ad158e6b4..b8025e51cabf 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -162,6 +162,8 @@ struct gfs2_glock_operations {
162 void (*go_callback) (struct gfs2_glock *gl); 162 void (*go_callback) (struct gfs2_glock *gl);
163 const int go_type; 163 const int go_type;
164 const unsigned long go_min_hold_time; 164 const unsigned long go_min_hold_time;
165 const unsigned long go_flags;
166#define GLOF_ASPACE 1
165}; 167};
166 168
167enum { 169enum {
@@ -225,7 +227,6 @@ struct gfs2_glock {
225 227
226 struct gfs2_sbd *gl_sbd; 228 struct gfs2_sbd *gl_sbd;
227 229
228 struct inode *gl_aspace;
229 struct list_head gl_ail_list; 230 struct list_head gl_ail_list;
230 atomic_t gl_ail_count; 231 atomic_t gl_ail_count;
231 struct delayed_work gl_work; 232 struct delayed_work gl_work;
@@ -258,7 +259,6 @@ enum {
258 GIF_INVALID = 0, 259 GIF_INVALID = 0,
259 GIF_QD_LOCKED = 1, 260 GIF_QD_LOCKED = 1,
260 GIF_SW_PAGED = 3, 261 GIF_SW_PAGED = 3,
261 GIF_USER = 4, /* user inode, not metadata addr space */
262}; 262};
263 263
264 264
@@ -451,7 +451,6 @@ struct gfs2_tune {
451 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 451 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
452 unsigned int gt_new_files_jdata; 452 unsigned int gt_new_files_jdata;
453 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 453 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
454 unsigned int gt_stall_secs; /* Detects trouble! */
455 unsigned int gt_complain_secs; 454 unsigned int gt_complain_secs;
456 unsigned int gt_statfs_quantum; 455 unsigned int gt_statfs_quantum;
457 unsigned int gt_statfs_slow; 456 unsigned int gt_statfs_slow;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6e220f4eee7d..b1bf2694fb2b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -45,7 +45,7 @@ static int iget_test(struct inode *inode, void *opaque)
45 struct gfs2_inode *ip = GFS2_I(inode); 45 struct gfs2_inode *ip = GFS2_I(inode);
46 u64 *no_addr = opaque; 46 u64 *no_addr = opaque;
47 47
48 if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags)) 48 if (ip->i_no_addr == *no_addr)
49 return 1; 49 return 1;
50 50
51 return 0; 51 return 0;
@@ -58,7 +58,6 @@ static int iget_set(struct inode *inode, void *opaque)
58 58
59 inode->i_ino = (unsigned long)*no_addr; 59 inode->i_ino = (unsigned long)*no_addr;
60 ip->i_no_addr = *no_addr; 60 ip->i_no_addr = *no_addr;
61 set_bit(GIF_USER, &ip->i_flags);
62 return 0; 61 return 0;
63} 62}
64 63
@@ -84,7 +83,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
84 struct gfs2_inode *ip = GFS2_I(inode); 83 struct gfs2_inode *ip = GFS2_I(inode);
85 struct gfs2_skip_data *data = opaque; 84 struct gfs2_skip_data *data = opaque;
86 85
87 if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){ 86 if (ip->i_no_addr == data->no_addr) {
88 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ 87 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
89 data->skipped = 1; 88 data->skipped = 1;
90 return 0; 89 return 0;
@@ -103,7 +102,6 @@ static int iget_skip_set(struct inode *inode, void *opaque)
103 return 1; 102 return 1;
104 inode->i_ino = (unsigned long)(data->no_addr); 103 inode->i_ino = (unsigned long)(data->no_addr);
105 ip->i_no_addr = data->no_addr; 104 ip->i_no_addr = data->no_addr;
106 set_bit(GIF_USER, &ip->i_flags);
107 return 0; 105 return 0;
108} 106}
109 107
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e5e0e7022e5..569b46240f61 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -30,7 +30,10 @@ static void gdlm_ast(void *arg)
30 30
31 switch (gl->gl_lksb.sb_status) { 31 switch (gl->gl_lksb.sb_status) {
32 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 32 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
33 kmem_cache_free(gfs2_glock_cachep, gl); 33 if (gl->gl_ops->go_flags & GLOF_ASPACE)
34 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
35 else
36 kmem_cache_free(gfs2_glock_cachep, gl);
34 if (atomic_dec_and_test(&sdp->sd_glock_disposal)) 37 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
35 wake_up(&sdp->sd_glock_wait); 38 wake_up(&sdp->sd_glock_wait);
36 return; 39 return;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index de97632ba32f..adc260fbea90 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -528,9 +528,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
528 gfs2_pin(sdp, bd->bd_bh); 528 gfs2_pin(sdp, bd->bd_bh);
529 tr->tr_num_databuf_new++; 529 tr->tr_num_databuf_new++;
530 sdp->sd_log_num_databuf++; 530 sdp->sd_log_num_databuf++;
531 list_add(&le->le_list, &sdp->sd_log_le_databuf); 531 list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
532 } else { 532 } else {
533 list_add(&le->le_list, &sdp->sd_log_le_ordered); 533 list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
534 } 534 }
535out: 535out:
536 gfs2_log_unlock(sdp); 536 gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 5b31f7741a8f..a88fadc704bb 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -52,6 +52,22 @@ static void gfs2_init_glock_once(void *foo)
52 atomic_set(&gl->gl_ail_count, 0); 52 atomic_set(&gl->gl_ail_count, 0);
53} 53}
54 54
55static void gfs2_init_gl_aspace_once(void *foo)
56{
57 struct gfs2_glock *gl = foo;
58 struct address_space *mapping = (struct address_space *)(gl + 1);
59
60 gfs2_init_glock_once(gl);
61 memset(mapping, 0, sizeof(*mapping));
62 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
63 spin_lock_init(&mapping->tree_lock);
64 spin_lock_init(&mapping->i_mmap_lock);
65 INIT_LIST_HEAD(&mapping->private_list);
66 spin_lock_init(&mapping->private_lock);
67 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
68 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
69}
70
55/** 71/**
56 * init_gfs2_fs - Register GFS2 as a filesystem 72 * init_gfs2_fs - Register GFS2 as a filesystem
57 * 73 *
@@ -78,6 +94,14 @@ static int __init init_gfs2_fs(void)
78 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
79 goto fail; 95 goto fail;
80 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once);
101
102 if (!gfs2_glock_aspace_cachep)
103 goto fail;
104
81 gfs2_inode_cachep = kmem_cache_create("gfs2_inode", 105 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
82 sizeof(struct gfs2_inode), 106 sizeof(struct gfs2_inode),
83 0, SLAB_RECLAIM_ACCOUNT| 107 0, SLAB_RECLAIM_ACCOUNT|
@@ -144,6 +168,9 @@ fail:
144 if (gfs2_inode_cachep) 168 if (gfs2_inode_cachep)
145 kmem_cache_destroy(gfs2_inode_cachep); 169 kmem_cache_destroy(gfs2_inode_cachep);
146 170
171 if (gfs2_glock_aspace_cachep)
172 kmem_cache_destroy(gfs2_glock_aspace_cachep);
173
147 if (gfs2_glock_cachep) 174 if (gfs2_glock_cachep)
148 kmem_cache_destroy(gfs2_glock_cachep); 175 kmem_cache_destroy(gfs2_glock_cachep);
149 176
@@ -169,6 +196,7 @@ static void __exit exit_gfs2_fs(void)
169 kmem_cache_destroy(gfs2_rgrpd_cachep); 196 kmem_cache_destroy(gfs2_rgrpd_cachep);
170 kmem_cache_destroy(gfs2_bufdata_cachep); 197 kmem_cache_destroy(gfs2_bufdata_cachep);
171 kmem_cache_destroy(gfs2_inode_cachep); 198 kmem_cache_destroy(gfs2_inode_cachep);
199 kmem_cache_destroy(gfs2_glock_aspace_cachep);
172 kmem_cache_destroy(gfs2_glock_cachep); 200 kmem_cache_destroy(gfs2_glock_cachep);
173 201
174 gfs2_sys_uninit(); 202 gfs2_sys_uninit();
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 6f68a5f18eb8..0bb12c80937a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -93,49 +93,13 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
93 return err; 93 return err;
94} 94}
95 95
96static const struct address_space_operations aspace_aops = { 96const struct address_space_operations gfs2_meta_aops = {
97 .writepage = gfs2_aspace_writepage, 97 .writepage = gfs2_aspace_writepage,
98 .releasepage = gfs2_releasepage, 98 .releasepage = gfs2_releasepage,
99 .sync_page = block_sync_page, 99 .sync_page = block_sync_page,
100}; 100};
101 101
102/** 102/**
103 * gfs2_aspace_get - Create and initialize a struct inode structure
104 * @sdp: the filesystem the aspace is in
105 *
106 * Right now a struct inode is just a struct inode. Maybe Linux
107 * will supply a more lightweight address space construct (that works)
108 * in the future.
109 *
110 * Make sure pages/buffers in this aspace aren't in high memory.
111 *
112 * Returns: the aspace
113 */
114
115struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
116{
117 struct inode *aspace;
118 struct gfs2_inode *ip;
119
120 aspace = new_inode(sdp->sd_vfs);
121 if (aspace) {
122 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
123 aspace->i_mapping->a_ops = &aspace_aops;
124 aspace->i_size = MAX_LFS_FILESIZE;
125 ip = GFS2_I(aspace);
126 clear_bit(GIF_USER, &ip->i_flags);
127 insert_inode_hash(aspace);
128 }
129 return aspace;
130}
131
132void gfs2_aspace_put(struct inode *aspace)
133{
134 remove_inode_hash(aspace);
135 iput(aspace);
136}
137
138/**
139 * gfs2_meta_sync - Sync all buffers associated with a glock 103 * gfs2_meta_sync - Sync all buffers associated with a glock
140 * @gl: The glock 104 * @gl: The glock
141 * 105 *
@@ -143,7 +107,7 @@ void gfs2_aspace_put(struct inode *aspace)
143 107
144void gfs2_meta_sync(struct gfs2_glock *gl) 108void gfs2_meta_sync(struct gfs2_glock *gl)
145{ 109{
146 struct address_space *mapping = gl->gl_aspace->i_mapping; 110 struct address_space *mapping = gfs2_glock2aspace(gl);
147 int error; 111 int error;
148 112
149 filemap_fdatawrite(mapping); 113 filemap_fdatawrite(mapping);
@@ -164,7 +128,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
164 128
165struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) 129struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
166{ 130{
167 struct address_space *mapping = gl->gl_aspace->i_mapping; 131 struct address_space *mapping = gfs2_glock2aspace(gl);
168 struct gfs2_sbd *sdp = gl->gl_sbd; 132 struct gfs2_sbd *sdp = gl->gl_sbd;
169 struct page *page; 133 struct page *page;
170 struct buffer_head *bh; 134 struct buffer_head *bh;
@@ -344,8 +308,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
344 308
345void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) 309void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
346{ 310{
347 struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host); 311 struct address_space *mapping = bh->b_page->mapping;
312 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
348 struct gfs2_bufdata *bd = bh->b_private; 313 struct gfs2_bufdata *bd = bh->b_private;
314
349 if (test_clear_buffer_pinned(bh)) { 315 if (test_clear_buffer_pinned(bh)) {
350 list_del_init(&bd->bd_le.le_list); 316 list_del_init(&bd->bd_le.le_list);
351 if (meta) { 317 if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index de270c2f9b63..6a1d9ba16411 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -37,8 +37,16 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
37 0, from_head - to_head); 37 0, from_head - to_head);
38} 38}
39 39
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); 40extern const struct address_space_operations gfs2_meta_aops;
41void gfs2_aspace_put(struct inode *aspace); 41
42static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
43{
44 struct inode *inode = mapping->host;
45 if (mapping->a_ops == &gfs2_meta_aops)
46 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
47 else
48 return inode->i_sb->s_fs_info;
49}
42 50
43void gfs2_meta_sync(struct gfs2_glock *gl); 51void gfs2_meta_sync(struct gfs2_glock *gl);
44 52
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8a102f731003..a054b526dc08 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -65,7 +65,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
65 gt->gt_quota_scale_den = 1; 65 gt->gt_quota_scale_den = 1;
66 gt->gt_new_files_jdata = 0; 66 gt->gt_new_files_jdata = 0;
67 gt->gt_max_readahead = 1 << 18; 67 gt->gt_max_readahead = 1 << 18;
68 gt->gt_stall_secs = 600;
69 gt->gt_complain_secs = 10; 68 gt->gt_complain_secs = 10;
70} 69}
71 70
@@ -725,7 +724,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
725 goto fail; 724 goto fail;
726 } 725 }
727 726
728 error = -EINVAL; 727 error = -EUSERS;
729 if (!gfs2_jindex_size(sdp)) { 728 if (!gfs2_jindex_size(sdp)) {
730 fs_err(sdp, "no journals!\n"); 729 fs_err(sdp, "no journals!\n");
731 goto fail_jindex; 730 goto fail_jindex;
@@ -1241,10 +1240,9 @@ fail_sb:
1241fail_locking: 1240fail_locking:
1242 init_locking(sdp, &mount_gh, UNDO); 1241 init_locking(sdp, &mount_gh, UNDO);
1243fail_lm: 1242fail_lm:
1243 invalidate_inodes(sb);
1244 gfs2_gl_hash_clear(sdp); 1244 gfs2_gl_hash_clear(sdp);
1245 gfs2_lm_unmount(sdp); 1245 gfs2_lm_unmount(sdp);
1246 while (invalidate_inodes(sb))
1247 yield();
1248fail_sys: 1246fail_sys:
1249 gfs2_sys_fs_del(sdp); 1247 gfs2_sys_fs_del(sdp);
1250fail: 1248fail:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 84350e1be66d..4e64352d49de 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -976,122 +976,62 @@ out:
976} 976}
977 977
978/** 978/**
979 * gfs2_readlinki - return the contents of a symlink 979 * gfs2_follow_link - Follow a symbolic link
980 * @ip: the symlink's inode 980 * @dentry: The dentry of the link
981 * @buf: a pointer to the buffer to be filled 981 * @nd: Data that we pass to vfs_follow_link()
982 * @len: a pointer to the length of @buf
983 * 982 *
984 * If @buf is too small, a piece of memory is kmalloc()ed and needs 983 * This can handle symlinks of any size.
985 * to be freed by the caller.
986 * 984 *
987 * Returns: errno 985 * Returns: 0 on success or error code
988 */ 986 */
989 987
990static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) 988static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
991{ 989{
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
992 struct gfs2_holder i_gh; 991 struct gfs2_holder i_gh;
993 struct buffer_head *dibh; 992 struct buffer_head *dibh;
994 unsigned int x; 993 unsigned int x;
994 char *buf;
995 int error; 995 int error;
996 996
997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); 997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
998 error = gfs2_glock_nq(&i_gh); 998 error = gfs2_glock_nq(&i_gh);
999 if (error) { 999 if (error) {
1000 gfs2_holder_uninit(&i_gh); 1000 gfs2_holder_uninit(&i_gh);
1001 return error; 1001 nd_set_link(nd, ERR_PTR(error));
1002 return NULL;
1002 } 1003 }
1003 1004
1004 if (!ip->i_disksize) { 1005 if (!ip->i_disksize) {
1005 gfs2_consist_inode(ip); 1006 gfs2_consist_inode(ip);
1006 error = -EIO; 1007 buf = ERR_PTR(-EIO);
1007 goto out; 1008 goto out;
1008 } 1009 }
1009 1010
1010 error = gfs2_meta_inode_buffer(ip, &dibh); 1011 error = gfs2_meta_inode_buffer(ip, &dibh);
1011 if (error) 1012 if (error) {
1013 buf = ERR_PTR(error);
1012 goto out; 1014 goto out;
1013
1014 x = ip->i_disksize + 1;
1015 if (x > *len) {
1016 *buf = kmalloc(x, GFP_NOFS);
1017 if (!*buf) {
1018 error = -ENOMEM;
1019 goto out_brelse;
1020 }
1021 } 1015 }
1022 1016
1023 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); 1017 x = ip->i_disksize + 1;
1024 *len = x; 1018 buf = kmalloc(x, GFP_NOFS);
1025 1019 if (!buf)
1026out_brelse: 1020 buf = ERR_PTR(-ENOMEM);
1021 else
1022 memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1027 brelse(dibh); 1023 brelse(dibh);
1028out: 1024out:
1029 gfs2_glock_dq_uninit(&i_gh); 1025 gfs2_glock_dq_uninit(&i_gh);
1030 return error; 1026 nd_set_link(nd, buf);
1031} 1027 return NULL;
1032
1033/**
1034 * gfs2_readlink - Read the value of a symlink
1035 * @dentry: the symlink
1036 * @buf: the buffer to read the symlink data into
1037 * @size: the size of the buffer
1038 *
1039 * Returns: errno
1040 */
1041
1042static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
1043 int user_size)
1044{
1045 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
1046 char array[GFS2_FAST_NAME_SIZE], *buf = array;
1047 unsigned int len = GFS2_FAST_NAME_SIZE;
1048 int error;
1049
1050 error = gfs2_readlinki(ip, &buf, &len);
1051 if (error)
1052 return error;
1053
1054 if (user_size > len - 1)
1055 user_size = len - 1;
1056
1057 if (copy_to_user(user_buf, buf, user_size))
1058 error = -EFAULT;
1059 else
1060 error = user_size;
1061
1062 if (buf != array)
1063 kfree(buf);
1064
1065 return error;
1066} 1028}
1067 1029
1068/** 1030static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1069 * gfs2_follow_link - Follow a symbolic link
1070 * @dentry: The dentry of the link
1071 * @nd: Data that we pass to vfs_follow_link()
1072 *
1073 * This can handle symlinks of any size. It is optimised for symlinks
1074 * under GFS2_FAST_NAME_SIZE.
1075 *
1076 * Returns: 0 on success or error code
1077 */
1078
1079static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1080{ 1031{
1081 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 1032 char *s = nd_get_link(nd);
1082 char array[GFS2_FAST_NAME_SIZE], *buf = array; 1033 if (!IS_ERR(s))
1083 unsigned int len = GFS2_FAST_NAME_SIZE; 1034 kfree(s);
1084 int error;
1085
1086 error = gfs2_readlinki(ip, &buf, &len);
1087 if (!error) {
1088 error = vfs_follow_link(nd, buf);
1089 if (buf != array)
1090 kfree(buf);
1091 } else
1092 path_put(&nd->path);
1093
1094 return ERR_PTR(error);
1095} 1035}
1096 1036
1097/** 1037/**
@@ -1426,8 +1366,9 @@ const struct inode_operations gfs2_dir_iops = {
1426}; 1366};
1427 1367
1428const struct inode_operations gfs2_symlink_iops = { 1368const struct inode_operations gfs2_symlink_iops = {
1429 .readlink = gfs2_readlink, 1369 .readlink = generic_readlink,
1430 .follow_link = gfs2_follow_link, 1370 .follow_link = gfs2_follow_link,
1371 .put_link = gfs2_put_link,
1431 .permission = gfs2_permission, 1372 .permission = gfs2_permission,
1432 .setattr = gfs2_setattr, 1373 .setattr = gfs2_setattr,
1433 .getattr = gfs2_getattr, 1374 .getattr = gfs2_getattr,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b9dd3da22c0a..ca87598ead7f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -22,6 +22,7 @@
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/time.h> 23#include <linux/time.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/writeback.h>
25 26
26#include "gfs2.h" 27#include "gfs2.h"
27#include "incore.h" 28#include "incore.h"
@@ -711,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
711 * Returns: errno 712 * Returns: errno
712 */ 713 */
713 714
714static int gfs2_write_inode(struct inode *inode, int sync) 715static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
715{ 716{
716 struct gfs2_inode *ip = GFS2_I(inode); 717 struct gfs2_inode *ip = GFS2_I(inode);
717 struct gfs2_sbd *sdp = GFS2_SB(inode); 718 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -722,8 +723,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
722 int ret = 0; 723 int ret = 0;
723 724
724 /* Check this is a "normal" inode, etc */ 725 /* Check this is a "normal" inode, etc */
725 if (!test_bit(GIF_USER, &ip->i_flags) || 726 if (current->flags & PF_MEMALLOC)
726 (current->flags & PF_MEMALLOC))
727 return 0; 727 return 0;
728 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 728 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
729 if (ret) 729 if (ret)
@@ -746,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
746do_unlock: 746do_unlock:
747 gfs2_glock_dq_uninit(&gh); 747 gfs2_glock_dq_uninit(&gh);
748do_flush: 748do_flush:
749 if (sync != 0) 749 if (wbc->sync_mode == WB_SYNC_ALL)
750 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 750 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
751 return ret; 751 return ret;
752} 752}
@@ -860,6 +860,7 @@ restart:
860 gfs2_clear_rgrpd(sdp); 860 gfs2_clear_rgrpd(sdp);
861 gfs2_jindex_free(sdp); 861 gfs2_jindex_free(sdp);
862 /* Take apart glock structures and buffer lists */ 862 /* Take apart glock structures and buffer lists */
863 invalidate_inodes(sdp->sd_vfs);
863 gfs2_gl_hash_clear(sdp); 864 gfs2_gl_hash_clear(sdp);
864 /* Unmount the locking protocol */ 865 /* Unmount the locking protocol */
865 gfs2_lm_unmount(sdp); 866 gfs2_lm_unmount(sdp);
@@ -1194,7 +1195,7 @@ static void gfs2_drop_inode(struct inode *inode)
1194{ 1195{
1195 struct gfs2_inode *ip = GFS2_I(inode); 1196 struct gfs2_inode *ip = GFS2_I(inode);
1196 1197
1197 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { 1198 if (inode->i_nlink) {
1198 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; 1199 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1199 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1200 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1200 clear_nlink(inode); 1201 clear_nlink(inode);
@@ -1212,18 +1213,12 @@ static void gfs2_clear_inode(struct inode *inode)
1212{ 1213{
1213 struct gfs2_inode *ip = GFS2_I(inode); 1214 struct gfs2_inode *ip = GFS2_I(inode);
1214 1215
1215 /* This tells us its a "real" inode and not one which only 1216 ip->i_gl->gl_object = NULL;
1216 * serves to contain an address space (see rgrp.c, meta_io.c) 1217 gfs2_glock_put(ip->i_gl);
1217 * which therefore doesn't have its own glocks. 1218 ip->i_gl = NULL;
1218 */ 1219 if (ip->i_iopen_gh.gh_gl) {
1219 if (test_bit(GIF_USER, &ip->i_flags)) { 1220 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1220 ip->i_gl->gl_object = NULL; 1221 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1221 gfs2_glock_put(ip->i_gl);
1222 ip->i_gl = NULL;
1223 if (ip->i_iopen_gh.gh_gl) {
1224 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1225 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1226 }
1227 } 1222 }
1228} 1223}
1229 1224
@@ -1358,9 +1353,6 @@ static void gfs2_delete_inode(struct inode *inode)
1358 struct gfs2_holder gh; 1353 struct gfs2_holder gh;
1359 int error; 1354 int error;
1360 1355
1361 if (!test_bit(GIF_USER, &ip->i_flags))
1362 goto out;
1363
1364 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 1356 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1365 if (unlikely(error)) { 1357 if (unlikely(error)) {
1366 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 1358 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0dc34621f6a6..a0db1c94317d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -478,7 +478,6 @@ TUNE_ATTR(complain_secs, 0);
478TUNE_ATTR(statfs_slow, 0); 478TUNE_ATTR(statfs_slow, 0);
479TUNE_ATTR(new_files_jdata, 0); 479TUNE_ATTR(new_files_jdata, 0);
480TUNE_ATTR(quota_simul_sync, 1); 480TUNE_ATTR(quota_simul_sync, 1);
481TUNE_ATTR(stall_secs, 1);
482TUNE_ATTR(statfs_quantum, 1); 481TUNE_ATTR(statfs_quantum, 1);
483TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 482TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
484 483
@@ -491,7 +490,6 @@ static struct attribute *tune_attrs[] = {
491 &tune_attr_complain_secs.attr, 490 &tune_attr_complain_secs.attr,
492 &tune_attr_statfs_slow.attr, 491 &tune_attr_statfs_slow.attr,
493 &tune_attr_quota_simul_sync.attr, 492 &tune_attr_quota_simul_sync.attr,
494 &tune_attr_stall_secs.attr,
495 &tune_attr_statfs_quantum.attr, 493 &tune_attr_statfs_quantum.attr,
496 &tune_attr_quota_scale.attr, 494 &tune_attr_quota_scale.attr,
497 &tune_attr_new_files_jdata.attr, 495 &tune_attr_new_files_jdata.attr,
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f6a7efa34eb9..226f2bfbf16a 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -21,6 +21,7 @@
21#include "util.h" 21#include "util.h"
22 22
23struct kmem_cache *gfs2_glock_cachep __read_mostly; 23struct kmem_cache *gfs2_glock_cachep __read_mostly;
24struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly;
24struct kmem_cache *gfs2_inode_cachep __read_mostly; 25struct kmem_cache *gfs2_inode_cachep __read_mostly;
25struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 26struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
26struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; 27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 33e96b0ce9ab..b432e04600de 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -145,6 +145,7 @@ gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
145 145
146 146
147extern struct kmem_cache *gfs2_glock_cachep; 147extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_glock_aspace_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 149extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 150extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep; 151extern struct kmem_cache *gfs2_rgrpd_cachep;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 052387e11671..fe35e3b626c4 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops;
188 188
189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); 189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); 190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
191extern int hfs_write_inode(struct inode *, int); 191extern int hfs_write_inode(struct inode *, struct writeback_control *);
192extern int hfs_inode_setattr(struct dentry *, struct iattr *); 192extern int hfs_inode_setattr(struct dentry *, struct iattr *);
193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, 193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
194 __be32 log_size, __be32 phys_size, u32 clump_size); 194 __be32 log_size, __be32 phys_size, u32 clump_size);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a1cbff2b4d99..14f5cb1b9fdc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
381 HFS_SB(inode->i_sb)->alloc_blksz); 381 HFS_SB(inode->i_sb)->alloc_blksz);
382} 382}
383 383
384int hfs_write_inode(struct inode *inode, int unused) 384int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
385{ 385{
386 struct inode *main_inode = inode; 386 struct inode *main_inode = inode;
387 struct hfs_find_data fd; 387 struct hfs_find_data fd;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 43022f3d5148..74b473a8ef92 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -87,7 +87,8 @@ bad_inode:
87 return ERR_PTR(err); 87 return ERR_PTR(err);
88} 88}
89 89
90static int hfsplus_write_inode(struct inode *inode, int unused) 90static int hfsplus_write_inode(struct inode *inode,
91 struct writeback_control *wbc)
91{ 92{
92 struct hfsplus_vh *vhdr; 93 struct hfsplus_vh *vhdr;
93 int ret = 0; 94 int ret = 0;
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 1aa88c4e0964..6a2f04bf3df0 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -353,7 +353,7 @@ int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos,
353} 353}
354 354
355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, 355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos,
356 unsigned len, char *buf) 356 unsigned len, const char *buf)
357{ 357{
358 struct buffer_head *bh; 358 struct buffer_head *bh;
359 char *data; 359 char *data;
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 940d6d150bee..67d9d36b3d5f 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -20,8 +20,8 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
20 20
21 if (l == 1) if (qstr->name[0]=='.') goto x; 21 if (l == 1) if (qstr->name[0]=='.') goto x;
22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x; 22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x;
23 hpfs_adjust_length((char *)qstr->name, &l); 23 hpfs_adjust_length(qstr->name, &l);
24 /*if (hpfs_chk_name((char *)qstr->name,&l))*/ 24 /*if (hpfs_chk_name(qstr->name,&l))*/
25 /*return -ENAMETOOLONG;*/ 25 /*return -ENAMETOOLONG;*/
26 /*return -ENOENT;*/ 26 /*return -ENOENT;*/
27 x: 27 x:
@@ -38,14 +38,16 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
38{ 38{
39 unsigned al=a->len; 39 unsigned al=a->len;
40 unsigned bl=b->len; 40 unsigned bl=b->len;
41 hpfs_adjust_length((char *)a->name, &al); 41 hpfs_adjust_length(a->name, &al);
42 /*hpfs_adjust_length((char *)b->name, &bl);*/ 42 /*hpfs_adjust_length(b->name, &bl);*/
43 /* 'a' is the qstr of an already existing dentry, so the name 43 /* 'a' is the qstr of an already existing dentry, so the name
44 * must be valid. 'b' must be validated first. 44 * must be valid. 'b' must be validated first.
45 */ 45 */
46 46
47 if (hpfs_chk_name((char *)b->name, &bl)) return 1; 47 if (hpfs_chk_name(b->name, &bl))
48 if (hpfs_compare_names(dentry->d_sb, (char *)a->name, al, (char *)b->name, bl, 0)) return 1; 48 return 1;
49 if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
50 return 1;
49 return 0; 51 return 0;
50} 52}
51 53
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 8865c94f55f6..26e3964a4b8c 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -59,7 +59,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
59 struct hpfs_dirent *de; 59 struct hpfs_dirent *de;
60 int lc; 60 int lc;
61 long old_pos; 61 long old_pos;
62 char *tempname; 62 unsigned char *tempname;
63 int c1, c2 = 0; 63 int c1, c2 = 0;
64 int ret = 0; 64 int ret = 0;
65 65
@@ -158,11 +158,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
158 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); 158 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
159 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) { 159 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) {
160 filp->f_pos = old_pos; 160 filp->f_pos = old_pos;
161 if (tempname != (char *)de->name) kfree(tempname); 161 if (tempname != de->name) kfree(tempname);
162 hpfs_brelse4(&qbh); 162 hpfs_brelse4(&qbh);
163 goto out; 163 goto out;
164 } 164 }
165 if (tempname != (char *)de->name) kfree(tempname); 165 if (tempname != de->name) kfree(tempname);
166 hpfs_brelse4(&qbh); 166 hpfs_brelse4(&qbh);
167 } 167 }
168out: 168out:
@@ -187,7 +187,7 @@ out:
187 187
188struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 188struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
189{ 189{
190 const char *name = dentry->d_name.name; 190 const unsigned char *name = dentry->d_name.name;
191 unsigned len = dentry->d_name.len; 191 unsigned len = dentry->d_name.len;
192 struct quad_buffer_head qbh; 192 struct quad_buffer_head qbh;
193 struct hpfs_dirent *de; 193 struct hpfs_dirent *de;
@@ -197,7 +197,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct hpfs_inode_info *hpfs_result; 197 struct hpfs_inode_info *hpfs_result;
198 198
199 lock_kernel(); 199 lock_kernel();
200 if ((err = hpfs_chk_name((char *)name, &len))) { 200 if ((err = hpfs_chk_name(name, &len))) {
201 if (err == -ENAMETOOLONG) { 201 if (err == -ENAMETOOLONG) {
202 unlock_kernel(); 202 unlock_kernel();
203 return ERR_PTR(-ENAMETOOLONG); 203 return ERR_PTR(-ENAMETOOLONG);
@@ -209,7 +209,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
209 * '.' and '..' will never be passed here. 209 * '.' and '..' will never be passed here.
210 */ 210 */
211 211
212 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *) name, len, NULL, &qbh); 212 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh);
213 213
214 /* 214 /*
215 * This is not really a bailout, just means file not found. 215 * This is not really a bailout, just means file not found.
@@ -250,7 +250,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
250 hpfs_result = hpfs_i(result); 250 hpfs_result = hpfs_i(result);
251 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; 251 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino;
252 252
253 hpfs_decide_conv(result, (char *)name, len); 253 hpfs_decide_conv(result, name, len);
254 254
255 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { 255 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) {
256 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); 256 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures");
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index fe83c2b7d2d8..9b2ffadfc8c4 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -158,7 +158,8 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
158 158
159/* Add an entry to dnode and don't care if it grows over 2048 bytes */ 159/* Add an entry to dnode and don't care if it grows over 2048 bytes */
160 160
161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, unsigned char *name, 161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
162 const unsigned char *name,
162 unsigned namelen, secno down_ptr) 163 unsigned namelen, secno down_ptr)
163{ 164{
164 struct hpfs_dirent *de; 165 struct hpfs_dirent *de;
@@ -223,7 +224,7 @@ static void fix_up_ptrs(struct super_block *s, struct dnode *d)
223/* Add an entry to dnode and do dnode splitting if required */ 224/* Add an entry to dnode and do dnode splitting if required */
224 225
225static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, 226static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
226 unsigned char *name, unsigned namelen, 227 const unsigned char *name, unsigned namelen,
227 struct hpfs_dirent *new_de, dnode_secno down_ptr) 228 struct hpfs_dirent *new_de, dnode_secno down_ptr)
228{ 229{
229 struct quad_buffer_head qbh, qbh1, qbh2; 230 struct quad_buffer_head qbh, qbh1, qbh2;
@@ -231,7 +232,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
231 dnode_secno adno, rdno; 232 dnode_secno adno, rdno;
232 struct hpfs_dirent *de; 233 struct hpfs_dirent *de;
233 struct hpfs_dirent nde; 234 struct hpfs_dirent nde;
234 char *nname; 235 unsigned char *nname;
235 int h; 236 int h;
236 int pos; 237 int pos;
237 struct buffer_head *bh; 238 struct buffer_head *bh;
@@ -305,7 +306,9 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
305 pos++; 306 pos++;
306 } 307 }
307 copy_de(new_de = &nde, de); 308 copy_de(new_de = &nde, de);
308 memcpy(name = nname, de->name, namelen = de->namelen); 309 memcpy(nname, de->name, de->namelen);
310 name = nname;
311 namelen = de->namelen;
309 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); 312 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4);
310 down_ptr = adno; 313 down_ptr = adno;
311 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); 314 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
@@ -368,7 +371,8 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
368 * I hope, now it's finally bug-free. 371 * I hope, now it's finally bug-free.
369 */ 372 */
370 373
371int hpfs_add_dirent(struct inode *i, unsigned char *name, unsigned namelen, 374int hpfs_add_dirent(struct inode *i,
375 const unsigned char *name, unsigned namelen,
372 struct hpfs_dirent *new_de, int cdepth) 376 struct hpfs_dirent *new_de, int cdepth)
373{ 377{
374 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 378 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
@@ -897,7 +901,8 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
897 901
898/* Find a dirent in tree */ 902/* Find a dirent in tree */
899 903
900struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, char *name, unsigned len, 904struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno,
905 const unsigned char *name, unsigned len,
901 dnode_secno *dd, struct quad_buffer_head *qbh) 906 dnode_secno *dd, struct quad_buffer_head *qbh)
902{ 907{
903 struct dnode *dnode; 908 struct dnode *dnode;
@@ -988,8 +993,8 @@ void hpfs_remove_dtree(struct super_block *s, dnode_secno dno)
988struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, 993struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
989 struct fnode *f, struct quad_buffer_head *qbh) 994 struct fnode *f, struct quad_buffer_head *qbh)
990{ 995{
991 char *name1; 996 unsigned char *name1;
992 char *name2; 997 unsigned char *name2;
993 int name1len, name2len; 998 int name1len, name2len;
994 struct dnode *d; 999 struct dnode *d;
995 dnode_secno dno, downd; 1000 dnode_secno dno, downd;
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index 547a8384571f..45e53d972b42 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -62,8 +62,8 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size)
62 return ret; 62 return ret;
63} 63}
64 64
65static void set_indirect_ea(struct super_block *s, int ano, secno a, char *data, 65static void set_indirect_ea(struct super_block *s, int ano, secno a,
66 int size) 66 const char *data, int size)
67{ 67{
68 hpfs_ea_write(s, a, ano, 0, size, data); 68 hpfs_ea_write(s, a, ano, 0, size, data);
69} 69}
@@ -186,7 +186,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
186 * This driver can't change sizes of eas ('cause I just don't need it). 186 * This driver can't change sizes of eas ('cause I just don't need it).
187 */ 187 */
188 188
189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, char *key, char *data, int size) 189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
190 const char *data, int size)
190{ 191{
191 fnode_secno fno = inode->i_ino; 192 fnode_secno fno = inode->i_ino;
192 struct super_block *s = inode->i_sb; 193 struct super_block *s = inode->i_sb;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 701ca54c0867..97bf738cd5d6 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -215,7 +215,7 @@ secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_heade
215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned); 215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned);
216void hpfs_remove_btree(struct super_block *, struct bplus_header *); 216void hpfs_remove_btree(struct super_block *, struct bplus_header *);
217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *); 217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *);
218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, char *); 218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *);
219void hpfs_ea_remove(struct super_block *, secno, int, unsigned); 219void hpfs_ea_remove(struct super_block *, secno, int, unsigned);
220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned); 220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned);
221void hpfs_remove_fnode(struct super_block *, fnode_secno fno); 221void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
@@ -244,13 +244,17 @@ extern const struct file_operations hpfs_dir_ops;
244 244
245void hpfs_add_pos(struct inode *, loff_t *); 245void hpfs_add_pos(struct inode *, loff_t *);
246void hpfs_del_pos(struct inode *, loff_t *); 246void hpfs_del_pos(struct inode *, loff_t *);
247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, unsigned char *, unsigned, secno); 247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *,
248int hpfs_add_dirent(struct inode *, unsigned char *, unsigned, struct hpfs_dirent *, int); 248 const unsigned char *, unsigned, secno);
249int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned,
250 struct hpfs_dirent *, int);
249int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); 251int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int);
250void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); 252void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *);
251dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); 253dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno);
252struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *); 254struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *);
253struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, char *, unsigned, dnode_secno *, struct quad_buffer_head *); 255struct hpfs_dirent *map_dirent(struct inode *, dnode_secno,
256 const unsigned char *, unsigned, dnode_secno *,
257 struct quad_buffer_head *);
254void hpfs_remove_dtree(struct super_block *, dnode_secno); 258void hpfs_remove_dtree(struct super_block *, dnode_secno);
255struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *); 259struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *);
256 260
@@ -259,7 +263,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct f
259void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned); 263void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned);
260int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int); 264int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int);
261char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *); 265char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *);
262void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int); 266void hpfs_set_ea(struct inode *, struct fnode *, const char *,
267 const char *, int);
263 268
264/* file.c */ 269/* file.c */
265 270
@@ -282,7 +287,7 @@ void hpfs_delete_inode(struct inode *);
282 287
283unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); 288unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
284unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); 289unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
285char *hpfs_load_code_page(struct super_block *, secno); 290unsigned char *hpfs_load_code_page(struct super_block *, secno);
286secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); 291secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
287struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); 292struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
288struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); 293struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
@@ -292,12 +297,13 @@ dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino);
292/* name.c */ 297/* name.c */
293 298
294unsigned char hpfs_upcase(unsigned char *, unsigned char); 299unsigned char hpfs_upcase(unsigned char *, unsigned char);
295int hpfs_chk_name(unsigned char *, unsigned *); 300int hpfs_chk_name(const unsigned char *, unsigned *);
296char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); 301unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
297int hpfs_compare_names(struct super_block *, unsigned char *, unsigned, unsigned char *, unsigned, int); 302int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned,
298int hpfs_is_name_long(unsigned char *, unsigned); 303 const unsigned char *, unsigned, int);
299void hpfs_adjust_length(unsigned char *, unsigned *); 304int hpfs_is_name_long(const unsigned char *, unsigned);
300void hpfs_decide_conv(struct inode *, unsigned char *, unsigned); 305void hpfs_adjust_length(const unsigned char *, unsigned *);
306void hpfs_decide_conv(struct inode *, const unsigned char *, unsigned);
301 307
302/* namei.c */ 308/* namei.c */
303 309
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index fe703ae46bc7..ff90affb94e1 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -46,7 +46,7 @@ void hpfs_read_inode(struct inode *i)
46 struct fnode *fnode; 46 struct fnode *fnode;
47 struct super_block *sb = i->i_sb; 47 struct super_block *sb = i->i_sb;
48 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 48 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
49 unsigned char *ea; 49 void *ea;
50 int ea_size; 50 int ea_size;
51 51
52 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) { 52 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) {
@@ -112,7 +112,7 @@ void hpfs_read_inode(struct inode *i)
112 } 112 }
113 } 113 }
114 if (fnode->dirflag) { 114 if (fnode->dirflag) {
115 unsigned n_dnodes, n_subdirs; 115 int n_dnodes, n_subdirs;
116 i->i_mode |= S_IFDIR; 116 i->i_mode |= S_IFDIR;
117 i->i_op = &hpfs_dir_iops; 117 i->i_op = &hpfs_dir_iops;
118 i->i_fop = &hpfs_dir_ops; 118 i->i_fop = &hpfs_dir_ops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index c4724589b2eb..840d033ecee8 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -35,7 +35,7 @@ unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
35 * lowercasing table 35 * lowercasing table
36 */ 36 */
37 37
38char *hpfs_load_code_page(struct super_block *s, secno cps) 38unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
39{ 39{
40 struct buffer_head *bh; 40 struct buffer_head *bh;
41 secno cpds; 41 secno cpds;
@@ -71,7 +71,7 @@ char *hpfs_load_code_page(struct super_block *s, secno cps)
71 brelse(bh); 71 brelse(bh);
72 return NULL; 72 return NULL;
73 } 73 }
74 ptr = (char *)cpd + cpd->offs[cpi] + 6; 74 ptr = (unsigned char *)cpd + cpd->offs[cpi] + 6;
75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) { 75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) {
76 printk("HPFS: out of memory for code page table\n"); 76 printk("HPFS: out of memory for code page table\n");
77 brelse(bh); 77 brelse(bh);
@@ -217,7 +217,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) 217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD)))
218 if (hpfs_sb(s)->sb_chk) { 218 if (hpfs_sb(s)->sb_chk) {
219 unsigned p, pp = 0; 219 unsigned p, pp = 0;
220 unsigned char *d = (char *)dnode; 220 unsigned char *d = (unsigned char *)dnode;
221 int b = 0; 221 int b = 0;
222 if (dnode->magic != DNODE_MAGIC) { 222 if (dnode->magic != DNODE_MAGIC) {
223 hpfs_error(s, "bad magic on dnode %08x", secno); 223 hpfs_error(s, "bad magic on dnode %08x", secno);
diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c
index 1f4a964384eb..f24736d7a439 100644
--- a/fs/hpfs/name.c
+++ b/fs/hpfs/name.c
@@ -8,16 +8,16 @@
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10 10
11static char *text_postfix[]={ 11static const char *text_postfix[]={
12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF", 12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF",
13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS", 13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS",
14".RC", ".TEX", ".TXT", ".Y", ""}; 14".RC", ".TEX", ".TXT", ".Y", ""};
15 15
16static char *text_prefix[]={ 16static const char *text_prefix[]={
17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ", 17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ",
18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""}; 18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""};
19 19
20void hpfs_decide_conv(struct inode *inode, unsigned char *name, unsigned len) 20void hpfs_decide_conv(struct inode *inode, const unsigned char *name, unsigned len)
21{ 21{
22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
23 int i; 23 int i;
@@ -71,7 +71,7 @@ static inline unsigned char locase(unsigned char *dir, unsigned char a)
71 return dir[a]; 71 return dir[a];
72} 72}
73 73
74int hpfs_chk_name(unsigned char *name, unsigned *len) 74int hpfs_chk_name(const unsigned char *name, unsigned *len)
75{ 75{
76 int i; 76 int i;
77 if (*len > 254) return -ENAMETOOLONG; 77 if (*len > 254) return -ENAMETOOLONG;
@@ -83,10 +83,10 @@ int hpfs_chk_name(unsigned char *name, unsigned *len)
83 return 0; 83 return 0;
84} 84}
85 85
86char *hpfs_translate_name(struct super_block *s, unsigned char *from, 86unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from,
87 unsigned len, int lc, int lng) 87 unsigned len, int lc, int lng)
88{ 88{
89 char *to; 89 unsigned char *to;
90 int i; 90 int i;
91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { 91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) {
92 printk("HPFS: Long name flag mismatch - name "); 92 printk("HPFS: Long name flag mismatch - name ");
@@ -103,8 +103,9 @@ char *hpfs_translate_name(struct super_block *s, unsigned char *from,
103 return to; 103 return to;
104} 104}
105 105
106int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1, 106int hpfs_compare_names(struct super_block *s,
107 unsigned char *n2, unsigned l2, int last) 107 const unsigned char *n1, unsigned l1,
108 const unsigned char *n2, unsigned l2, int last)
108{ 109{
109 unsigned l = l1 < l2 ? l1 : l2; 110 unsigned l = l1 < l2 ? l1 : l2;
110 unsigned i; 111 unsigned i;
@@ -120,7 +121,7 @@ int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1,
120 return 0; 121 return 0;
121} 122}
122 123
123int hpfs_is_name_long(unsigned char *name, unsigned len) 124int hpfs_is_name_long(const unsigned char *name, unsigned len)
124{ 125{
125 int i,j; 126 int i,j;
126 for (i = 0; i < len && name[i] != '.'; i++) 127 for (i = 0; i < len && name[i] != '.'; i++)
@@ -134,7 +135,7 @@ int hpfs_is_name_long(unsigned char *name, unsigned len)
134 135
135/* OS/2 clears dots and spaces at the end of file name, so we have to */ 136/* OS/2 clears dots and spaces at the end of file name, so we have to */
136 137
137void hpfs_adjust_length(unsigned char *name, unsigned *len) 138void hpfs_adjust_length(const unsigned char *name, unsigned *len)
138{ 139{
139 if (!*len) return; 140 if (!*len) return;
140 if (*len == 1 && name[0] == '.') return; 141 if (*len == 1 && name[0] == '.') return;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 82b9c4ba9ed0..11c2b4080f65 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
11 11
12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
13{ 13{
14 const char *name = dentry->d_name.name; 14 const unsigned char *name = dentry->d_name.name;
15 unsigned len = dentry->d_name.len; 15 unsigned len = dentry->d_name.len;
16 struct quad_buffer_head qbh0; 16 struct quad_buffer_head qbh0;
17 struct buffer_head *bh; 17 struct buffer_head *bh;
@@ -24,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
24 int r; 24 int r;
25 struct hpfs_dirent dee; 25 struct hpfs_dirent dee;
26 int err; 26 int err;
27 if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; 27 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
28 lock_kernel(); 28 lock_kernel();
29 err = -ENOSPC; 29 err = -ENOSPC;
30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -62,7 +62,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
62 result->i_mode &= ~0222; 62 result->i_mode &= ~0222;
63 63
64 mutex_lock(&hpfs_i(dir)->i_mutex); 64 mutex_lock(&hpfs_i(dir)->i_mutex);
65 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 65 r = hpfs_add_dirent(dir, name, len, &dee, 0);
66 if (r == 1) 66 if (r == 1)
67 goto bail3; 67 goto bail3;
68 if (r == -1) { 68 if (r == -1) {
@@ -121,7 +121,7 @@ bail:
121 121
122static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) 122static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
123{ 123{
124 const char *name = dentry->d_name.name; 124 const unsigned char *name = dentry->d_name.name;
125 unsigned len = dentry->d_name.len; 125 unsigned len = dentry->d_name.len;
126 struct inode *result = NULL; 126 struct inode *result = NULL;
127 struct buffer_head *bh; 127 struct buffer_head *bh;
@@ -130,7 +130,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
130 int r; 130 int r;
131 struct hpfs_dirent dee; 131 struct hpfs_dirent dee;
132 int err; 132 int err;
133 if ((err = hpfs_chk_name((char *)name, &len))) 133 if ((err = hpfs_chk_name(name, &len)))
134 return err==-ENOENT ? -EINVAL : err; 134 return err==-ENOENT ? -EINVAL : err;
135 lock_kernel(); 135 lock_kernel();
136 err = -ENOSPC; 136 err = -ENOSPC;
@@ -155,7 +155,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
155 result->i_op = &hpfs_file_iops; 155 result->i_op = &hpfs_file_iops;
156 result->i_fop = &hpfs_file_ops; 156 result->i_fop = &hpfs_file_ops;
157 result->i_nlink = 1; 157 result->i_nlink = 1;
158 hpfs_decide_conv(result, (char *)name, len); 158 hpfs_decide_conv(result, name, len);
159 hpfs_i(result)->i_parent_dir = dir->i_ino; 159 hpfs_i(result)->i_parent_dir = dir->i_ino;
160 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date); 160 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date);
161 result->i_ctime.tv_nsec = 0; 161 result->i_ctime.tv_nsec = 0;
@@ -170,7 +170,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
170 hpfs_i(result)->mmu_private = 0; 170 hpfs_i(result)->mmu_private = 0;
171 171
172 mutex_lock(&hpfs_i(dir)->i_mutex); 172 mutex_lock(&hpfs_i(dir)->i_mutex);
173 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 173 r = hpfs_add_dirent(dir, name, len, &dee, 0);
174 if (r == 1) 174 if (r == 1)
175 goto bail2; 175 goto bail2;
176 if (r == -1) { 176 if (r == -1) {
@@ -211,7 +211,7 @@ bail:
211 211
212static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) 212static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
213{ 213{
214 const char *name = dentry->d_name.name; 214 const unsigned char *name = dentry->d_name.name;
215 unsigned len = dentry->d_name.len; 215 unsigned len = dentry->d_name.len;
216 struct buffer_head *bh; 216 struct buffer_head *bh;
217 struct fnode *fnode; 217 struct fnode *fnode;
@@ -220,7 +220,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
220 struct hpfs_dirent dee; 220 struct hpfs_dirent dee;
221 struct inode *result = NULL; 221 struct inode *result = NULL;
222 int err; 222 int err;
223 if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; 223 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; 224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
225 if (!new_valid_dev(rdev)) 225 if (!new_valid_dev(rdev))
226 return -EINVAL; 226 return -EINVAL;
@@ -256,7 +256,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
256 init_special_inode(result, mode, rdev); 256 init_special_inode(result, mode, rdev);
257 257
258 mutex_lock(&hpfs_i(dir)->i_mutex); 258 mutex_lock(&hpfs_i(dir)->i_mutex);
259 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 259 r = hpfs_add_dirent(dir, name, len, &dee, 0);
260 if (r == 1) 260 if (r == 1)
261 goto bail2; 261 goto bail2;
262 if (r == -1) { 262 if (r == -1) {
@@ -289,7 +289,7 @@ bail:
289 289
290static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) 290static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
291{ 291{
292 const char *name = dentry->d_name.name; 292 const unsigned char *name = dentry->d_name.name;
293 unsigned len = dentry->d_name.len; 293 unsigned len = dentry->d_name.len;
294 struct buffer_head *bh; 294 struct buffer_head *bh;
295 struct fnode *fnode; 295 struct fnode *fnode;
@@ -298,7 +298,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
298 struct hpfs_dirent dee; 298 struct hpfs_dirent dee;
299 struct inode *result; 299 struct inode *result;
300 int err; 300 int err;
301 if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; 301 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
302 lock_kernel(); 302 lock_kernel();
303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) { 303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
304 unlock_kernel(); 304 unlock_kernel();
@@ -335,7 +335,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
335 result->i_data.a_ops = &hpfs_symlink_aops; 335 result->i_data.a_ops = &hpfs_symlink_aops;
336 336
337 mutex_lock(&hpfs_i(dir)->i_mutex); 337 mutex_lock(&hpfs_i(dir)->i_mutex);
338 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 338 r = hpfs_add_dirent(dir, name, len, &dee, 0);
339 if (r == 1) 339 if (r == 1)
340 goto bail2; 340 goto bail2;
341 if (r == -1) { 341 if (r == -1) {
@@ -345,7 +345,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
345 fnode->len = len; 345 fnode->len = len;
346 memcpy(fnode->name, name, len > 15 ? 15 : len); 346 memcpy(fnode->name, name, len > 15 ? 15 : len);
347 fnode->up = dir->i_ino; 347 fnode->up = dir->i_ino;
348 hpfs_set_ea(result, fnode, "SYMLINK", (char *)symlink, strlen(symlink)); 348 hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink));
349 mark_buffer_dirty(bh); 349 mark_buffer_dirty(bh);
350 brelse(bh); 350 brelse(bh);
351 351
@@ -369,7 +369,7 @@ bail:
369 369
370static int hpfs_unlink(struct inode *dir, struct dentry *dentry) 370static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
371{ 371{
372 const char *name = dentry->d_name.name; 372 const unsigned char *name = dentry->d_name.name;
373 unsigned len = dentry->d_name.len; 373 unsigned len = dentry->d_name.len;
374 struct quad_buffer_head qbh; 374 struct quad_buffer_head qbh;
375 struct hpfs_dirent *de; 375 struct hpfs_dirent *de;
@@ -381,12 +381,12 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
381 int err; 381 int err;
382 382
383 lock_kernel(); 383 lock_kernel();
384 hpfs_adjust_length((char *)name, &len); 384 hpfs_adjust_length(name, &len);
385again: 385again:
386 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 386 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
387 mutex_lock(&hpfs_i(dir)->i_mutex); 387 mutex_lock(&hpfs_i(dir)->i_mutex);
388 err = -ENOENT; 388 err = -ENOENT;
389 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh); 389 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
390 if (!de) 390 if (!de)
391 goto out; 391 goto out;
392 392
@@ -413,22 +413,25 @@ again:
413 413
414 mutex_unlock(&hpfs_i(dir)->i_mutex); 414 mutex_unlock(&hpfs_i(dir)->i_mutex);
415 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 415 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
416 d_drop(dentry); 416 dentry_unhash(dentry);
417 spin_lock(&dentry->d_lock); 417 if (!d_unhashed(dentry)) {
418 if (atomic_read(&dentry->d_count) > 1 || 418 dput(dentry);
419 generic_permission(inode, MAY_WRITE, NULL) || 419 unlock_kernel();
420 return -ENOSPC;
421 }
422 if (generic_permission(inode, MAY_WRITE, NULL) ||
420 !S_ISREG(inode->i_mode) || 423 !S_ISREG(inode->i_mode) ||
421 get_write_access(inode)) { 424 get_write_access(inode)) {
422 spin_unlock(&dentry->d_lock);
423 d_rehash(dentry); 425 d_rehash(dentry);
426 dput(dentry);
424 } else { 427 } else {
425 struct iattr newattrs; 428 struct iattr newattrs;
426 spin_unlock(&dentry->d_lock);
427 /*printk("HPFS: truncating file before delete.\n");*/ 429 /*printk("HPFS: truncating file before delete.\n");*/
428 newattrs.ia_size = 0; 430 newattrs.ia_size = 0;
429 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; 431 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
430 err = notify_change(dentry, &newattrs); 432 err = notify_change(dentry, &newattrs);
431 put_write_access(inode); 433 put_write_access(inode);
434 dput(dentry);
432 if (!err) 435 if (!err)
433 goto again; 436 goto again;
434 } 437 }
@@ -451,7 +454,7 @@ out:
451 454
452static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) 455static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
453{ 456{
454 const char *name = dentry->d_name.name; 457 const unsigned char *name = dentry->d_name.name;
455 unsigned len = dentry->d_name.len; 458 unsigned len = dentry->d_name.len;
456 struct quad_buffer_head qbh; 459 struct quad_buffer_head qbh;
457 struct hpfs_dirent *de; 460 struct hpfs_dirent *de;
@@ -462,12 +465,12 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
462 int err; 465 int err;
463 int r; 466 int r;
464 467
465 hpfs_adjust_length((char *)name, &len); 468 hpfs_adjust_length(name, &len);
466 lock_kernel(); 469 lock_kernel();
467 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 470 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
468 mutex_lock(&hpfs_i(dir)->i_mutex); 471 mutex_lock(&hpfs_i(dir)->i_mutex);
469 err = -ENOENT; 472 err = -ENOENT;
470 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh); 473 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
471 if (!de) 474 if (!de)
472 goto out; 475 goto out;
473 476
@@ -546,10 +549,10 @@ const struct address_space_operations hpfs_symlink_aops = {
546static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, 549static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
547 struct inode *new_dir, struct dentry *new_dentry) 550 struct inode *new_dir, struct dentry *new_dentry)
548{ 551{
549 char *old_name = (char *)old_dentry->d_name.name; 552 const unsigned char *old_name = old_dentry->d_name.name;
550 int old_len = old_dentry->d_name.len; 553 unsigned old_len = old_dentry->d_name.len;
551 char *new_name = (char *)new_dentry->d_name.name; 554 const unsigned char *new_name = new_dentry->d_name.name;
552 int new_len = new_dentry->d_name.len; 555 unsigned new_len = new_dentry->d_name.len;
553 struct inode *i = old_dentry->d_inode; 556 struct inode *i = old_dentry->d_inode;
554 struct inode *new_inode = new_dentry->d_inode; 557 struct inode *new_inode = new_dentry->d_inode;
555 struct quad_buffer_head qbh, qbh1; 558 struct quad_buffer_head qbh, qbh1;
@@ -560,9 +563,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
560 struct buffer_head *bh; 563 struct buffer_head *bh;
561 struct fnode *fnode; 564 struct fnode *fnode;
562 int err; 565 int err;
563 if ((err = hpfs_chk_name((char *)new_name, &new_len))) return err; 566 if ((err = hpfs_chk_name(new_name, &new_len))) return err;
564 err = 0; 567 err = 0;
565 hpfs_adjust_length((char *)old_name, &old_len); 568 hpfs_adjust_length(old_name, &old_len);
566 569
567 lock_kernel(); 570 lock_kernel();
568 /* order doesn't matter, due to VFS exclusion */ 571 /* order doesn't matter, due to VFS exclusion */
@@ -579,7 +582,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
579 goto end1; 582 goto end1;
580 } 583 }
581 584
582 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) { 585 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
583 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed"); 586 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed");
584 err = -ENOENT; 587 err = -ENOENT;
585 goto end1; 588 goto end1;
@@ -590,7 +593,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
590 if (new_inode) { 593 if (new_inode) {
591 int r; 594 int r;
592 if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) { 595 if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) {
593 if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, (char *)new_name, new_len, NULL, &qbh1))) { 596 if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) {
594 clear_nlink(new_inode); 597 clear_nlink(new_inode);
595 copy_de(nde, &de); 598 copy_de(nde, &de);
596 memcpy(nde->name, new_name, new_len); 599 memcpy(nde->name, new_name, new_len);
@@ -618,7 +621,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
618 } 621 }
619 622
620 if (new_dir == old_dir) 623 if (new_dir == old_dir)
621 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) { 624 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
622 hpfs_unlock_creation(i->i_sb); 625 hpfs_unlock_creation(i->i_sb);
623 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2"); 626 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2");
624 err = -ENOENT; 627 err = -ENOENT;
@@ -648,7 +651,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
648 brelse(bh); 651 brelse(bh);
649 } 652 }
650 hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv; 653 hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv;
651 hpfs_decide_conv(i, (char *)new_name, new_len); 654 hpfs_decide_conv(i, new_name, new_len);
652end1: 655end1:
653 if (old_dir != new_dir) 656 if (old_dir != new_dir)
654 mutex_unlock(&hpfs_i(new_dir)->i_mutex); 657 mutex_unlock(&hpfs_i(new_dir)->i_mutex);
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7239efc690d8..2e4dfa8593da 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -718,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
718 struct vfsmount *proc_mnt; 718 struct vfsmount *proc_mnt;
719 int err = -ENOENT; 719 int err = -ENOENT;
720 720
721 proc_mnt = do_kern_mount("proc", 0, "proc", NULL); 721 proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
722 if (IS_ERR(proc_mnt)) 722 if (IS_ERR(proc_mnt))
723 goto out; 723 goto out;
724 724
diff --git a/fs/internal.h b/fs/internal.h
index e96a1667d749..8a03a5447bdf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
70 70
71extern void __init mnt_init(void); 71extern void __init mnt_init(void);
72 72
73extern spinlock_t vfsmount_lock;
74
73/* 75/*
74 * fs_struct.c 76 * fs_struct.c
75 */ 77 */
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 886849370950..30beb11ef928 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
507 if (blocknr < journal->j_tail) 507 if (blocknr < journal->j_tail)
508 freed = freed + journal->j_last - journal->j_first; 508 freed = freed + journal->j_last - journal->j_first;
509 509
510 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
510 jbd_debug(1, 511 jbd_debug(1,
511 "Cleaning journal tail from %d to %d (offset %lu), " 512 "Cleaning journal tail from %d to %d (offset %lu), "
512 "freeing %lu\n", 513 "freeing %lu\n",
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1bc74b6f26d2..671da7fb7ffd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -883,8 +883,7 @@ restart_loop:
883 spin_unlock(&journal->j_list_lock); 883 spin_unlock(&journal->j_list_lock);
884 bh = jh2bh(jh); 884 bh = jh2bh(jh);
885 jbd_lock_bh_state(bh); 885 jbd_lock_bh_state(bh);
886 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 886 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
887 jh->b_transaction == journal->j_running_transaction);
888 887
889 /* 888 /*
890 * If there is undo-protected committed data against 889 * If there is undo-protected committed data against
@@ -930,12 +929,12 @@ restart_loop:
930 /* A buffer which has been freed while still being 929 /* A buffer which has been freed while still being
931 * journaled by a previous transaction may end up still 930 * journaled by a previous transaction may end up still
932 * being dirty here, but we want to avoid writing back 931 * being dirty here, but we want to avoid writing back
933 * that buffer in the future now that the last use has 932 * that buffer in the future after the "add to orphan"
934 * been committed. That's not only a performance gain, 933 * operation been committed, That's not only a performance
935 * it also stops aliasing problems if the buffer is left 934 * gain, it also stops aliasing problems if the buffer is
936 * behind for writeback and gets reallocated for another 935 * left behind for writeback and gets reallocated for another
937 * use in a different page. */ 936 * use in a different page. */
938 if (buffer_freed(bh)) { 937 if (buffer_freed(bh) && !jh->b_next_transaction) {
939 clear_buffer_freed(bh); 938 clear_buffer_freed(bh);
940 clear_buffer_jbddirty(bh); 939 clear_buffer_jbddirty(bh);
941 } 940 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ac0d027595d0..c03d4dce4d76 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h> 40#include <linux/math64.h>
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h>
43#include <linux/vmalloc.h>
42 44
43#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
44#include <trace/events/jbd2.h> 46#include <trace/events/jbd2.h>
@@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
93 95
94static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 96static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
95static void __journal_abort_soft (journal_t *journal, int errno); 97static void __journal_abort_soft (journal_t *journal, int errno);
98static int jbd2_journal_create_slab(size_t slab_size);
96 99
97/* 100/*
98 * Helper function used to manage commit timeouts 101 * Helper function used to manage commit timeouts
@@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal)
1248 } 1251 }
1249 } 1252 }
1250 1253
1254 /*
1255 * Create a slab for this blocksize
1256 */
1257 err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
1258 if (err)
1259 return err;
1260
1251 /* Let the recovery code check whether it needs to recover any 1261 /* Let the recovery code check whether it needs to recover any
1252 * data from the journal. */ 1262 * data from the journal. */
1253 if (jbd2_journal_recover(journal)) 1263 if (jbd2_journal_recover(journal))
@@ -1807,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal)
1807} 1817}
1808 1818
1809/* 1819/*
1820 * JBD memory management
1821 *
1822 * These functions are used to allocate block-sized chunks of memory
1823 * used for making copies of buffer_head data. Very often it will be
1824 * page-sized chunks of data, but sometimes it will be in
1825 * sub-page-size chunks. (For example, 16k pages on Power systems
1826 * with a 4k block file system.) For blocks smaller than a page, we
1827 * use a SLAB allocator. There are slab caches for each block size,
1828 * which are allocated at mount time, if necessary, and we only free
1829 * (all of) the slab caches when/if the jbd2 module is unloaded. For
1830 * this reason we don't need to a mutex to protect access to
1831 * jbd2_slab[] allocating or releasing memory; only in
1832 * jbd2_journal_create_slab().
1833 */
1834#define JBD2_MAX_SLABS 8
1835static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1836static DECLARE_MUTEX(jbd2_slab_create_sem);
1837
1838static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1839 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
1840 "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
1841};
1842
1843
1844static void jbd2_journal_destroy_slabs(void)
1845{
1846 int i;
1847
1848 for (i = 0; i < JBD2_MAX_SLABS; i++) {
1849 if (jbd2_slab[i])
1850 kmem_cache_destroy(jbd2_slab[i]);
1851 jbd2_slab[i] = NULL;
1852 }
1853}
1854
1855static int jbd2_journal_create_slab(size_t size)
1856{
1857 int i = order_base_2(size) - 10;
1858 size_t slab_size;
1859
1860 if (size == PAGE_SIZE)
1861 return 0;
1862
1863 if (i >= JBD2_MAX_SLABS)
1864 return -EINVAL;
1865
1866 if (unlikely(i < 0))
1867 i = 0;
1868 down(&jbd2_slab_create_sem);
1869 if (jbd2_slab[i]) {
1870 up(&jbd2_slab_create_sem);
1871 return 0; /* Already created */
1872 }
1873
1874 slab_size = 1 << (i+10);
1875 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1876 slab_size, 0, NULL);
1877 up(&jbd2_slab_create_sem);
1878 if (!jbd2_slab[i]) {
1879 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1880 return -ENOMEM;
1881 }
1882 return 0;
1883}
1884
1885static struct kmem_cache *get_slab(size_t size)
1886{
1887 int i = order_base_2(size) - 10;
1888
1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0))
1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0);
1893 return jbd2_slab[i];
1894}
1895
1896void *jbd2_alloc(size_t size, gfp_t flags)
1897{
1898 void *ptr;
1899
1900 BUG_ON(size & (size-1)); /* Must be a power of 2 */
1901
1902 flags |= __GFP_REPEAT;
1903 if (size == PAGE_SIZE)
1904 ptr = (void *)__get_free_pages(flags, 0);
1905 else if (size > PAGE_SIZE) {
1906 int order = get_order(size);
1907
1908 if (order < 3)
1909 ptr = (void *)__get_free_pages(flags, order);
1910 else
1911 ptr = vmalloc(size);
1912 } else
1913 ptr = kmem_cache_alloc(get_slab(size), flags);
1914
1915 /* Check alignment; SLUB has gotten this wrong in the past,
1916 * and this can lead to user data corruption! */
1917 BUG_ON(((unsigned long) ptr) & (size-1));
1918
1919 return ptr;
1920}
1921
1922void jbd2_free(void *ptr, size_t size)
1923{
1924 if (size == PAGE_SIZE) {
1925 free_pages((unsigned long)ptr, 0);
1926 return;
1927 }
1928 if (size > PAGE_SIZE) {
1929 int order = get_order(size);
1930
1931 if (order < 3)
1932 free_pages((unsigned long)ptr, order);
1933 else
1934 vfree(ptr);
1935 return;
1936 }
1937 kmem_cache_free(get_slab(size), ptr);
1938};
1939
1940/*
1810 * Journal_head storage management 1941 * Journal_head storage management
1811 */ 1942 */
1812static struct kmem_cache *jbd2_journal_head_cache; 1943static struct kmem_cache *jbd2_journal_head_cache;
@@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void)
2204 jbd2_journal_destroy_revoke_caches(); 2335 jbd2_journal_destroy_revoke_caches();
2205 jbd2_journal_destroy_jbd2_journal_head_cache(); 2336 jbd2_journal_destroy_jbd2_journal_head_cache();
2206 jbd2_journal_destroy_handle_cache(); 2337 jbd2_journal_destroy_handle_cache();
2338 jbd2_journal_destroy_slabs();
2207} 2339}
2208 2340
2209static int __init journal_init(void) 2341static int __init journal_init(void)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0512700542f..bfc70f57900f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1727 if (!jh) 1727 if (!jh)
1728 goto zap_buffer_no_jh; 1728 goto zap_buffer_no_jh;
1729 1729
1730 /*
1731 * We cannot remove the buffer from checkpoint lists until the
1732 * transaction adding inode to orphan list (let's call it T)
1733 * is committed. Otherwise if the transaction changing the
1734 * buffer would be cleaned from the journal before T is
1735 * committed, a crash will cause that the correct contents of
1736 * the buffer will be lost. On the other hand we have to
1737 * clear the buffer dirty bit at latest at the moment when the
1738 * transaction marking the buffer as freed in the filesystem
1739 * structures is committed because from that moment on the
1740 * buffer can be reallocated and used by a different page.
1741 * Since the block hasn't been freed yet but the inode has
1742 * already been added to orphan list, it is safe for us to add
1743 * the buffer to BJ_Forget list of the newest transaction.
1744 */
1730 transaction = jh->b_transaction; 1745 transaction = jh->b_transaction;
1731 if (transaction == NULL) { 1746 if (transaction == NULL) {
1732 /* First case: not on any transaction. If it 1747 /* First case: not on any transaction. If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1783 } else if (transaction == journal->j_committing_transaction) { 1798 } else if (transaction == journal->j_committing_transaction) {
1784 JBUFFER_TRACE(jh, "on committing transaction"); 1799 JBUFFER_TRACE(jh, "on committing transaction");
1785 /* 1800 /*
1786 * If it is committing, we simply cannot touch it. We 1801 * The buffer is committing, we simply cannot touch
1787 * can remove it's next_transaction pointer from the 1802 * it. So we just set j_next_transaction to the
1788 * running transaction if that is set, but nothing 1803 * running transaction (if there is one) and mark
1789 * else. */ 1804 * buffer as freed so that commit code knows it should
1805 * clear dirty bits when it is done with the buffer.
1806 */
1790 set_buffer_freed(bh); 1807 set_buffer_freed(bh);
1791 if (jh->b_next_transaction) { 1808 if (journal->j_running_transaction && buffer_jbddirty(bh))
1792 J_ASSERT(jh->b_next_transaction == 1809 jh->b_next_transaction = journal->j_running_transaction;
1793 journal->j_running_transaction);
1794 jh->b_next_transaction = NULL;
1795 }
1796 jbd2_journal_put_journal_head(jh); 1810 jbd2_journal_put_journal_head(jh);
1797 spin_unlock(&journal->j_list_lock); 1811 spin_unlock(&journal->j_list_lock);
1798 jbd_unlock_bh_state(bh); 1812 jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
1969 */ 1983 */
1970void __jbd2_journal_refile_buffer(struct journal_head *jh) 1984void __jbd2_journal_refile_buffer(struct journal_head *jh)
1971{ 1985{
1972 int was_dirty; 1986 int was_dirty, jlist;
1973 struct buffer_head *bh = jh2bh(jh); 1987 struct buffer_head *bh = jh2bh(jh);
1974 1988
1975 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 1989 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
1991 __jbd2_journal_temp_unlink_buffer(jh); 2005 __jbd2_journal_temp_unlink_buffer(jh);
1992 jh->b_transaction = jh->b_next_transaction; 2006 jh->b_transaction = jh->b_next_transaction;
1993 jh->b_next_transaction = NULL; 2007 jh->b_next_transaction = NULL;
1994 __jbd2_journal_file_buffer(jh, jh->b_transaction, 2008 if (buffer_freed(bh))
1995 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2009 jlist = BJ_Forget;
2010 else if (jh->b_modified)
2011 jlist = BJ_Metadata;
2012 else
2013 jlist = BJ_Reserved;
2014 __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
1996 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2015 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
1997 2016
1998 if (was_dirty) 2017 if (was_dirty)
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b2ae190a77ba..182b78cc3e62 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/quotaops.h> 24#include <linux/quotaops.h>
25#include <linux/writeback.h>
25#include "jfs_incore.h" 26#include "jfs_incore.h"
26#include "jfs_inode.h" 27#include "jfs_inode.h"
27#include "jfs_filsys.h" 28#include "jfs_filsys.h"
@@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait)
120 return rc; 121 return rc;
121} 122}
122 123
123int jfs_write_inode(struct inode *inode, int wait) 124int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
124{ 125{
126 int wait = wbc->sync_mode == WB_SYNC_ALL;
127
125 if (test_cflag(COMMIT_Nolink, inode)) 128 if (test_cflag(COMMIT_Nolink, inode))
126 return 0; 129 return 0;
127 /* 130 /*
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1eff7db34d63..15902b03c2a7 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode*, int); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_delete_inode(struct inode *); 30extern void jfs_delete_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
diff --git a/fs/libfs.c b/fs/libfs.c
index 6e8d17e1dc4c..9e50bcf55857 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -338,28 +338,14 @@ int simple_readpage(struct file *file, struct page *page)
338 return 0; 338 return 0;
339} 339}
340 340
341int simple_prepare_write(struct file *file, struct page *page,
342 unsigned from, unsigned to)
343{
344 if (!PageUptodate(page)) {
345 if (to - from != PAGE_CACHE_SIZE)
346 zero_user_segments(page,
347 0, from,
348 to, PAGE_CACHE_SIZE);
349 }
350 return 0;
351}
352
353int simple_write_begin(struct file *file, struct address_space *mapping, 341int simple_write_begin(struct file *file, struct address_space *mapping,
354 loff_t pos, unsigned len, unsigned flags, 342 loff_t pos, unsigned len, unsigned flags,
355 struct page **pagep, void **fsdata) 343 struct page **pagep, void **fsdata)
356{ 344{
357 struct page *page; 345 struct page *page;
358 pgoff_t index; 346 pgoff_t index;
359 unsigned from;
360 347
361 index = pos >> PAGE_CACHE_SHIFT; 348 index = pos >> PAGE_CACHE_SHIFT;
362 from = pos & (PAGE_CACHE_SIZE - 1);
363 349
364 page = grab_cache_page_write_begin(mapping, index, flags); 350 page = grab_cache_page_write_begin(mapping, index, flags);
365 if (!page) 351 if (!page)
@@ -367,43 +353,59 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
367 353
368 *pagep = page; 354 *pagep = page;
369 355
370 return simple_prepare_write(file, page, from, from+len); 356 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
371} 357 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
372
373static int simple_commit_write(struct file *file, struct page *page,
374 unsigned from, unsigned to)
375{
376 struct inode *inode = page->mapping->host;
377 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
378 358
379 if (!PageUptodate(page)) 359 zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
380 SetPageUptodate(page); 360 }
381 /*
382 * No need to use i_size_read() here, the i_size
383 * cannot change under us because we hold the i_mutex.
384 */
385 if (pos > inode->i_size)
386 i_size_write(inode, pos);
387 set_page_dirty(page);
388 return 0; 361 return 0;
389} 362}
390 363
364/**
365 * simple_write_end - .write_end helper for non-block-device FSes
366 * @available: See .write_end of address_space_operations
367 * @file: "
368 * @mapping: "
369 * @pos: "
370 * @len: "
371 * @copied: "
372 * @page: "
373 * @fsdata: "
374 *
375 * simple_write_end does the minimum needed for updating a page after writing is
376 * done. It has the same API signature as the .write_end of
377 * address_space_operations vector. So it can just be set onto .write_end for
378 * FSes that don't need any other processing. i_mutex is assumed to be held.
379 * Block based filesystems should use generic_write_end().
380 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
381 * is not called, so a filesystem that actually does store data in .write_inode
382 * should extend on what's done here with a call to mark_inode_dirty() in the
383 * case that i_size has changed.
384 */
391int simple_write_end(struct file *file, struct address_space *mapping, 385int simple_write_end(struct file *file, struct address_space *mapping,
392 loff_t pos, unsigned len, unsigned copied, 386 loff_t pos, unsigned len, unsigned copied,
393 struct page *page, void *fsdata) 387 struct page *page, void *fsdata)
394{ 388{
395 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 389 struct inode *inode = page->mapping->host;
390 loff_t last_pos = pos + copied;
396 391
397 /* zero the stale part of the page if we did a short copy */ 392 /* zero the stale part of the page if we did a short copy */
398 if (copied < len) { 393 if (copied < len) {
399 void *kaddr = kmap_atomic(page, KM_USER0); 394 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
400 memset(kaddr + from + copied, 0, len - copied); 395
401 flush_dcache_page(page); 396 zero_user(page, from + copied, len - copied);
402 kunmap_atomic(kaddr, KM_USER0);
403 } 397 }
404 398
405 simple_commit_write(file, page, from, from+copied); 399 if (!PageUptodate(page))
400 SetPageUptodate(page);
401 /*
402 * No need to use i_size_read() here, the i_size
403 * cannot change under us because we hold the i_mutex.
404 */
405 if (last_pos > inode->i_size)
406 i_size_write(inode, last_pos);
406 407
408 set_page_dirty(page);
407 unlock_page(page); 409 unlock_page(page);
408 page_cache_release(page); 410 page_cache_release(page);
409 411
@@ -853,7 +855,6 @@ EXPORT_SYMBOL(simple_getattr);
853EXPORT_SYMBOL(simple_link); 855EXPORT_SYMBOL(simple_link);
854EXPORT_SYMBOL(simple_lookup); 856EXPORT_SYMBOL(simple_lookup);
855EXPORT_SYMBOL(simple_pin_fs); 857EXPORT_SYMBOL(simple_pin_fs);
856EXPORT_UNUSED_SYMBOL(simple_prepare_write);
857EXPORT_SYMBOL(simple_readpage); 858EXPORT_SYMBOL(simple_readpage);
858EXPORT_SYMBOL(simple_release_fs); 859EXPORT_SYMBOL(simple_release_fs);
859EXPORT_SYMBOL(simple_rename); 860EXPORT_SYMBOL(simple_rename);
diff --git a/fs/locks.c b/fs/locks.c
index a8794f233bc9..ae9ded026b7c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1182,8 +1182,9 @@ int __break_lease(struct inode *inode, unsigned int mode)
1182 struct file_lock *fl; 1182 struct file_lock *fl;
1183 unsigned long break_time; 1183 unsigned long break_time;
1184 int i_have_this_lease = 0; 1184 int i_have_this_lease = 0;
1185 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1185 1186
1186 new_fl = lease_alloc(NULL, mode & FMODE_WRITE ? F_WRLCK : F_RDLCK); 1187 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
1187 1188
1188 lock_kernel(); 1189 lock_kernel();
1189 1190
@@ -1197,7 +1198,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
1197 if (fl->fl_owner == current->files) 1198 if (fl->fl_owner == current->files)
1198 i_have_this_lease = 1; 1199 i_have_this_lease = 1;
1199 1200
1200 if (mode & FMODE_WRITE) { 1201 if (want_write) {
1201 /* If we want write access, we have to revoke any lease. */ 1202 /* If we want write access, we have to revoke any lease. */
1202 future = F_UNLCK | F_INPROGRESS; 1203 future = F_UNLCK | F_INPROGRESS;
1203 } else if (flock->fl_type & F_INPROGRESS) { 1204 } else if (flock->fl_type & F_INPROGRESS) {
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 74ea82d72164..756f8c93780c 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,8 +17,10 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/highuid.h> 18#include <linux/highuid.h>
19#include <linux/vfs.h> 19#include <linux/vfs.h>
20#include <linux/writeback.h>
20 21
21static int minix_write_inode(struct inode * inode, int wait); 22static int minix_write_inode(struct inode *inode,
23 struct writeback_control *wbc);
22static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); 24static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
23static int minix_remount (struct super_block * sb, int * flags, char * data); 25static int minix_remount (struct super_block * sb, int * flags, char * data);
24 26
@@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
552 return bh; 554 return bh;
553} 555}
554 556
555static int minix_write_inode(struct inode *inode, int wait) 557static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
556{ 558{
557 int err = 0; 559 int err = 0;
558 struct buffer_head *bh; 560 struct buffer_head *bh;
@@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait)
563 bh = V2_minix_update_inode(inode); 565 bh = V2_minix_update_inode(inode);
564 if (!bh) 566 if (!bh)
565 return -EIO; 567 return -EIO;
566 if (wait && buffer_dirty(bh)) { 568 if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
567 sync_dirty_buffer(bh); 569 sync_dirty_buffer(bh);
568 if (buffer_req(bh) && !buffer_uptodate(bh)) { 570 if (buffer_req(bh) && !buffer_uptodate(bh)) {
569 printk("IO error syncing minix inode [%s:%08lx]\n", 571 printk("IO error syncing minix inode [%s:%08lx]\n",
diff --git a/fs/namei.c b/fs/namei.c
index d62fdc875f22..9a6456099f1e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -498,8 +498,6 @@ static int link_path_walk(const char *, struct nameidata *);
498 498
499static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 499static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
500{ 500{
501 int res = 0;
502 char *name;
503 if (IS_ERR(link)) 501 if (IS_ERR(link))
504 goto fail; 502 goto fail;
505 503
@@ -510,22 +508,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
510 path_get(&nd->root); 508 path_get(&nd->root);
511 } 509 }
512 510
513 res = link_path_walk(link, nd); 511 return link_path_walk(link, nd);
514 if (nd->depth || res || nd->last_type!=LAST_NORM)
515 return res;
516 /*
517 * If it is an iterative symlinks resolution in open_namei() we
518 * have to copy the last component. And all that crap because of
519 * bloody create() on broken symlinks. Furrfu...
520 */
521 name = __getname();
522 if (unlikely(!name)) {
523 path_put(&nd->path);
524 return -ENOMEM;
525 }
526 strcpy(name, nd->last.name);
527 nd->last.name = name;
528 return 0;
529fail: 512fail:
530 path_put(&nd->path); 513 path_put(&nd->path);
531 return PTR_ERR(link); 514 return PTR_ERR(link);
@@ -547,10 +530,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
547 nd->path.dentry = path->dentry; 530 nd->path.dentry = path->dentry;
548} 531}
549 532
550static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd) 533static __always_inline int
534__do_follow_link(struct path *path, struct nameidata *nd, void **p)
551{ 535{
552 int error; 536 int error;
553 void *cookie;
554 struct dentry *dentry = path->dentry; 537 struct dentry *dentry = path->dentry;
555 538
556 touch_atime(path->mnt, dentry); 539 touch_atime(path->mnt, dentry);
@@ -562,9 +545,9 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
562 } 545 }
563 mntget(path->mnt); 546 mntget(path->mnt);
564 nd->last_type = LAST_BIND; 547 nd->last_type = LAST_BIND;
565 cookie = dentry->d_inode->i_op->follow_link(dentry, nd); 548 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
566 error = PTR_ERR(cookie); 549 error = PTR_ERR(*p);
567 if (!IS_ERR(cookie)) { 550 if (!IS_ERR(*p)) {
568 char *s = nd_get_link(nd); 551 char *s = nd_get_link(nd);
569 error = 0; 552 error = 0;
570 if (s) 553 if (s)
@@ -574,8 +557,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
574 if (error) 557 if (error)
575 path_put(&nd->path); 558 path_put(&nd->path);
576 } 559 }
577 if (dentry->d_inode->i_op->put_link)
578 dentry->d_inode->i_op->put_link(dentry, nd, cookie);
579 } 560 }
580 return error; 561 return error;
581} 562}
@@ -589,6 +570,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
589 */ 570 */
590static inline int do_follow_link(struct path *path, struct nameidata *nd) 571static inline int do_follow_link(struct path *path, struct nameidata *nd)
591{ 572{
573 void *cookie;
592 int err = -ELOOP; 574 int err = -ELOOP;
593 if (current->link_count >= MAX_NESTED_LINKS) 575 if (current->link_count >= MAX_NESTED_LINKS)
594 goto loop; 576 goto loop;
@@ -602,7 +584,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
602 current->link_count++; 584 current->link_count++;
603 current->total_link_count++; 585 current->total_link_count++;
604 nd->depth++; 586 nd->depth++;
605 err = __do_follow_link(path, nd); 587 err = __do_follow_link(path, nd, &cookie);
588 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
589 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
606 path_put(path); 590 path_put(path);
607 current->link_count--; 591 current->link_count--;
608 nd->depth--; 592 nd->depth--;
@@ -689,33 +673,20 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
689 set_root(nd); 673 set_root(nd);
690 674
691 while(1) { 675 while(1) {
692 struct vfsmount *parent;
693 struct dentry *old = nd->path.dentry; 676 struct dentry *old = nd->path.dentry;
694 677
695 if (nd->path.dentry == nd->root.dentry && 678 if (nd->path.dentry == nd->root.dentry &&
696 nd->path.mnt == nd->root.mnt) { 679 nd->path.mnt == nd->root.mnt) {
697 break; 680 break;
698 } 681 }
699 spin_lock(&dcache_lock);
700 if (nd->path.dentry != nd->path.mnt->mnt_root) { 682 if (nd->path.dentry != nd->path.mnt->mnt_root) {
701 nd->path.dentry = dget(nd->path.dentry->d_parent); 683 /* rare case of legitimate dget_parent()... */
702 spin_unlock(&dcache_lock); 684 nd->path.dentry = dget_parent(nd->path.dentry);
703 dput(old); 685 dput(old);
704 break; 686 break;
705 } 687 }
706 spin_unlock(&dcache_lock); 688 if (!follow_up(&nd->path))
707 spin_lock(&vfsmount_lock);
708 parent = nd->path.mnt->mnt_parent;
709 if (parent == nd->path.mnt) {
710 spin_unlock(&vfsmount_lock);
711 break; 689 break;
712 }
713 mntget(parent);
714 nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
715 spin_unlock(&vfsmount_lock);
716 dput(old);
717 mntput(nd->path.mnt);
718 nd->path.mnt = parent;
719 } 690 }
720 follow_mount(&nd->path); 691 follow_mount(&nd->path);
721} 692}
@@ -823,6 +794,17 @@ fail:
823} 794}
824 795
825/* 796/*
797 * This is a temporary kludge to deal with "automount" symlinks; proper
798 * solution is to trigger them on follow_mount(), so that do_lookup()
799 * would DTRT. To be killed before 2.6.34-final.
800 */
801static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
802{
803 return inode && unlikely(inode->i_op->follow_link) &&
804 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
805}
806
807/*
826 * Name resolution. 808 * Name resolution.
827 * This is the basic name resolution function, turning a pathname into 809 * This is the basic name resolution function, turning a pathname into
828 * the final dentry. We expect 'base' to be positive and a directory. 810 * the final dentry. We expect 'base' to be positive and a directory.
@@ -942,8 +924,7 @@ last_component:
942 if (err) 924 if (err)
943 break; 925 break;
944 inode = next.dentry->d_inode; 926 inode = next.dentry->d_inode;
945 if ((lookup_flags & LOOKUP_FOLLOW) 927 if (follow_on_final(inode, lookup_flags)) {
946 && inode && inode->i_op->follow_link) {
947 err = do_follow_link(&next, nd); 928 err = do_follow_link(&next, nd);
948 if (err) 929 if (err)
949 goto return_err; 930 goto return_err;
@@ -1337,7 +1318,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1337 return -ENOENT; 1318 return -ENOENT;
1338 1319
1339 BUG_ON(victim->d_parent->d_inode != dir); 1320 BUG_ON(victim->d_parent->d_inode != dir);
1340 audit_inode_child(victim->d_name.name, victim, dir); 1321 audit_inode_child(victim, dir);
1341 1322
1342 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 1323 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
1343 if (error) 1324 if (error)
@@ -1378,22 +1359,6 @@ static inline int may_create(struct inode *dir, struct dentry *child)
1378 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 1359 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
1379} 1360}
1380 1361
1381/*
1382 * O_DIRECTORY translates into forcing a directory lookup.
1383 */
1384static inline int lookup_flags(unsigned int f)
1385{
1386 unsigned long retval = LOOKUP_FOLLOW;
1387
1388 if (f & O_NOFOLLOW)
1389 retval &= ~LOOKUP_FOLLOW;
1390
1391 if (f & O_DIRECTORY)
1392 retval |= LOOKUP_DIRECTORY;
1393
1394 return retval;
1395}
1396
1397/* 1362/*
1398 * p1 and p2 should be directories on the same fs. 1363 * p1 and p2 should be directories on the same fs.
1399 */ 1364 */
@@ -1493,7 +1458,7 @@ int may_open(struct path *path, int acc_mode, int flag)
1493 * An append-only file must be opened in append mode for writing. 1458 * An append-only file must be opened in append mode for writing.
1494 */ 1459 */
1495 if (IS_APPEND(inode)) { 1460 if (IS_APPEND(inode)) {
1496 if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) 1461 if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
1497 return -EPERM; 1462 return -EPERM;
1498 if (flag & O_TRUNC) 1463 if (flag & O_TRUNC)
1499 return -EPERM; 1464 return -EPERM;
@@ -1537,7 +1502,7 @@ static int handle_truncate(struct path *path)
1537 * what get passed to sys_open(). 1502 * what get passed to sys_open().
1538 */ 1503 */
1539static int __open_namei_create(struct nameidata *nd, struct path *path, 1504static int __open_namei_create(struct nameidata *nd, struct path *path,
1540 int flag, int mode) 1505 int open_flag, int mode)
1541{ 1506{
1542 int error; 1507 int error;
1543 struct dentry *dir = nd->path.dentry; 1508 struct dentry *dir = nd->path.dentry;
@@ -1555,7 +1520,7 @@ out_unlock:
1555 if (error) 1520 if (error)
1556 return error; 1521 return error;
1557 /* Don't check for write permission, don't truncate */ 1522 /* Don't check for write permission, don't truncate */
1558 return may_open(&nd->path, 0, flag & ~O_TRUNC); 1523 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
1559} 1524}
1560 1525
1561/* 1526/*
@@ -1593,129 +1558,135 @@ static int open_will_truncate(int flag, struct inode *inode)
1593 return (flag & O_TRUNC); 1558 return (flag & O_TRUNC);
1594} 1559}
1595 1560
1596/* 1561static struct file *finish_open(struct nameidata *nd,
1597 * Note that the low bits of the passed in "open_flag" 1562 int open_flag, int acc_mode)
1598 * are not the same as in the local variable "flag". See
1599 * open_to_namei_flags() for more details.
1600 */
1601struct file *do_filp_open(int dfd, const char *pathname,
1602 int open_flag, int mode, int acc_mode)
1603{ 1563{
1604 struct file *filp; 1564 struct file *filp;
1605 struct nameidata nd;
1606 int error;
1607 struct path path;
1608 struct dentry *dir;
1609 int count = 0;
1610 int will_truncate; 1565 int will_truncate;
1611 int flag = open_to_namei_flags(open_flag); 1566 int error;
1612 int force_reval = 0;
1613 1567
1568 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
1569 if (will_truncate) {
1570 error = mnt_want_write(nd->path.mnt);
1571 if (error)
1572 goto exit;
1573 }
1574 error = may_open(&nd->path, acc_mode, open_flag);
1575 if (error) {
1576 if (will_truncate)
1577 mnt_drop_write(nd->path.mnt);
1578 goto exit;
1579 }
1580 filp = nameidata_to_filp(nd);
1581 if (!IS_ERR(filp)) {
1582 error = ima_file_check(filp, acc_mode);
1583 if (error) {
1584 fput(filp);
1585 filp = ERR_PTR(error);
1586 }
1587 }
1588 if (!IS_ERR(filp)) {
1589 if (acc_mode & MAY_WRITE)
1590 vfs_dq_init(nd->path.dentry->d_inode);
1591
1592 if (will_truncate) {
1593 error = handle_truncate(&nd->path);
1594 if (error) {
1595 fput(filp);
1596 filp = ERR_PTR(error);
1597 }
1598 }
1599 }
1614 /* 1600 /*
1615 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only 1601 * It is now safe to drop the mnt write
1616 * check for O_DSYNC if the need any syncing at all we enforce it's 1602 * because the filp has had a write taken
1617 * always set instead of having to deal with possibly weird behaviour 1603 * on its behalf.
1618 * for malicious applications setting only __O_SYNC.
1619 */ 1604 */
1620 if (open_flag & __O_SYNC) 1605 if (will_truncate)
1621 open_flag |= O_DSYNC; 1606 mnt_drop_write(nd->path.mnt);
1622 1607 return filp;
1623 if (!acc_mode)
1624 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
1625 1608
1626 /* O_TRUNC implies we need access checks for write permissions */ 1609exit:
1627 if (flag & O_TRUNC) 1610 if (!IS_ERR(nd->intent.open.file))
1628 acc_mode |= MAY_WRITE; 1611 release_open_intent(nd);
1612 path_put(&nd->path);
1613 return ERR_PTR(error);
1614}
1629 1615
1630 /* Allow the LSM permission hook to distinguish append 1616static struct file *do_last(struct nameidata *nd, struct path *path,
1631 access from general write access. */ 1617 int open_flag, int acc_mode,
1632 if (flag & O_APPEND) 1618 int mode, const char *pathname,
1633 acc_mode |= MAY_APPEND; 1619 int *want_dir)
1620{
1621 struct dentry *dir = nd->path.dentry;
1622 struct file *filp;
1623 int error = -EISDIR;
1634 1624
1635 /* 1625 switch (nd->last_type) {
1636 * The simplest case - just a plain lookup. 1626 case LAST_DOTDOT:
1637 */ 1627 follow_dotdot(nd);
1638 if (!(flag & O_CREAT)) { 1628 dir = nd->path.dentry;
1639 filp = get_empty_filp(); 1629 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1640 1630 if (!dir->d_op->d_revalidate(dir, nd)) {
1641 if (filp == NULL) 1631 error = -ESTALE;
1642 return ERR_PTR(-ENFILE); 1632 goto exit;
1643 nd.intent.open.file = filp;
1644 filp->f_flags = open_flag;
1645 nd.intent.open.flags = flag;
1646 nd.intent.open.create_mode = 0;
1647 error = do_path_lookup(dfd, pathname,
1648 lookup_flags(flag)|LOOKUP_OPEN, &nd);
1649 if (IS_ERR(nd.intent.open.file)) {
1650 if (error == 0) {
1651 error = PTR_ERR(nd.intent.open.file);
1652 path_put(&nd.path);
1653 } 1633 }
1654 } else if (error) 1634 }
1655 release_open_intent(&nd); 1635 /* fallthrough */
1656 if (error) 1636 case LAST_DOT:
1657 return ERR_PTR(error); 1637 case LAST_ROOT:
1638 if (open_flag & O_CREAT)
1639 goto exit;
1640 /* fallthrough */
1641 case LAST_BIND:
1642 audit_inode(pathname, dir);
1658 goto ok; 1643 goto ok;
1659 } 1644 }
1660 1645
1661 /* 1646 /* trailing slashes? */
1662 * Create - we need to know the parent. 1647 if (nd->last.name[nd->last.len]) {
1663 */ 1648 if (open_flag & O_CREAT)
1664reval: 1649 goto exit;
1665 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 1650 *want_dir = 1;
1666 if (error)
1667 return ERR_PTR(error);
1668 if (force_reval)
1669 nd.flags |= LOOKUP_REVAL;
1670 error = path_walk(pathname, &nd);
1671 if (error) {
1672 if (nd.root.mnt)
1673 path_put(&nd.root);
1674 return ERR_PTR(error);
1675 } 1651 }
1676 if (unlikely(!audit_dummy_context()))
1677 audit_inode(pathname, nd.path.dentry);
1678 1652
1679 /* 1653 /* just plain open? */
1680 * We have the parent and last component. First of all, check 1654 if (!(open_flag & O_CREAT)) {
1681 * that we are not asked to creat(2) an obvious directory - that 1655 error = do_lookup(nd, &nd->last, path);
1682 * will not do. 1656 if (error)
1683 */ 1657 goto exit;
1684 error = -EISDIR; 1658 error = -ENOENT;
1685 if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) 1659 if (!path->dentry->d_inode)
1686 goto exit_parent; 1660 goto exit_dput;
1661 if (path->dentry->d_inode->i_op->follow_link)
1662 return NULL;
1663 error = -ENOTDIR;
1664 if (*want_dir & !path->dentry->d_inode->i_op->lookup)
1665 goto exit_dput;
1666 path_to_nameidata(path, nd);
1667 audit_inode(pathname, nd->path.dentry);
1668 goto ok;
1669 }
1687 1670
1688 error = -ENFILE; 1671 /* OK, it's O_CREAT */
1689 filp = get_empty_filp();
1690 if (filp == NULL)
1691 goto exit_parent;
1692 nd.intent.open.file = filp;
1693 filp->f_flags = open_flag;
1694 nd.intent.open.flags = flag;
1695 nd.intent.open.create_mode = mode;
1696 dir = nd.path.dentry;
1697 nd.flags &= ~LOOKUP_PARENT;
1698 nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
1699 if (flag & O_EXCL)
1700 nd.flags |= LOOKUP_EXCL;
1701 mutex_lock(&dir->d_inode->i_mutex); 1672 mutex_lock(&dir->d_inode->i_mutex);
1702 path.dentry = lookup_hash(&nd);
1703 path.mnt = nd.path.mnt;
1704 1673
1705do_last: 1674 path->dentry = lookup_hash(nd);
1706 error = PTR_ERR(path.dentry); 1675 path->mnt = nd->path.mnt;
1707 if (IS_ERR(path.dentry)) { 1676
1677 error = PTR_ERR(path->dentry);
1678 if (IS_ERR(path->dentry)) {
1708 mutex_unlock(&dir->d_inode->i_mutex); 1679 mutex_unlock(&dir->d_inode->i_mutex);
1709 goto exit; 1680 goto exit;
1710 } 1681 }
1711 1682
1712 if (IS_ERR(nd.intent.open.file)) { 1683 if (IS_ERR(nd->intent.open.file)) {
1713 error = PTR_ERR(nd.intent.open.file); 1684 error = PTR_ERR(nd->intent.open.file);
1714 goto exit_mutex_unlock; 1685 goto exit_mutex_unlock;
1715 } 1686 }
1716 1687
1717 /* Negative dentry, just create the file */ 1688 /* Negative dentry, just create the file */
1718 if (!path.dentry->d_inode) { 1689 if (!path->dentry->d_inode) {
1719 /* 1690 /*
1720 * This write is needed to ensure that a 1691 * This write is needed to ensure that a
1721 * ro->rw transition does not occur between 1692 * ro->rw transition does not occur between
@@ -1723,18 +1694,16 @@ do_last:
1723 * a permanent write count is taken through 1694 * a permanent write count is taken through
1724 * the 'struct file' in nameidata_to_filp(). 1695 * the 'struct file' in nameidata_to_filp().
1725 */ 1696 */
1726 error = mnt_want_write(nd.path.mnt); 1697 error = mnt_want_write(nd->path.mnt);
1727 if (error) 1698 if (error)
1728 goto exit_mutex_unlock; 1699 goto exit_mutex_unlock;
1729 error = __open_namei_create(&nd, &path, flag, mode); 1700 error = __open_namei_create(nd, path, open_flag, mode);
1730 if (error) { 1701 if (error) {
1731 mnt_drop_write(nd.path.mnt); 1702 mnt_drop_write(nd->path.mnt);
1732 goto exit; 1703 goto exit;
1733 } 1704 }
1734 filp = nameidata_to_filp(&nd); 1705 filp = nameidata_to_filp(nd);
1735 mnt_drop_write(nd.path.mnt); 1706 mnt_drop_write(nd->path.mnt);
1736 if (nd.root.mnt)
1737 path_put(&nd.root);
1738 if (!IS_ERR(filp)) { 1707 if (!IS_ERR(filp)) {
1739 error = ima_file_check(filp, acc_mode); 1708 error = ima_file_check(filp, acc_mode);
1740 if (error) { 1709 if (error) {
@@ -1749,150 +1718,181 @@ do_last:
1749 * It already exists. 1718 * It already exists.
1750 */ 1719 */
1751 mutex_unlock(&dir->d_inode->i_mutex); 1720 mutex_unlock(&dir->d_inode->i_mutex);
1752 audit_inode(pathname, path.dentry); 1721 audit_inode(pathname, path->dentry);
1753 1722
1754 error = -EEXIST; 1723 error = -EEXIST;
1755 if (flag & O_EXCL) 1724 if (open_flag & O_EXCL)
1756 goto exit_dput; 1725 goto exit_dput;
1757 1726
1758 if (__follow_mount(&path)) { 1727 if (__follow_mount(path)) {
1759 error = -ELOOP; 1728 error = -ELOOP;
1760 if (flag & O_NOFOLLOW) 1729 if (open_flag & O_NOFOLLOW)
1761 goto exit_dput; 1730 goto exit_dput;
1762 } 1731 }
1763 1732
1764 error = -ENOENT; 1733 error = -ENOENT;
1765 if (!path.dentry->d_inode) 1734 if (!path->dentry->d_inode)
1766 goto exit_dput; 1735 goto exit_dput;
1767 if (path.dentry->d_inode->i_op->follow_link)
1768 goto do_link;
1769 1736
1770 path_to_nameidata(&path, &nd); 1737 if (path->dentry->d_inode->i_op->follow_link)
1738 return NULL;
1739
1740 path_to_nameidata(path, nd);
1771 error = -EISDIR; 1741 error = -EISDIR;
1772 if (S_ISDIR(path.dentry->d_inode->i_mode)) 1742 if (S_ISDIR(path->dentry->d_inode->i_mode))
1773 goto exit; 1743 goto exit;
1774ok: 1744ok:
1745 filp = finish_open(nd, open_flag, acc_mode);
1746 return filp;
1747
1748exit_mutex_unlock:
1749 mutex_unlock(&dir->d_inode->i_mutex);
1750exit_dput:
1751 path_put_conditional(path, nd);
1752exit:
1753 if (!IS_ERR(nd->intent.open.file))
1754 release_open_intent(nd);
1755 path_put(&nd->path);
1756 return ERR_PTR(error);
1757}
1758
1759/*
1760 * Note that the low bits of the passed in "open_flag"
1761 * are not the same as in the local variable "flag". See
1762 * open_to_namei_flags() for more details.
1763 */
1764struct file *do_filp_open(int dfd, const char *pathname,
1765 int open_flag, int mode, int acc_mode)
1766{
1767 struct file *filp;
1768 struct nameidata nd;
1769 int error;
1770 struct path path;
1771 int count = 0;
1772 int flag = open_to_namei_flags(open_flag);
1773 int force_reval = 0;
1774 int want_dir = open_flag & O_DIRECTORY;
1775
1776 if (!(open_flag & O_CREAT))
1777 mode = 0;
1778
1775 /* 1779 /*
1776 * Consider: 1780 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
1777 * 1. may_open() truncates a file 1781 * check for O_DSYNC if the need any syncing at all we enforce it's
1778 * 2. a rw->ro mount transition occurs 1782 * always set instead of having to deal with possibly weird behaviour
1779 * 3. nameidata_to_filp() fails due to 1783 * for malicious applications setting only __O_SYNC.
1780 * the ro mount.
1781 * That would be inconsistent, and should
1782 * be avoided. Taking this mnt write here
1783 * ensures that (2) can not occur.
1784 */ 1784 */
1785 will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode); 1785 if (open_flag & __O_SYNC)
1786 if (will_truncate) { 1786 open_flag |= O_DSYNC;
1787 error = mnt_want_write(nd.path.mnt); 1787
1788 if (error) 1788 if (!acc_mode)
1789 goto exit; 1789 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
1790 } 1790
1791 error = may_open(&nd.path, acc_mode, flag); 1791 /* O_TRUNC implies we need access checks for write permissions */
1792 if (open_flag & O_TRUNC)
1793 acc_mode |= MAY_WRITE;
1794
1795 /* Allow the LSM permission hook to distinguish append
1796 access from general write access. */
1797 if (open_flag & O_APPEND)
1798 acc_mode |= MAY_APPEND;
1799
1800 /* find the parent */
1801reval:
1802 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
1803 if (error)
1804 return ERR_PTR(error);
1805 if (force_reval)
1806 nd.flags |= LOOKUP_REVAL;
1807
1808 current->total_link_count = 0;
1809 error = link_path_walk(pathname, &nd);
1792 if (error) { 1810 if (error) {
1793 if (will_truncate) 1811 filp = ERR_PTR(error);
1794 mnt_drop_write(nd.path.mnt); 1812 goto out;
1795 goto exit;
1796 }
1797 filp = nameidata_to_filp(&nd);
1798 if (!IS_ERR(filp)) {
1799 error = ima_file_check(filp, acc_mode);
1800 if (error) {
1801 fput(filp);
1802 filp = ERR_PTR(error);
1803 }
1804 } 1813 }
1805 if (!IS_ERR(filp)) { 1814 if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
1806 if (acc_mode & MAY_WRITE) 1815 audit_inode(pathname, nd.path.dentry);
1807 vfs_dq_init(nd.path.dentry->d_inode);
1808 1816
1809 if (will_truncate) {
1810 error = handle_truncate(&nd.path);
1811 if (error) {
1812 fput(filp);
1813 filp = ERR_PTR(error);
1814 }
1815 }
1816 }
1817 /* 1817 /*
1818 * It is now safe to drop the mnt write 1818 * We have the parent and last component.
1819 * because the filp has had a write taken
1820 * on its behalf.
1821 */ 1819 */
1822 if (will_truncate) 1820
1823 mnt_drop_write(nd.path.mnt); 1821 error = -ENFILE;
1822 filp = get_empty_filp();
1823 if (filp == NULL)
1824 goto exit_parent;
1825 nd.intent.open.file = filp;
1826 filp->f_flags = open_flag;
1827 nd.intent.open.flags = flag;
1828 nd.intent.open.create_mode = mode;
1829 nd.flags &= ~LOOKUP_PARENT;
1830 nd.flags |= LOOKUP_OPEN;
1831 if (open_flag & O_CREAT) {
1832 nd.flags |= LOOKUP_CREATE;
1833 if (open_flag & O_EXCL)
1834 nd.flags |= LOOKUP_EXCL;
1835 }
1836 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
1837 while (unlikely(!filp)) { /* trailing symlink */
1838 struct path holder;
1839 struct inode *inode = path.dentry->d_inode;
1840 void *cookie;
1841 error = -ELOOP;
1842 /* S_ISDIR part is a temporary automount kludge */
1843 if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
1844 goto exit_dput;
1845 if (count++ == 32)
1846 goto exit_dput;
1847 /*
1848 * This is subtle. Instead of calling do_follow_link() we do
1849 * the thing by hands. The reason is that this way we have zero
1850 * link_count and path_walk() (called from ->follow_link)
1851 * honoring LOOKUP_PARENT. After that we have the parent and
1852 * last component, i.e. we are in the same situation as after
1853 * the first path_walk(). Well, almost - if the last component
1854 * is normal we get its copy stored in nd->last.name and we will
1855 * have to putname() it when we are done. Procfs-like symlinks
1856 * just set LAST_BIND.
1857 */
1858 nd.flags |= LOOKUP_PARENT;
1859 error = security_inode_follow_link(path.dentry, &nd);
1860 if (error)
1861 goto exit_dput;
1862 error = __do_follow_link(&path, &nd, &cookie);
1863 if (unlikely(error)) {
1864 /* nd.path had been dropped */
1865 if (!IS_ERR(cookie) && inode->i_op->put_link)
1866 inode->i_op->put_link(path.dentry, &nd, cookie);
1867 path_put(&path);
1868 release_open_intent(&nd);
1869 filp = ERR_PTR(error);
1870 goto out;
1871 }
1872 holder = path;
1873 nd.flags &= ~LOOKUP_PARENT;
1874 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
1875 if (inode->i_op->put_link)
1876 inode->i_op->put_link(holder.dentry, &nd, cookie);
1877 path_put(&holder);
1878 }
1879out:
1824 if (nd.root.mnt) 1880 if (nd.root.mnt)
1825 path_put(&nd.root); 1881 path_put(&nd.root);
1882 if (filp == ERR_PTR(-ESTALE) && !force_reval) {
1883 force_reval = 1;
1884 goto reval;
1885 }
1826 return filp; 1886 return filp;
1827 1887
1828exit_mutex_unlock:
1829 mutex_unlock(&dir->d_inode->i_mutex);
1830exit_dput: 1888exit_dput:
1831 path_put_conditional(&path, &nd); 1889 path_put_conditional(&path, &nd);
1832exit:
1833 if (!IS_ERR(nd.intent.open.file)) 1890 if (!IS_ERR(nd.intent.open.file))
1834 release_open_intent(&nd); 1891 release_open_intent(&nd);
1835exit_parent: 1892exit_parent:
1836 if (nd.root.mnt)
1837 path_put(&nd.root);
1838 path_put(&nd.path); 1893 path_put(&nd.path);
1839 return ERR_PTR(error); 1894 filp = ERR_PTR(error);
1840 1895 goto out;
1841do_link:
1842 error = -ELOOP;
1843 if (flag & O_NOFOLLOW)
1844 goto exit_dput;
1845 /*
1846 * This is subtle. Instead of calling do_follow_link() we do the
1847 * thing by hands. The reason is that this way we have zero link_count
1848 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
1849 * After that we have the parent and last component, i.e.
1850 * we are in the same situation as after the first path_walk().
1851 * Well, almost - if the last component is normal we get its copy
1852 * stored in nd->last.name and we will have to putname() it when we
1853 * are done. Procfs-like symlinks just set LAST_BIND.
1854 */
1855 nd.flags |= LOOKUP_PARENT;
1856 error = security_inode_follow_link(path.dentry, &nd);
1857 if (error)
1858 goto exit_dput;
1859 error = __do_follow_link(&path, &nd);
1860 path_put(&path);
1861 if (error) {
1862 /* Does someone understand code flow here? Or it is only
1863 * me so stupid? Anathema to whoever designed this non-sense
1864 * with "intent.open".
1865 */
1866 release_open_intent(&nd);
1867 if (nd.root.mnt)
1868 path_put(&nd.root);
1869 if (error == -ESTALE && !force_reval) {
1870 force_reval = 1;
1871 goto reval;
1872 }
1873 return ERR_PTR(error);
1874 }
1875 nd.flags &= ~LOOKUP_PARENT;
1876 if (nd.last_type == LAST_BIND)
1877 goto ok;
1878 error = -EISDIR;
1879 if (nd.last_type != LAST_NORM)
1880 goto exit;
1881 if (nd.last.name[nd.last.len]) {
1882 __putname(nd.last.name);
1883 goto exit;
1884 }
1885 error = -ELOOP;
1886 if (count++==32) {
1887 __putname(nd.last.name);
1888 goto exit;
1889 }
1890 dir = nd.path.dentry;
1891 mutex_lock(&dir->d_inode->i_mutex);
1892 path.dentry = lookup_hash(&nd);
1893 path.mnt = nd.path.mnt;
1894 __putname(nd.last.name);
1895 goto do_last;
1896} 1896}
1897 1897
1898/** 1898/**
@@ -2265,8 +2265,11 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2265 error = -EBUSY; 2265 error = -EBUSY;
2266 else { 2266 else {
2267 error = security_inode_unlink(dir, dentry); 2267 error = security_inode_unlink(dir, dentry);
2268 if (!error) 2268 if (!error) {
2269 error = dir->i_op->unlink(dir, dentry); 2269 error = dir->i_op->unlink(dir, dentry);
2270 if (!error)
2271 dentry->d_inode->i_flags |= S_DEAD;
2272 }
2270 } 2273 }
2271 mutex_unlock(&dentry->d_inode->i_mutex); 2274 mutex_unlock(&dentry->d_inode->i_mutex);
2272 2275
@@ -2619,6 +2622,8 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2619 else 2622 else
2620 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2623 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2621 if (!error) { 2624 if (!error) {
2625 if (target)
2626 target->i_flags |= S_DEAD;
2622 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 2627 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2623 d_move(old_dentry, new_dentry); 2628 d_move(old_dentry, new_dentry);
2624 } 2629 }
@@ -2661,11 +2666,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2661 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); 2666 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
2662 else 2667 else
2663 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); 2668 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
2664 if (!error) { 2669 if (!error)
2665 const char *new_name = old_dentry->d_name.name; 2670 fsnotify_move(old_dir, new_dir, old_name, is_dir,
2666 fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
2667 new_dentry->d_inode, old_dentry); 2671 new_dentry->d_inode, old_dentry);
2668 }
2669 fsnotify_oldname_free(old_name); 2672 fsnotify_oldname_free(old_name);
2670 2673
2671 return error; 2674 return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index c768f733c8d6..8174c8ab5c70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -573,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
573 mnt->mnt_master = old; 573 mnt->mnt_master = old;
574 CLEAR_MNT_SHARED(mnt); 574 CLEAR_MNT_SHARED(mnt);
575 } else if (!(flag & CL_PRIVATE)) { 575 } else if (!(flag & CL_PRIVATE)) {
576 if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) 576 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
577 list_add(&mnt->mnt_share, &old->mnt_share); 577 list_add(&mnt->mnt_share, &old->mnt_share);
578 if (IS_MNT_SLAVE(old)) 578 if (IS_MNT_SLAVE(old))
579 list_add(&mnt->mnt_slave, &old->mnt_slave); 579 list_add(&mnt->mnt_slave, &old->mnt_slave);
@@ -737,6 +737,21 @@ static void m_stop(struct seq_file *m, void *v)
737 up_read(&namespace_sem); 737 up_read(&namespace_sem);
738} 738}
739 739
740int mnt_had_events(struct proc_mounts *p)
741{
742 struct mnt_namespace *ns = p->ns;
743 int res = 0;
744
745 spin_lock(&vfsmount_lock);
746 if (p->event != ns->event) {
747 p->event = ns->event;
748 res = 1;
749 }
750 spin_unlock(&vfsmount_lock);
751
752 return res;
753}
754
740struct proc_fs_info { 755struct proc_fs_info {
741 int flag; 756 int flag;
742 const char *str; 757 const char *str;
@@ -1121,8 +1136,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1121{ 1136{
1122 struct path path; 1137 struct path path;
1123 int retval; 1138 int retval;
1139 int lookup_flags = 0;
1124 1140
1125 retval = user_path(name, &path); 1141 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1142 return -EINVAL;
1143
1144 if (!(flags & UMOUNT_NOFOLLOW))
1145 lookup_flags |= LOOKUP_FOLLOW;
1146
1147 retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
1126 if (retval) 1148 if (retval)
1127 goto out; 1149 goto out;
1128 retval = -EINVAL; 1150 retval = -EINVAL;
@@ -1246,6 +1268,21 @@ void drop_collected_mounts(struct vfsmount *mnt)
1246 release_mounts(&umount_list); 1268 release_mounts(&umount_list);
1247} 1269}
1248 1270
1271int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1272 struct vfsmount *root)
1273{
1274 struct vfsmount *mnt;
1275 int res = f(root, arg);
1276 if (res)
1277 return res;
1278 list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
1279 res = f(mnt, arg);
1280 if (res)
1281 return res;
1282 }
1283 return 0;
1284}
1285
1249static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) 1286static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
1250{ 1287{
1251 struct vfsmount *p; 1288 struct vfsmount *p;
@@ -1538,7 +1575,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1538 err = do_remount_sb(sb, flags, data, 0); 1575 err = do_remount_sb(sb, flags, data, 0);
1539 if (!err) { 1576 if (!err) {
1540 spin_lock(&vfsmount_lock); 1577 spin_lock(&vfsmount_lock);
1541 mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK; 1578 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
1542 path->mnt->mnt_flags = mnt_flags; 1579 path->mnt->mnt_flags = mnt_flags;
1543 spin_unlock(&vfsmount_lock); 1580 spin_unlock(&vfsmount_lock);
1544 } 1581 }
@@ -1671,7 +1708,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1671{ 1708{
1672 int err; 1709 int err;
1673 1710
1674 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD); 1711 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1675 1712
1676 down_write(&namespace_sem); 1713 down_write(&namespace_sem);
1677 /* Something was mounted here while we slept */ 1714 /* Something was mounted here while we slept */
@@ -2314,17 +2351,13 @@ void __init mnt_init(void)
2314 2351
2315void put_mnt_ns(struct mnt_namespace *ns) 2352void put_mnt_ns(struct mnt_namespace *ns)
2316{ 2353{
2317 struct vfsmount *root;
2318 LIST_HEAD(umount_list); 2354 LIST_HEAD(umount_list);
2319 2355
2320 if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock)) 2356 if (!atomic_dec_and_test(&ns->count))
2321 return; 2357 return;
2322 root = ns->root;
2323 ns->root = NULL;
2324 spin_unlock(&vfsmount_lock);
2325 down_write(&namespace_sem); 2358 down_write(&namespace_sem);
2326 spin_lock(&vfsmount_lock); 2359 spin_lock(&vfsmount_lock);
2327 umount_tree(root, 0, &umount_list); 2360 umount_tree(ns->root, 0, &umount_list);
2328 spin_unlock(&vfsmount_lock); 2361 spin_unlock(&vfsmount_lock);
2329 up_write(&namespace_sem); 2362 up_write(&namespace_sem);
2330 release_mounts(&umount_list); 2363 release_mounts(&umount_list);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 59e5673b4597..a43d07e7b924 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -95,8 +95,7 @@ config ROOT_NFS
95 Most people say N here. 95 Most people say N here.
96 96
97config NFS_FSCACHE 97config NFS_FSCACHE
98 bool "Provide NFS client caching support (EXPERIMENTAL)" 98 bool "Provide NFS client caching support"
99 depends on EXPERIMENTAL
100 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y 99 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
101 help 100 help
102 Say Y here if you want NFS data to be cached locally on disc through 101 Say Y here if you want NFS data to be cached locally on disc through
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3c7f03b669fb..a1f6b4438fb1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -560,7 +560,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
560 desc->entry = &my_entry; 560 desc->entry = &my_entry;
561 561
562 nfs_block_sillyrename(dentry); 562 nfs_block_sillyrename(dentry);
563 res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping); 563 res = nfs_revalidate_mapping(inode, filp->f_mapping);
564 if (res < 0) 564 if (res < 0)
565 goto out; 565 goto out;
566 566
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e1d415e97849..0d289823e856 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -342,6 +342,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
342 data->res.fattr = &data->fattr; 342 data->res.fattr = &data->fattr;
343 data->res.eof = 0; 343 data->res.eof = 0;
344 data->res.count = bytes; 344 data->res.count = bytes;
345 nfs_fattr_init(&data->fattr);
345 msg.rpc_argp = &data->args; 346 msg.rpc_argp = &data->args;
346 msg.rpc_resp = &data->res; 347 msg.rpc_resp = &data->res;
347 348
@@ -575,6 +576,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
575 data->res.count = 0; 576 data->res.count = 0;
576 data->res.fattr = &data->fattr; 577 data->res.fattr = &data->fattr;
577 data->res.verf = &data->verf; 578 data->res.verf = &data->verf;
579 nfs_fattr_init(&data->fattr);
578 580
579 NFS_PROTO(data->inode)->commit_setup(data, &msg); 581 NFS_PROTO(data->inode)->commit_setup(data, &msg);
580 582
@@ -766,6 +768,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
766 data->res.fattr = &data->fattr; 768 data->res.fattr = &data->fattr;
767 data->res.count = bytes; 769 data->res.count = bytes;
768 data->res.verf = &data->verf; 770 data->res.verf = &data->verf;
771 nfs_fattr_init(&data->fattr);
769 772
770 task_setup_data.task = &data->task; 773 task_setup_data.task = &data->task;
771 task_setup_data.callback_data = data; 774 task_setup_data.callback_data = data;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index fa588006588d..237874f1af23 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -354,12 +354,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
354 */ 354 */
355int nfs_fscache_release_page(struct page *page, gfp_t gfp) 355int nfs_fscache_release_page(struct page *page, gfp_t gfp)
356{ 356{
357 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
358 struct fscache_cookie *cookie = nfsi->fscache;
359
360 BUG_ON(!cookie);
361
362 if (PageFsCache(page)) { 357 if (PageFsCache(page)) {
358 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
359 struct fscache_cookie *cookie = nfsi->fscache;
360
361 BUG_ON(!cookie);
363 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", 362 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
364 cookie, page, nfsi); 363 cookie, page, nfsi);
365 364
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 87cca56846d6..657201acda84 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
97 return ino; 97 return ino;
98} 98}
99 99
100int nfs_write_inode(struct inode *inode, int sync)
101{
102 int ret;
103
104 if (sync) {
105 ret = filemap_fdatawait(inode->i_mapping);
106 if (ret == 0)
107 ret = nfs_commit_inode(inode, FLUSH_SYNC);
108 } else
109 ret = nfs_commit_inode(inode, 0);
110 if (ret >= 0)
111 return 0;
112 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
113 return ret;
114}
115
116void nfs_clear_inode(struct inode *inode) 100void nfs_clear_inode(struct inode *inode)
117{ 101{
118 /* 102 /*
@@ -130,16 +114,12 @@ void nfs_clear_inode(struct inode *inode)
130 */ 114 */
131int nfs_sync_mapping(struct address_space *mapping) 115int nfs_sync_mapping(struct address_space *mapping)
132{ 116{
133 int ret; 117 int ret = 0;
134 118
135 if (mapping->nrpages == 0) 119 if (mapping->nrpages != 0) {
136 return 0; 120 unmap_mapping_range(mapping, 0, 0, 0);
137 unmap_mapping_range(mapping, 0, 0, 0); 121 ret = nfs_wb_all(mapping->host);
138 ret = filemap_write_and_wait(mapping); 122 }
139 if (ret != 0)
140 goto out;
141 ret = nfs_wb_all(mapping->host);
142out:
143 return ret; 123 return ret;
144} 124}
145 125
@@ -511,17 +491,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
511 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; 491 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
512 int err; 492 int err;
513 493
514 /* 494 /* Flush out writes to the server in order to update c/mtime. */
515 * Flush out writes to the server in order to update c/mtime.
516 *
517 * Hold the i_mutex to suspend application writes temporarily;
518 * this prevents long-running writing applications from blocking
519 * nfs_wb_nocommit.
520 */
521 if (S_ISREG(inode->i_mode)) { 495 if (S_ISREG(inode->i_mode)) {
522 mutex_lock(&inode->i_mutex); 496 err = filemap_write_and_wait(inode->i_mapping);
523 nfs_wb_nocommit(inode); 497 if (err)
524 mutex_unlock(&inode->i_mutex); 498 goto out;
525 } 499 }
526 500
527 /* 501 /*
@@ -545,6 +519,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
545 generic_fillattr(inode, stat); 519 generic_fillattr(inode, stat);
546 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); 520 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
547 } 521 }
522out:
548 return err; 523 return err;
549} 524}
550 525
@@ -574,14 +549,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
574 nfs_revalidate_inode(server, inode); 549 nfs_revalidate_inode(server, inode);
575} 550}
576 551
577static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) 552static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
578{ 553{
579 struct nfs_open_context *ctx; 554 struct nfs_open_context *ctx;
580 555
581 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 556 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
582 if (ctx != NULL) { 557 if (ctx != NULL) {
583 ctx->path.dentry = dget(dentry); 558 ctx->path = *path;
584 ctx->path.mnt = mntget(mnt); 559 path_get(&ctx->path);
585 ctx->cred = get_rpccred(cred); 560 ctx->cred = get_rpccred(cred);
586 ctx->state = NULL; 561 ctx->state = NULL;
587 ctx->lockowner = current->files; 562 ctx->lockowner = current->files;
@@ -681,7 +656,7 @@ int nfs_open(struct inode *inode, struct file *filp)
681 cred = rpc_lookup_cred(); 656 cred = rpc_lookup_cred();
682 if (IS_ERR(cred)) 657 if (IS_ERR(cred))
683 return PTR_ERR(cred); 658 return PTR_ERR(cred);
684 ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); 659 ctx = alloc_nfs_open_context(&filp->f_path, cred);
685 put_rpccred(cred); 660 put_rpccred(cred);
686 if (ctx == NULL) 661 if (ctx == NULL)
687 return -ENOMEM; 662 return -ENOMEM;
@@ -774,7 +749,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
774 return __nfs_revalidate_inode(server, inode); 749 return __nfs_revalidate_inode(server, inode);
775} 750}
776 751
777static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) 752static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
778{ 753{
779 struct nfs_inode *nfsi = NFS_I(inode); 754 struct nfs_inode *nfsi = NFS_I(inode);
780 755
@@ -795,49 +770,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
795 return 0; 770 return 0;
796} 771}
797 772
798static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
799{
800 int ret = 0;
801
802 mutex_lock(&inode->i_mutex);
803 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
804 ret = nfs_sync_mapping(mapping);
805 if (ret == 0)
806 ret = nfs_invalidate_mapping_nolock(inode, mapping);
807 }
808 mutex_unlock(&inode->i_mutex);
809 return ret;
810}
811
812/**
813 * nfs_revalidate_mapping_nolock - Revalidate the pagecache
814 * @inode - pointer to host inode
815 * @mapping - pointer to mapping
816 */
817int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
818{
819 struct nfs_inode *nfsi = NFS_I(inode);
820 int ret = 0;
821
822 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
823 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
824 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
825 if (ret < 0)
826 goto out;
827 }
828 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
829 ret = nfs_invalidate_mapping_nolock(inode, mapping);
830out:
831 return ret;
832}
833
834/** 773/**
835 * nfs_revalidate_mapping - Revalidate the pagecache 774 * nfs_revalidate_mapping - Revalidate the pagecache
836 * @inode - pointer to host inode 775 * @inode - pointer to host inode
837 * @mapping - pointer to mapping 776 * @mapping - pointer to mapping
838 *
839 * This version of the function will take the inode->i_mutex and attempt to
840 * flush out all dirty data if it needs to invalidate the page cache.
841 */ 777 */
842int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 778int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
843{ 779{
@@ -1415,6 +1351,7 @@ static void init_once(void *foo)
1415 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1351 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1416 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1352 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1417 nfsi->npages = 0; 1353 nfsi->npages = 0;
1354 nfsi->ncommit = 0;
1418 atomic_set(&nfsi->silly_count, 1); 1355 atomic_set(&nfsi->silly_count, 1);
1419 INIT_HLIST_HEAD(&nfsi->silly_list); 1356 INIT_HLIST_HEAD(&nfsi->silly_list);
1420 init_waitqueue_head(&nfsi->waitqueue); 1357 init_waitqueue_head(&nfsi->waitqueue);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d23b32..11f82f03c5de 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
211extern struct workqueue_struct *nfsiod_workqueue; 211extern struct workqueue_struct *nfsiod_workqueue;
212extern struct inode *nfs_alloc_inode(struct super_block *sb); 212extern struct inode *nfs_alloc_inode(struct super_block *sb);
213extern void nfs_destroy_inode(struct inode *); 213extern void nfs_destroy_inode(struct inode *);
214extern int nfs_write_inode(struct inode *,int); 214extern int nfs_write_inode(struct inode *, struct writeback_control *);
215extern void nfs_clear_inode(struct inode *); 215extern void nfs_clear_inode(struct inode *);
216#ifdef CONFIG_NFS_V4 216#ifdef CONFIG_NFS_V4
217extern void nfs4_clear_inode(struct inode *); 217extern void nfs4_clear_inode(struct inode *);
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 46d779abafd3..1d8d5c813b01 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -57,12 +57,12 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
57} 57}
58#endif 58#endif
59 59
60static inline struct nfs_iostats *nfs_alloc_iostats(void) 60static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
61{ 61{
62 return alloc_percpu(struct nfs_iostats); 62 return alloc_percpu(struct nfs_iostats);
63} 63}
64 64
65static inline void nfs_free_iostats(struct nfs_iostats *stats) 65static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
66{ 66{
67 if (stats != NULL) 67 if (stats != NULL)
68 free_percpu(stats); 68 free_percpu(stats);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 0adefc40cc89..59047f8d7d72 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -120,7 +120,7 @@ static struct {
120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, 120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, },
121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, 121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, },
122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, 122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, },
123 { .status = MNT3ERR_SERVERFAULT, .errno = -ESERVERFAULT, }, 123 { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, },
124}; 124};
125 125
126struct mountres { 126struct mountres {
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5e078b222b4e..7bc2da8efd4a 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -699,7 +699,7 @@ static struct {
699 { NFSERR_BAD_COOKIE, -EBADCOOKIE }, 699 { NFSERR_BAD_COOKIE, -EBADCOOKIE },
700 { NFSERR_NOTSUPP, -ENOTSUPP }, 700 { NFSERR_NOTSUPP, -ENOTSUPP },
701 { NFSERR_TOOSMALL, -ETOOSMALL }, 701 { NFSERR_TOOSMALL, -ETOOSMALL },
702 { NFSERR_SERVERFAULT, -ESERVERFAULT }, 702 { NFSERR_SERVERFAULT, -EREMOTEIO },
703 { NFSERR_BADTYPE, -EBADTYPE }, 703 { NFSERR_BADTYPE, -EBADTYPE },
704 { NFSERR_JUKEBOX, -EJUKEBOX }, 704 { NFSERR_JUKEBOX, -EJUKEBOX },
705 { -1, -EIO } 705 { -1, -EIO }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index adc116c57e14..eda74c42d552 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -726,8 +726,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
726 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 726 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
727 if (p->o_arg.seqid == NULL) 727 if (p->o_arg.seqid == NULL)
728 goto err_free; 728 goto err_free;
729 p->path.mnt = mntget(path->mnt); 729 path_get(path);
730 p->path.dentry = dget(path->dentry); 730 p->path = *path;
731 p->dir = parent; 731 p->dir = parent;
732 p->owner = sp; 732 p->owner = sp;
733 atomic_inc(&sp->so_count); 733 atomic_inc(&sp->so_count);
@@ -1947,8 +1947,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1947 calldata->res.seqid = calldata->arg.seqid; 1947 calldata->res.seqid = calldata->arg.seqid;
1948 calldata->res.server = server; 1948 calldata->res.server = server;
1949 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; 1949 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
1950 calldata->path.mnt = mntget(path->mnt); 1950 path_get(path);
1951 calldata->path.dentry = dget(path->dentry); 1951 calldata->path = *path;
1952 1952
1953 msg.rpc_argp = &calldata->arg, 1953 msg.rpc_argp = &calldata->arg,
1954 msg.rpc_resp = &calldata->res, 1954 msg.rpc_resp = &calldata->res,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 020ebf151184..4d338be492cb 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4639,7 +4639,7 @@ static int decode_sequence(struct xdr_stream *xdr,
4639 * If the server returns different values for sessionID, slotID or 4639 * If the server returns different values for sessionID, slotID or
4640 * sequence number, the server is looney tunes. 4640 * sequence number, the server is looney tunes.
4641 */ 4641 */
4642 status = -ESERVERFAULT; 4642 status = -EREMOTEIO;
4643 4643
4644 if (memcmp(id.data, res->sr_session->sess_id.data, 4644 if (memcmp(id.data, res->sr_session->sess_id.data,
4645 NFS4_MAX_SESSIONID_LEN)) { 4645 NFS4_MAX_SESSIONID_LEN)) {
@@ -5782,7 +5782,7 @@ static struct {
5782 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 5782 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
5783 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 5783 { NFS4ERR_NOTSUPP, -ENOTSUPP },
5784 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 5784 { NFS4ERR_TOOSMALL, -ETOOSMALL },
5785 { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, 5785 { NFS4ERR_SERVERFAULT, -EREMOTEIO },
5786 { NFS4ERR_BADTYPE, -EBADTYPE }, 5786 { NFS4ERR_BADTYPE, -EBADTYPE },
5787 { NFS4ERR_LOCKED, -EAGAIN }, 5787 { NFS4ERR_LOCKED, -EAGAIN },
5788 { NFS4ERR_SYMLINK, -ELOOP }, 5788 { NFS4ERR_SYMLINK, -ELOOP },
@@ -5809,7 +5809,7 @@ nfs4_stat_to_errno(int stat)
5809 } 5809 }
5810 if (stat <= 10000 || stat > 10100) { 5810 if (stat <= 10000 || stat > 10100) {
5811 /* The server is looney tunes. */ 5811 /* The server is looney tunes. */
5812 return -ESERVERFAULT; 5812 return -EREMOTEIO;
5813 } 5813 }
5814 /* If we cannot translate the error, the recovery routines should 5814 /* If we cannot translate the error, the recovery routines should
5815 * handle it. 5815 * handle it.
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 412738dbfbc7..2ea9e5c27e55 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -50,7 +50,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
50 struct page *page; 50 struct page *page;
51 void *err; 51 void *err;
52 52
53 err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping)); 53 err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
54 if (err) 54 if (err)
55 goto read_failed; 55 goto read_failed;
56 page = read_cache_page(&inode->i_data, 0, 56 page = read_cache_page(&inode->i_data, 0,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7b54b8bb101f..53ff70e23993 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -438,6 +438,7 @@ nfs_mark_request_commit(struct nfs_page *req)
438 radix_tree_tag_set(&nfsi->nfs_page_tree, 438 radix_tree_tag_set(&nfsi->nfs_page_tree,
439 req->wb_index, 439 req->wb_index,
440 NFS_PAGE_TAG_COMMIT); 440 NFS_PAGE_TAG_COMMIT);
441 nfsi->ncommit++;
441 spin_unlock(&inode->i_lock); 442 spin_unlock(&inode->i_lock);
442 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 443 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
443 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); 444 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
@@ -501,57 +502,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
501} 502}
502#endif 503#endif
503 504
504/*
505 * Wait for a request to complete.
506 *
507 * Interruptible by fatal signals only.
508 */
509static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
510{
511 struct nfs_inode *nfsi = NFS_I(inode);
512 struct nfs_page *req;
513 pgoff_t idx_end, next;
514 unsigned int res = 0;
515 int error;
516
517 if (npages == 0)
518 idx_end = ~0;
519 else
520 idx_end = idx_start + npages - 1;
521
522 next = idx_start;
523 while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
524 if (req->wb_index > idx_end)
525 break;
526
527 next = req->wb_index + 1;
528 BUG_ON(!NFS_WBACK_BUSY(req));
529
530 kref_get(&req->wb_kref);
531 spin_unlock(&inode->i_lock);
532 error = nfs_wait_on_request(req);
533 nfs_release_request(req);
534 spin_lock(&inode->i_lock);
535 if (error < 0)
536 return error;
537 res++;
538 }
539 return res;
540}
541
542static void nfs_cancel_commit_list(struct list_head *head)
543{
544 struct nfs_page *req;
545
546 while(!list_empty(head)) {
547 req = nfs_list_entry(head->next);
548 nfs_list_remove_request(req);
549 nfs_clear_request_commit(req);
550 nfs_inode_remove_request(req);
551 nfs_unlock_request(req);
552 }
553}
554
555#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 505#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
556static int 506static int
557nfs_need_commit(struct nfs_inode *nfsi) 507nfs_need_commit(struct nfs_inode *nfsi)
@@ -573,11 +523,17 @@ static int
573nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 523nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
574{ 524{
575 struct nfs_inode *nfsi = NFS_I(inode); 525 struct nfs_inode *nfsi = NFS_I(inode);
526 int ret;
576 527
577 if (!nfs_need_commit(nfsi)) 528 if (!nfs_need_commit(nfsi))
578 return 0; 529 return 0;
579 530
580 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 531 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
532 if (ret > 0)
533 nfsi->ncommit -= ret;
534 if (nfs_need_commit(NFS_I(inode)))
535 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
536 return ret;
581} 537}
582#else 538#else
583static inline int nfs_need_commit(struct nfs_inode *nfsi) 539static inline int nfs_need_commit(struct nfs_inode *nfsi)
@@ -642,9 +598,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
642 spin_lock(&inode->i_lock); 598 spin_lock(&inode->i_lock);
643 } 599 }
644 600
645 if (nfs_clear_request_commit(req)) 601 if (nfs_clear_request_commit(req) &&
646 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, 602 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
647 req->wb_index, NFS_PAGE_TAG_COMMIT); 603 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL)
604 NFS_I(inode)->ncommit--;
648 605
649 /* Okay, the request matches. Update the region */ 606 /* Okay, the request matches. Update the region */
650 if (offset < req->wb_offset) { 607 if (offset < req->wb_offset) {
@@ -1391,7 +1348,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
1391 .rpc_release = nfs_commit_release, 1348 .rpc_release = nfs_commit_release,
1392}; 1349};
1393 1350
1394int nfs_commit_inode(struct inode *inode, int how) 1351static int nfs_commit_inode(struct inode *inode, int how)
1395{ 1352{
1396 LIST_HEAD(head); 1353 LIST_HEAD(head);
1397 int res; 1354 int res;
@@ -1406,92 +1363,51 @@ int nfs_commit_inode(struct inode *inode, int how)
1406 } 1363 }
1407 return res; 1364 return res;
1408} 1365}
1409#else
1410static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1411{
1412 return 0;
1413}
1414#endif
1415 1366
1416long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) 1367static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1417{ 1368{
1418 struct inode *inode = mapping->host; 1369 struct nfs_inode *nfsi = NFS_I(inode);
1419 pgoff_t idx_start, idx_end; 1370 int flags = FLUSH_SYNC;
1420 unsigned int npages = 0; 1371 int ret = 0;
1421 LIST_HEAD(head); 1372
1422 int nocommit = how & FLUSH_NOCOMMIT; 1373 /* Don't commit yet if this is a non-blocking flush and there are
1423 long pages, ret; 1374 * lots of outstanding writes for this mapping.
1424 1375 */
1425 /* FIXME */ 1376 if (wbc->sync_mode == WB_SYNC_NONE &&
1426 if (wbc->range_cyclic) 1377 nfsi->ncommit <= (nfsi->npages >> 1))
1427 idx_start = 0; 1378 goto out_mark_dirty;
1428 else { 1379
1429 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 1380 if (wbc->nonblocking || wbc->for_background)
1430 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; 1381 flags = 0;
1431 if (idx_end > idx_start) { 1382 ret = nfs_commit_inode(inode, flags);
1432 pgoff_t l_npages = 1 + idx_end - idx_start; 1383 if (ret >= 0) {
1433 npages = l_npages; 1384 if (wbc->sync_mode == WB_SYNC_NONE) {
1434 if (sizeof(npages) != sizeof(l_npages) && 1385 if (ret < wbc->nr_to_write)
1435 (pgoff_t)npages != l_npages) 1386 wbc->nr_to_write -= ret;
1436 npages = 0; 1387 else
1388 wbc->nr_to_write = 0;
1437 } 1389 }
1390 return 0;
1438 } 1391 }
1439 how &= ~FLUSH_NOCOMMIT; 1392out_mark_dirty:
1440 spin_lock(&inode->i_lock); 1393 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1441 do {
1442 ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
1443 if (ret != 0)
1444 continue;
1445 if (nocommit)
1446 break;
1447 pages = nfs_scan_commit(inode, &head, idx_start, npages);
1448 if (pages == 0)
1449 break;
1450 if (how & FLUSH_INVALIDATE) {
1451 spin_unlock(&inode->i_lock);
1452 nfs_cancel_commit_list(&head);
1453 ret = pages;
1454 spin_lock(&inode->i_lock);
1455 continue;
1456 }
1457 pages += nfs_scan_commit(inode, &head, 0, 0);
1458 spin_unlock(&inode->i_lock);
1459 ret = nfs_commit_list(inode, &head, how);
1460 spin_lock(&inode->i_lock);
1461
1462 } while (ret >= 0);
1463 spin_unlock(&inode->i_lock);
1464 return ret; 1394 return ret;
1465} 1395}
1466 1396#else
1467static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) 1397static int nfs_commit_inode(struct inode *inode, int how)
1468{ 1398{
1469 int ret;
1470
1471 ret = nfs_writepages(mapping, wbc);
1472 if (ret < 0)
1473 goto out;
1474 ret = nfs_sync_mapping_wait(mapping, wbc, how);
1475 if (ret < 0)
1476 goto out;
1477 return 0; 1399 return 0;
1478out:
1479 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1480 return ret;
1481} 1400}
1482 1401
1483/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ 1402static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1484static int nfs_write_mapping(struct address_space *mapping, int how)
1485{ 1403{
1486 struct writeback_control wbc = { 1404 return 0;
1487 .bdi = mapping->backing_dev_info, 1405}
1488 .sync_mode = WB_SYNC_ALL, 1406#endif
1489 .nr_to_write = LONG_MAX,
1490 .range_start = 0,
1491 .range_end = LLONG_MAX,
1492 };
1493 1407
1494 return __nfs_write_mapping(mapping, &wbc, how); 1408int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1409{
1410 return nfs_commit_unstable_pages(inode, wbc);
1495} 1411}
1496 1412
1497/* 1413/*
@@ -1499,37 +1415,26 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1499 */ 1415 */
1500int nfs_wb_all(struct inode *inode) 1416int nfs_wb_all(struct inode *inode)
1501{ 1417{
1502 return nfs_write_mapping(inode->i_mapping, 0); 1418 struct writeback_control wbc = {
1503} 1419 .sync_mode = WB_SYNC_ALL,
1420 .nr_to_write = LONG_MAX,
1421 .range_start = 0,
1422 .range_end = LLONG_MAX,
1423 };
1504 1424
1505int nfs_wb_nocommit(struct inode *inode) 1425 return sync_inode(inode, &wbc);
1506{
1507 return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
1508} 1426}
1509 1427
1510int nfs_wb_page_cancel(struct inode *inode, struct page *page) 1428int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1511{ 1429{
1512 struct nfs_page *req; 1430 struct nfs_page *req;
1513 loff_t range_start = page_offset(page);
1514 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1515 struct writeback_control wbc = {
1516 .bdi = page->mapping->backing_dev_info,
1517 .sync_mode = WB_SYNC_ALL,
1518 .nr_to_write = LONG_MAX,
1519 .range_start = range_start,
1520 .range_end = range_end,
1521 };
1522 int ret = 0; 1431 int ret = 0;
1523 1432
1524 BUG_ON(!PageLocked(page)); 1433 BUG_ON(!PageLocked(page));
1525 for (;;) { 1434 for (;;) {
1526 req = nfs_page_find_request(page); 1435 req = nfs_page_find_request(page);
1527 if (req == NULL) 1436 if (req == NULL)
1528 goto out;
1529 if (test_bit(PG_CLEAN, &req->wb_flags)) {
1530 nfs_release_request(req);
1531 break; 1437 break;
1532 }
1533 if (nfs_lock_request_dontget(req)) { 1438 if (nfs_lock_request_dontget(req)) {
1534 nfs_inode_remove_request(req); 1439 nfs_inode_remove_request(req);
1535 /* 1440 /*
@@ -1543,54 +1448,54 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1543 ret = nfs_wait_on_request(req); 1448 ret = nfs_wait_on_request(req);
1544 nfs_release_request(req); 1449 nfs_release_request(req);
1545 if (ret < 0) 1450 if (ret < 0)
1546 goto out; 1451 break;
1547 } 1452 }
1548 if (!PagePrivate(page))
1549 return 0;
1550 ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
1551out:
1552 return ret; 1453 return ret;
1553} 1454}
1554 1455
1555static int nfs_wb_page_priority(struct inode *inode, struct page *page, 1456/*
1556 int how) 1457 * Write back all requests on one page - we do this before reading it.
1458 */
1459int nfs_wb_page(struct inode *inode, struct page *page)
1557{ 1460{
1558 loff_t range_start = page_offset(page); 1461 loff_t range_start = page_offset(page);
1559 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); 1462 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1560 struct writeback_control wbc = { 1463 struct writeback_control wbc = {
1561 .bdi = page->mapping->backing_dev_info,
1562 .sync_mode = WB_SYNC_ALL, 1464 .sync_mode = WB_SYNC_ALL,
1563 .nr_to_write = LONG_MAX, 1465 .nr_to_write = 0,
1564 .range_start = range_start, 1466 .range_start = range_start,
1565 .range_end = range_end, 1467 .range_end = range_end,
1566 }; 1468 };
1469 struct nfs_page *req;
1470 int need_commit;
1567 int ret; 1471 int ret;
1568 1472
1569 do { 1473 while(PagePrivate(page)) {
1570 if (clear_page_dirty_for_io(page)) { 1474 if (clear_page_dirty_for_io(page)) {
1571 ret = nfs_writepage_locked(page, &wbc); 1475 ret = nfs_writepage_locked(page, &wbc);
1572 if (ret < 0) 1476 if (ret < 0)
1573 goto out_error; 1477 goto out_error;
1574 } else if (!PagePrivate(page)) 1478 }
1479 req = nfs_find_and_lock_request(page);
1480 if (!req)
1575 break; 1481 break;
1576 ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); 1482 if (IS_ERR(req)) {
1577 if (ret < 0) 1483 ret = PTR_ERR(req);
1578 goto out_error; 1484 goto out_error;
1579 } while (PagePrivate(page)); 1485 }
1486 need_commit = test_bit(PG_CLEAN, &req->wb_flags);
1487 nfs_clear_page_tag_locked(req);
1488 if (need_commit) {
1489 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1490 if (ret < 0)
1491 goto out_error;
1492 }
1493 }
1580 return 0; 1494 return 0;
1581out_error: 1495out_error:
1582 __mark_inode_dirty(inode, I_DIRTY_PAGES);
1583 return ret; 1496 return ret;
1584} 1497}
1585 1498
1586/*
1587 * Write back all requests on one page - we do this before reading it.
1588 */
1589int nfs_wb_page(struct inode *inode, struct page* page)
1590{
1591 return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
1592}
1593
1594#ifdef CONFIG_MIGRATION 1499#ifdef CONFIG_MIGRATION
1595int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1500int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1596 struct page *page) 1501 struct page *page)
@@ -1598,8 +1503,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1598 struct nfs_page *req; 1503 struct nfs_page *req;
1599 int ret; 1504 int ret;
1600 1505
1601 if (PageFsCache(page)) 1506 nfs_fscache_release_page(page, GFP_KERNEL);
1602 nfs_fscache_release_page(page, GFP_KERNEL);
1603 1507
1604 req = nfs_find_and_lock_request(page); 1508 req = nfs_find_and_lock_request(page);
1605 ret = PTR_ERR(req); 1509 ret = PTR_ERR(req);
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index d3854d94b7cf..bf9cbd242ddd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -36,10 +36,9 @@ static struct file *do_open(char *name, int flags)
36 return ERR_PTR(error); 36 return ERR_PTR(error);
37 37
38 if (flags == O_RDWR) 38 if (flags == O_RDWR)
39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, 39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
40 FMODE_READ|FMODE_WRITE);
41 else 40 else
42 error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE); 41 error = may_open(&nd.path, MAY_WRITE, flags);
43 42
44 if (!error) 43 if (!error)
45 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 44 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a8587e90fd5a..bbf72d8f9fc0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2121,9 +2121,15 @@ out_acl:
2121 * and this is the root of a cross-mounted filesystem. 2121 * and this is the root of a cross-mounted filesystem.
2122 */ 2122 */
2123 if (ignore_crossmnt == 0 && 2123 if (ignore_crossmnt == 0 &&
2124 exp->ex_path.mnt->mnt_root->d_inode == dentry->d_inode) { 2124 dentry == exp->ex_path.mnt->mnt_root) {
2125 err = vfs_getattr(exp->ex_path.mnt->mnt_parent, 2125 struct path path = exp->ex_path;
2126 exp->ex_path.mnt->mnt_mountpoint, &stat); 2126 path_get(&path);
2127 while (follow_up(&path)) {
2128 if (path.dentry != path.mnt->mnt_root)
2129 break;
2130 }
2131 err = vfs_getattr(path.mnt, path.dentry, &stat);
2132 path_put(&path);
2127 if (err) 2133 if (err)
2128 goto out_nfserr; 2134 goto out_nfserr;
2129 } 2135 }
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 97d79eff6b7f..15dc2deaac5f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -361,7 +361,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
361 * If we are changing the size of the file, then 361 * If we are changing the size of the file, then
362 * we need to break all leases. 362 * we need to break all leases.
363 */ 363 */
364 host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); 364 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
365 if (host_err == -EWOULDBLOCK) 365 if (host_err == -EWOULDBLOCK)
366 host_err = -ETIMEDOUT; 366 host_err = -ETIMEDOUT;
367 if (host_err) /* ENOMEM or EWOULDBLOCK */ 367 if (host_err) /* ENOMEM or EWOULDBLOCK */
@@ -734,7 +734,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
734 * Check to see if there are any leases on this file. 734 * Check to see if there are any leases on this file.
735 * This may block while leases are broken. 735 * This may block while leases are broken.
736 */ 736 */
737 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0)); 737 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
738 if (host_err == -EWOULDBLOCK) 738 if (host_err == -EWOULDBLOCK)
739 host_err = -ETIMEDOUT; 739 host_err = -ETIMEDOUT;
740 if (host_err) /* NOMEM or WOULDBLOCK */ 740 if (host_err) /* NOMEM or WOULDBLOCK */
@@ -752,7 +752,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
752 flags, current_cred()); 752 flags, current_cred());
753 if (IS_ERR(*filp)) 753 if (IS_ERR(*filp))
754 host_err = PTR_ERR(*filp); 754 host_err = PTR_ERR(*filp);
755 host_err = ima_file_check(*filp, access); 755 else
756 host_err = ima_file_check(*filp, access);
756out_nfserr: 757out_nfserr:
757 err = nfserrno(host_err); 758 err = nfserrno(host_err);
758out: 759out:
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 187dd07ba86c..9d1e5de91afb 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -388,8 +388,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
388 ret = -ENOENT; 388 ret = -ENOENT;
389 goto out; 389 goto out;
390 } 390 }
391 if (blocknrp != NULL) 391 *blocknrp = blocknr;
392 *blocknrp = blocknr;
393 392
394 out: 393 out:
395 kunmap_atomic(kaddr, KM_USER0); 394 kunmap_atomic(kaddr, KM_USER0);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 76d803e060a9..0092840492ee 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -224,7 +224,7 @@ fail:
224 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. 224 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
225 */ 225 */
226static int 226static int
227nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de) 227nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
228{ 228{
229 if (len != de->name_len) 229 if (len != de->name_len)
230 return 0; 230 return 0;
@@ -349,11 +349,11 @@ done:
349 * Entry is guaranteed to be valid. 349 * Entry is guaranteed to be valid.
350 */ 350 */
351struct nilfs_dir_entry * 351struct nilfs_dir_entry *
352nilfs_find_entry(struct inode *dir, struct dentry *dentry, 352nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
353 struct page **res_page) 353 struct page **res_page)
354{ 354{
355 const char *name = dentry->d_name.name; 355 const unsigned char *name = qstr->name;
356 int namelen = dentry->d_name.len; 356 int namelen = qstr->len;
357 unsigned reclen = NILFS_DIR_REC_LEN(namelen); 357 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
358 unsigned long start, n; 358 unsigned long start, n;
359 unsigned long npages = dir_pages(dir); 359 unsigned long npages = dir_pages(dir);
@@ -424,13 +424,13 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
424 return de; 424 return de;
425} 425}
426 426
427ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry) 427ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
428{ 428{
429 ino_t res = 0; 429 ino_t res = 0;
430 struct nilfs_dir_entry *de; 430 struct nilfs_dir_entry *de;
431 struct page *page; 431 struct page *page;
432 432
433 de = nilfs_find_entry(dir, dentry, &page); 433 de = nilfs_find_entry(dir, qstr, &page);
434 if (de) { 434 if (de) {
435 res = le64_to_cpu(de->inode); 435 res = le64_to_cpu(de->inode);
436 kunmap(page); 436 kunmap(page);
@@ -465,7 +465,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
465int nilfs_add_link(struct dentry *dentry, struct inode *inode) 465int nilfs_add_link(struct dentry *dentry, struct inode *inode)
466{ 466{
467 struct inode *dir = dentry->d_parent->d_inode; 467 struct inode *dir = dentry->d_parent->d_inode;
468 const char *name = dentry->d_name.name; 468 const unsigned char *name = dentry->d_name.name;
469 int namelen = dentry->d_name.len; 469 int namelen = dentry->d_name.len;
470 unsigned chunk_size = nilfs_chunk_size(dir); 470 unsigned chunk_size = nilfs_chunk_size(dir);
471 unsigned reclen = NILFS_DIR_REC_LEN(namelen); 471 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6b2b83de363..313d0a21da48 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -26,6 +26,7 @@
26#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */
29#include <linux/nilfs2_fs.h> 30#include <linux/nilfs2_fs.h>
30#include "nilfs.h" 31#include "nilfs.h"
31#include "segment.h" 32#include "segment.h"
@@ -107,20 +108,28 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
107 108
108 if (!capable(CAP_SYS_ADMIN)) 109 if (!capable(CAP_SYS_ADMIN))
109 return -EPERM; 110 return -EPERM;
111
112 ret = mnt_want_write(filp->f_path.mnt);
113 if (ret)
114 return ret;
115
116 ret = -EFAULT;
110 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 117 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
111 return -EFAULT; 118 goto out;
112 119
113 mutex_lock(&nilfs->ns_mount_mutex); 120 mutex_lock(&nilfs->ns_mount_mutex);
121
114 nilfs_transaction_begin(inode->i_sb, &ti, 0); 122 nilfs_transaction_begin(inode->i_sb, &ti, 0);
115 ret = nilfs_cpfile_change_cpmode( 123 ret = nilfs_cpfile_change_cpmode(
116 cpfile, cpmode.cm_cno, cpmode.cm_mode); 124 cpfile, cpmode.cm_cno, cpmode.cm_mode);
117 if (unlikely(ret < 0)) { 125 if (unlikely(ret < 0))
118 nilfs_transaction_abort(inode->i_sb); 126 nilfs_transaction_abort(inode->i_sb);
119 mutex_unlock(&nilfs->ns_mount_mutex); 127 else
120 return ret; 128 nilfs_transaction_commit(inode->i_sb); /* never fails */
121 } 129
122 nilfs_transaction_commit(inode->i_sb); /* never fails */
123 mutex_unlock(&nilfs->ns_mount_mutex); 130 mutex_unlock(&nilfs->ns_mount_mutex);
131out:
132 mnt_drop_write(filp->f_path.mnt);
124 return ret; 133 return ret;
125} 134}
126 135
@@ -135,16 +144,23 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
135 144
136 if (!capable(CAP_SYS_ADMIN)) 145 if (!capable(CAP_SYS_ADMIN))
137 return -EPERM; 146 return -EPERM;
147
148 ret = mnt_want_write(filp->f_path.mnt);
149 if (ret)
150 return ret;
151
152 ret = -EFAULT;
138 if (copy_from_user(&cno, argp, sizeof(cno))) 153 if (copy_from_user(&cno, argp, sizeof(cno)))
139 return -EFAULT; 154 goto out;
140 155
141 nilfs_transaction_begin(inode->i_sb, &ti, 0); 156 nilfs_transaction_begin(inode->i_sb, &ti, 0);
142 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); 157 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
143 if (unlikely(ret < 0)) { 158 if (unlikely(ret < 0))
144 nilfs_transaction_abort(inode->i_sb); 159 nilfs_transaction_abort(inode->i_sb);
145 return ret; 160 else
146 } 161 nilfs_transaction_commit(inode->i_sb); /* never fails */
147 nilfs_transaction_commit(inode->i_sb); /* never fails */ 162out:
163 mnt_drop_write(filp->f_path.mnt);
148 return ret; 164 return ret;
149} 165}
150 166
@@ -496,12 +512,19 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
496 if (!capable(CAP_SYS_ADMIN)) 512 if (!capable(CAP_SYS_ADMIN))
497 return -EPERM; 513 return -EPERM;
498 514
515 ret = mnt_want_write(filp->f_path.mnt);
516 if (ret)
517 return ret;
518
519 ret = -EFAULT;
499 if (copy_from_user(argv, argp, sizeof(argv))) 520 if (copy_from_user(argv, argp, sizeof(argv)))
500 return -EFAULT; 521 goto out;
501 522
523 ret = -EINVAL;
502 nsegs = argv[4].v_nmembs; 524 nsegs = argv[4].v_nmembs;
503 if (argv[4].v_size != argsz[4]) 525 if (argv[4].v_size != argsz[4])
504 return -EINVAL; 526 goto out;
527
505 /* 528 /*
506 * argv[4] points to segment numbers this ioctl cleans. We 529 * argv[4] points to segment numbers this ioctl cleans. We
507 * use kmalloc() for its buffer because memory used for the 530 * use kmalloc() for its buffer because memory used for the
@@ -509,9 +532,10 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
509 */ 532 */
510 kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, 533 kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
511 nsegs * sizeof(__u64)); 534 nsegs * sizeof(__u64));
512 if (IS_ERR(kbufs[4])) 535 if (IS_ERR(kbufs[4])) {
513 return PTR_ERR(kbufs[4]); 536 ret = PTR_ERR(kbufs[4]);
514 537 goto out;
538 }
515 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 539 nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
516 540
517 for (n = 0; n < 4; n++) { 541 for (n = 0; n < 4; n++) {
@@ -563,10 +587,12 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
563 nilfs_remove_all_gcinode(nilfs); 587 nilfs_remove_all_gcinode(nilfs);
564 clear_nilfs_gc_running(nilfs); 588 clear_nilfs_gc_running(nilfs);
565 589
566 out_free: 590out_free:
567 while (--n >= 0) 591 while (--n >= 0)
568 vfree(kbufs[n]); 592 vfree(kbufs[n]);
569 kfree(kbufs[4]); 593 kfree(kbufs[4]);
594out:
595 mnt_drop_write(filp->f_path.mnt);
570 return ret; 596 return ret;
571} 597}
572 598
@@ -575,13 +601,17 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
575{ 601{
576 __u64 cno; 602 __u64 cno;
577 int ret; 603 int ret;
604 struct the_nilfs *nilfs;
578 605
579 ret = nilfs_construct_segment(inode->i_sb); 606 ret = nilfs_construct_segment(inode->i_sb);
580 if (ret < 0) 607 if (ret < 0)
581 return ret; 608 return ret;
582 609
583 if (argp != NULL) { 610 if (argp != NULL) {
584 cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1; 611 nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
612 down_read(&nilfs->ns_segctor_sem);
613 cno = nilfs->ns_cno - 1;
614 up_read(&nilfs->ns_segctor_sem);
585 if (copy_to_user(argp, &cno, sizeof(cno))) 615 if (copy_to_user(argp, &cno, sizeof(cno)))
586 return -EFAULT; 616 return -EFAULT;
587 } 617 }
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 07ba838ef089..ad6ed2cf19b4 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -67,7 +67,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
67 if (dentry->d_name.len > NILFS_NAME_LEN) 67 if (dentry->d_name.len > NILFS_NAME_LEN)
68 return ERR_PTR(-ENAMETOOLONG); 68 return ERR_PTR(-ENAMETOOLONG);
69 69
70 ino = nilfs_inode_by_name(dir, dentry); 70 ino = nilfs_inode_by_name(dir, &dentry->d_name);
71 inode = NULL; 71 inode = NULL;
72 if (ino) { 72 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino); 73 inode = nilfs_iget(dir->i_sb, ino);
@@ -81,10 +81,7 @@ struct dentry *nilfs_get_parent(struct dentry *child)
81{ 81{
82 unsigned long ino; 82 unsigned long ino;
83 struct inode *inode; 83 struct inode *inode;
84 struct dentry dotdot; 84 struct qstr dotdot = {.name = "..", .len = 2};
85
86 dotdot.d_name.name = "..";
87 dotdot.d_name.len = 2;
88 85
89 ino = nilfs_inode_by_name(child->d_inode, &dotdot); 86 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
90 if (!ino) 87 if (!ino)
@@ -296,7 +293,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
296 int err; 293 int err;
297 294
298 err = -ENOENT; 295 err = -ENOENT;
299 de = nilfs_find_entry(dir, dentry, &page); 296 de = nilfs_find_entry(dir, &dentry->d_name, &page);
300 if (!de) 297 if (!de)
301 goto out; 298 goto out;
302 299
@@ -389,7 +386,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
389 return err; 386 return err;
390 387
391 err = -ENOENT; 388 err = -ENOENT;
392 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); 389 old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
393 if (!old_de) 390 if (!old_de)
394 goto out; 391 goto out;
395 392
@@ -409,7 +406,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
409 goto out_dir; 406 goto out_dir;
410 407
411 err = -ENOENT; 408 err = -ENOENT;
412 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); 409 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
413 if (!new_de) 410 if (!new_de)
414 goto out_dir; 411 goto out_dir;
415 inc_nlink(old_inode); 412 inc_nlink(old_inode);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 4da6f67e9a91..8723e5bfd071 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -217,10 +217,10 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
217 217
218/* dir.c */ 218/* dir.c */
219extern int nilfs_add_link(struct dentry *, struct inode *); 219extern int nilfs_add_link(struct dentry *, struct inode *);
220extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *); 220extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
221extern int nilfs_make_empty(struct inode *, struct inode *); 221extern int nilfs_make_empty(struct inode *, struct inode *);
222extern struct nilfs_dir_entry * 222extern struct nilfs_dir_entry *
223nilfs_find_entry(struct inode *, struct dentry *, struct page **); 223nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); 224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
225extern int nilfs_empty_dir(struct inode *); 225extern int nilfs_empty_dir(struct inode *);
226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); 226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index c9c96c7825dc..017bedc761a0 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -39,7 +39,6 @@ enum {
39 NILFS_SEG_FAIL_IO, 39 NILFS_SEG_FAIL_IO,
40 NILFS_SEG_FAIL_MAGIC, 40 NILFS_SEG_FAIL_MAGIC,
41 NILFS_SEG_FAIL_SEQ, 41 NILFS_SEG_FAIL_SEQ,
42 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
43 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, 42 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
44 NILFS_SEG_FAIL_CHECKSUM_FULL, 43 NILFS_SEG_FAIL_CHECKSUM_FULL,
45 NILFS_SEG_FAIL_CONSISTENCY, 44 NILFS_SEG_FAIL_CONSISTENCY,
@@ -71,10 +70,6 @@ static int nilfs_warn_segment_error(int err)
71 printk(KERN_WARNING 70 printk(KERN_WARNING
72 "NILFS warning: Sequence number mismatch\n"); 71 "NILFS warning: Sequence number mismatch\n");
73 break; 72 break;
74 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
75 printk(KERN_WARNING
76 "NILFS warning: Checksum error in segment summary\n");
77 break;
78 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: 73 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
79 printk(KERN_WARNING 74 printk(KERN_WARNING
80 "NILFS warning: Checksum error in super root\n"); 75 "NILFS warning: Checksum error in super root\n");
@@ -206,19 +201,15 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
206 * @pseg_start: start disk block number of partial segment 201 * @pseg_start: start disk block number of partial segment
207 * @seg_seq: sequence number requested 202 * @seg_seq: sequence number requested
208 * @ssi: pointer to nilfs_segsum_info struct to store information 203 * @ssi: pointer to nilfs_segsum_info struct to store information
209 * @full_check: full check flag
210 * (0: only checks segment summary CRC, 1: data CRC)
211 */ 204 */
212static int 205static int
213load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, 206load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
214 u64 seg_seq, struct nilfs_segsum_info *ssi, 207 u64 seg_seq, struct nilfs_segsum_info *ssi)
215 int full_check)
216{ 208{
217 struct buffer_head *bh_sum; 209 struct buffer_head *bh_sum;
218 struct nilfs_segment_summary *sum; 210 struct nilfs_segment_summary *sum;
219 unsigned long offset, nblock; 211 unsigned long nblock;
220 u64 check_bytes; 212 u32 crc;
221 u32 crc, crc_sum;
222 int ret = NILFS_SEG_FAIL_IO; 213 int ret = NILFS_SEG_FAIL_IO;
223 214
224 bh_sum = sb_bread(sbi->s_super, pseg_start); 215 bh_sum = sb_bread(sbi->s_super, pseg_start);
@@ -237,34 +228,24 @@ load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
237 ret = NILFS_SEG_FAIL_SEQ; 228 ret = NILFS_SEG_FAIL_SEQ;
238 goto failed; 229 goto failed;
239 } 230 }
240 if (full_check) {
241 offset = sizeof(sum->ss_datasum);
242 check_bytes =
243 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
244 nblock = ssi->nblocks;
245 crc_sum = le32_to_cpu(sum->ss_datasum);
246 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
247 } else { /* only checks segment summary */
248 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
249 check_bytes = ssi->sumbytes;
250 nblock = ssi->nsumblk;
251 crc_sum = le32_to_cpu(sum->ss_sumsum);
252 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
253 }
254 231
232 nblock = ssi->nblocks;
255 if (unlikely(nblock == 0 || 233 if (unlikely(nblock == 0 ||
256 nblock > sbi->s_nilfs->ns_blocks_per_segment)) { 234 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
257 /* This limits the number of blocks read in the CRC check */ 235 /* This limits the number of blocks read in the CRC check */
258 ret = NILFS_SEG_FAIL_CONSISTENCY; 236 ret = NILFS_SEG_FAIL_CONSISTENCY;
259 goto failed; 237 goto failed;
260 } 238 }
261 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, 239 if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
240 ((u64)nblock << sbi->s_super->s_blocksize_bits),
262 pseg_start, nblock)) { 241 pseg_start, nblock)) {
263 ret = NILFS_SEG_FAIL_IO; 242 ret = NILFS_SEG_FAIL_IO;
264 goto failed; 243 goto failed;
265 } 244 }
266 if (crc == crc_sum) 245 if (crc == le32_to_cpu(sum->ss_datasum))
267 ret = 0; 246 ret = 0;
247 else
248 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
268 failed: 249 failed:
269 brelse(bh_sum); 250 brelse(bh_sum);
270 out: 251 out:
@@ -598,7 +579,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
598 579
599 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { 580 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
600 581
601 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 582 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
602 if (ret) { 583 if (ret) {
603 if (ret == NILFS_SEG_FAIL_IO) { 584 if (ret == NILFS_SEG_FAIL_IO) {
604 err = -EIO; 585 err = -EIO;
@@ -821,7 +802,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
821 802
822 for (;;) { 803 for (;;) {
823 /* Load segment summary */ 804 /* Load segment summary */
824 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 805 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
825 if (ret) { 806 if (ret) {
826 if (ret == NILFS_SEG_FAIL_IO) 807 if (ret == NILFS_SEG_FAIL_IO)
827 goto failed; 808 goto failed;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 645c78656aa0..ab56fe44e377 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -40,6 +40,11 @@ struct nilfs_write_info {
40}; 40};
41 41
42 42
43static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
44 struct the_nilfs *nilfs);
45static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
46
47
43static struct kmem_cache *nilfs_segbuf_cachep; 48static struct kmem_cache *nilfs_segbuf_cachep;
44 49
45static void nilfs_segbuf_init_once(void *obj) 50static void nilfs_segbuf_init_once(void *obj)
@@ -302,6 +307,19 @@ void nilfs_truncate_logs(struct list_head *logs,
302 } 307 }
303} 308}
304 309
310int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
311{
312 struct nilfs_segment_buffer *segbuf;
313 int ret = 0;
314
315 list_for_each_entry(segbuf, logs, sb_list) {
316 ret = nilfs_segbuf_write(segbuf, nilfs);
317 if (ret)
318 break;
319 }
320 return ret;
321}
322
305int nilfs_wait_on_logs(struct list_head *logs) 323int nilfs_wait_on_logs(struct list_head *logs)
306{ 324{
307 struct nilfs_segment_buffer *segbuf; 325 struct nilfs_segment_buffer *segbuf;
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 6af1630fb401..94dfd3517bc0 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -166,13 +166,10 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
166 segbuf->sb_sum.nfileblk++; 166 segbuf->sb_sum.nfileblk++;
167} 167}
168 168
169int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
170 struct the_nilfs *nilfs);
171int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
172
173void nilfs_clear_logs(struct list_head *logs); 169void nilfs_clear_logs(struct list_head *logs);
174void nilfs_truncate_logs(struct list_head *logs, 170void nilfs_truncate_logs(struct list_head *logs,
175 struct nilfs_segment_buffer *last); 171 struct nilfs_segment_buffer *last);
172int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
176int nilfs_wait_on_logs(struct list_head *logs); 173int nilfs_wait_on_logs(struct list_head *logs);
177 174
178static inline void nilfs_destroy_logs(struct list_head *logs) 175static inline void nilfs_destroy_logs(struct list_head *logs)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 105b508b47a8..ada2f1b947a3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1764,14 +1764,9 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1764static int nilfs_segctor_write(struct nilfs_sc_info *sci, 1764static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1765 struct the_nilfs *nilfs) 1765 struct the_nilfs *nilfs)
1766{ 1766{
1767 struct nilfs_segment_buffer *segbuf; 1767 int ret;
1768 int ret = 0;
1769 1768
1770 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1769 ret = nilfs_write_logs(&sci->sc_segbufs, nilfs);
1771 ret = nilfs_segbuf_write(segbuf, nilfs);
1772 if (ret)
1773 break;
1774 }
1775 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); 1770 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
1776 return ret; 1771 return ret;
1777} 1772}
@@ -1937,8 +1932,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1937{ 1932{
1938 struct nilfs_segment_buffer *segbuf; 1933 struct nilfs_segment_buffer *segbuf;
1939 struct page *bd_page = NULL, *fs_page = NULL; 1934 struct page *bd_page = NULL, *fs_page = NULL;
1940 struct nilfs_sb_info *sbi = sci->sc_sbi; 1935 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1941 struct the_nilfs *nilfs = sbi->s_nilfs;
1942 int update_sr = (sci->sc_super_root != NULL); 1936 int update_sr = (sci->sc_super_root != NULL);
1943 1937
1944 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1938 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -2020,7 +2014,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
2020 if (update_sr) { 2014 if (update_sr) {
2021 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, 2015 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2022 segbuf->sb_sum.seg_seq, nilfs->ns_cno++); 2016 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2023 sbi->s_super->s_dirt = 1; 2017 set_nilfs_sb_dirty(nilfs);
2024 2018
2025 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 2019 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2026 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2020 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2425,43 +2419,43 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2425 return err; 2419 return err;
2426} 2420}
2427 2421
2428struct nilfs_segctor_req {
2429 int mode;
2430 __u32 seq_accepted;
2431 int sc_err; /* construction failure */
2432 int sb_err; /* super block writeback failure */
2433};
2434
2435#define FLUSH_FILE_BIT (0x1) /* data file only */ 2422#define FLUSH_FILE_BIT (0x1) /* data file only */
2436#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ 2423#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
2437 2424
2438static void nilfs_segctor_accept(struct nilfs_sc_info *sci, 2425/**
2439 struct nilfs_segctor_req *req) 2426 * nilfs_segctor_accept - record accepted sequence count of log-write requests
2427 * @sci: segment constructor object
2428 */
2429static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2440{ 2430{
2441 req->sc_err = req->sb_err = 0;
2442 spin_lock(&sci->sc_state_lock); 2431 spin_lock(&sci->sc_state_lock);
2443 req->seq_accepted = sci->sc_seq_request; 2432 sci->sc_seq_accepted = sci->sc_seq_request;
2444 spin_unlock(&sci->sc_state_lock); 2433 spin_unlock(&sci->sc_state_lock);
2445 2434
2446 if (sci->sc_timer) 2435 if (sci->sc_timer)
2447 del_timer_sync(sci->sc_timer); 2436 del_timer_sync(sci->sc_timer);
2448} 2437}
2449 2438
2450static void nilfs_segctor_notify(struct nilfs_sc_info *sci, 2439/**
2451 struct nilfs_segctor_req *req) 2440 * nilfs_segctor_notify - notify the result of request to caller threads
2441 * @sci: segment constructor object
2442 * @mode: mode of log forming
2443 * @err: error code to be notified
2444 */
2445static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2452{ 2446{
2453 /* Clear requests (even when the construction failed) */ 2447 /* Clear requests (even when the construction failed) */
2454 spin_lock(&sci->sc_state_lock); 2448 spin_lock(&sci->sc_state_lock);
2455 2449
2456 if (req->mode == SC_LSEG_SR) { 2450 if (mode == SC_LSEG_SR) {
2457 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; 2451 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2458 sci->sc_seq_done = req->seq_accepted; 2452 sci->sc_seq_done = sci->sc_seq_accepted;
2459 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); 2453 nilfs_segctor_wakeup(sci, err);
2460 sci->sc_flush_request = 0; 2454 sci->sc_flush_request = 0;
2461 } else { 2455 } else {
2462 if (req->mode == SC_FLUSH_FILE) 2456 if (mode == SC_FLUSH_FILE)
2463 sci->sc_flush_request &= ~FLUSH_FILE_BIT; 2457 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2464 else if (req->mode == SC_FLUSH_DAT) 2458 else if (mode == SC_FLUSH_DAT)
2465 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2459 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2466 2460
2467 /* re-enable timer if checkpoint creation was not done */ 2461 /* re-enable timer if checkpoint creation was not done */
@@ -2472,30 +2466,37 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2472 spin_unlock(&sci->sc_state_lock); 2466 spin_unlock(&sci->sc_state_lock);
2473} 2467}
2474 2468
2475static int nilfs_segctor_construct(struct nilfs_sc_info *sci, 2469/**
2476 struct nilfs_segctor_req *req) 2470 * nilfs_segctor_construct - form logs and write them to disk
2471 * @sci: segment constructor object
2472 * @mode: mode of log forming
2473 */
2474static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2477{ 2475{
2478 struct nilfs_sb_info *sbi = sci->sc_sbi; 2476 struct nilfs_sb_info *sbi = sci->sc_sbi;
2479 struct the_nilfs *nilfs = sbi->s_nilfs; 2477 struct the_nilfs *nilfs = sbi->s_nilfs;
2480 int err = 0; 2478 int err = 0;
2481 2479
2480 nilfs_segctor_accept(sci);
2481
2482 if (nilfs_discontinued(nilfs)) 2482 if (nilfs_discontinued(nilfs))
2483 req->mode = SC_LSEG_SR; 2483 mode = SC_LSEG_SR;
2484 if (!nilfs_segctor_confirm(sci)) { 2484 if (!nilfs_segctor_confirm(sci))
2485 err = nilfs_segctor_do_construct(sci, req->mode); 2485 err = nilfs_segctor_do_construct(sci, mode);
2486 req->sc_err = err; 2486
2487 }
2488 if (likely(!err)) { 2487 if (likely(!err)) {
2489 if (req->mode != SC_FLUSH_DAT) 2488 if (mode != SC_FLUSH_DAT)
2490 atomic_set(&nilfs->ns_ndirtyblks, 0); 2489 atomic_set(&nilfs->ns_ndirtyblks, 0);
2491 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2490 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2492 nilfs_discontinued(nilfs)) { 2491 nilfs_discontinued(nilfs)) {
2493 down_write(&nilfs->ns_sem); 2492 down_write(&nilfs->ns_sem);
2494 req->sb_err = nilfs_commit_super(sbi, 2493 err = nilfs_commit_super(
2495 nilfs_altsb_need_update(nilfs)); 2494 sbi, nilfs_altsb_need_update(nilfs));
2496 up_write(&nilfs->ns_sem); 2495 up_write(&nilfs->ns_sem);
2497 } 2496 }
2498 } 2497 }
2498
2499 nilfs_segctor_notify(sci, mode, err);
2499 return err; 2500 return err;
2500} 2501}
2501 2502
@@ -2526,7 +2527,6 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2526 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2527 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2527 struct the_nilfs *nilfs = sbi->s_nilfs; 2528 struct the_nilfs *nilfs = sbi->s_nilfs;
2528 struct nilfs_transaction_info ti; 2529 struct nilfs_transaction_info ti;
2529 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2530 int err; 2530 int err;
2531 2531
2532 if (unlikely(!sci)) 2532 if (unlikely(!sci))
@@ -2547,10 +2547,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2547 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); 2547 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
2548 2548
2549 for (;;) { 2549 for (;;) {
2550 nilfs_segctor_accept(sci, &req); 2550 err = nilfs_segctor_construct(sci, SC_LSEG_SR);
2551 err = nilfs_segctor_construct(sci, &req);
2552 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); 2551 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2553 nilfs_segctor_notify(sci, &req);
2554 2552
2555 if (likely(!err)) 2553 if (likely(!err))
2556 break; 2554 break;
@@ -2560,6 +2558,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2560 set_current_state(TASK_INTERRUPTIBLE); 2558 set_current_state(TASK_INTERRUPTIBLE);
2561 schedule_timeout(sci->sc_interval); 2559 schedule_timeout(sci->sc_interval);
2562 } 2560 }
2561 if (nilfs_test_opt(sbi, DISCARD)) {
2562 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2563 sci->sc_nfreesegs);
2564 if (ret) {
2565 printk(KERN_WARNING
2566 "NILFS warning: error %d on discard request, "
2567 "turning discards off for the device\n", ret);
2568 nilfs_clear_opt(sbi, DISCARD);
2569 }
2570 }
2563 2571
2564 out_unlock: 2572 out_unlock:
2565 sci->sc_freesegs = NULL; 2573 sci->sc_freesegs = NULL;
@@ -2573,13 +2581,9 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2573{ 2581{
2574 struct nilfs_sb_info *sbi = sci->sc_sbi; 2582 struct nilfs_sb_info *sbi = sci->sc_sbi;
2575 struct nilfs_transaction_info ti; 2583 struct nilfs_transaction_info ti;
2576 struct nilfs_segctor_req req = { .mode = mode };
2577 2584
2578 nilfs_transaction_lock(sbi, &ti, 0); 2585 nilfs_transaction_lock(sbi, &ti, 0);
2579 2586 nilfs_segctor_construct(sci, mode);
2580 nilfs_segctor_accept(sci, &req);
2581 nilfs_segctor_construct(sci, &req);
2582 nilfs_segctor_notify(sci, &req);
2583 2587
2584 /* 2588 /*
2585 * Unclosed segment should be retried. We do this using sc_timer. 2589 * Unclosed segment should be retried. We do this using sc_timer.
@@ -2635,6 +2639,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2635static int nilfs_segctor_thread(void *arg) 2639static int nilfs_segctor_thread(void *arg)
2636{ 2640{
2637 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2638 struct timer_list timer; 2643 struct timer_list timer;
2639 int timeout = 0; 2644 int timeout = 0;
2640 2645
@@ -2680,7 +2685,6 @@ static int nilfs_segctor_thread(void *arg)
2680 } else { 2685 } else {
2681 DEFINE_WAIT(wait); 2686 DEFINE_WAIT(wait);
2682 int should_sleep = 1; 2687 int should_sleep = 1;
2683 struct the_nilfs *nilfs;
2684 2688
2685 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2689 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2686 TASK_INTERRUPTIBLE); 2690 TASK_INTERRUPTIBLE);
@@ -2701,8 +2705,8 @@ static int nilfs_segctor_thread(void *arg)
2701 finish_wait(&sci->sc_wait_daemon, &wait); 2705 finish_wait(&sci->sc_wait_daemon, &wait);
2702 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2703 time_after_eq(jiffies, sci->sc_timer->expires)); 2707 time_after_eq(jiffies, sci->sc_timer->expires));
2704 nilfs = sci->sc_sbi->s_nilfs; 2708
2705 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) 2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2706 set_nilfs_discontinued(nilfs); 2710 set_nilfs_discontinued(nilfs);
2707 } 2711 }
2708 goto loop; 2712 goto loop;
@@ -2797,12 +2801,9 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2797 do { 2801 do {
2798 struct nilfs_sb_info *sbi = sci->sc_sbi; 2802 struct nilfs_sb_info *sbi = sci->sc_sbi;
2799 struct nilfs_transaction_info ti; 2803 struct nilfs_transaction_info ti;
2800 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2801 2804
2802 nilfs_transaction_lock(sbi, &ti, 0); 2805 nilfs_transaction_lock(sbi, &ti, 0);
2803 nilfs_segctor_accept(sci, &req); 2806 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2804 ret = nilfs_segctor_construct(sci, &req);
2805 nilfs_segctor_notify(sci, &req);
2806 nilfs_transaction_unlock(sbi); 2807 nilfs_transaction_unlock(sbi);
2807 2808
2808 } while (ret && retrycount-- > 0); 2809 } while (ret && retrycount-- > 0);
@@ -2865,8 +2866,15 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2865 struct the_nilfs *nilfs = sbi->s_nilfs; 2866 struct the_nilfs *nilfs = sbi->s_nilfs;
2866 int err; 2867 int err;
2867 2868
2868 /* Each field of nilfs_segctor is cleared through the initialization 2869 if (NILFS_SC(sbi)) {
2869 of super-block info */ 2870 /*
2871 * This happens if the filesystem was remounted
2872 * read/write after nilfs_error degenerated it into a
2873 * read-only mount.
2874 */
2875 nilfs_detach_segment_constructor(sbi);
2876 }
2877
2870 sbi->s_sc_info = nilfs_segctor_new(sbi); 2878 sbi->s_sc_info = nilfs_segctor_new(sbi);
2871 if (!sbi->s_sc_info) 2879 if (!sbi->s_sc_info)
2872 return -ENOMEM; 2880 return -ENOMEM;
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3d3ab2f9864c..3155e0c7f415 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -116,6 +116,7 @@ struct nilfs_segsum_pointer {
116 * @sc_wait_daemon: Daemon wait queue 116 * @sc_wait_daemon: Daemon wait queue
117 * @sc_wait_task: Start/end wait queue to control segctord task 117 * @sc_wait_task: Start/end wait queue to control segctord task
118 * @sc_seq_request: Request counter 118 * @sc_seq_request: Request counter
119 * @sc_seq_accept: Accepted request count
119 * @sc_seq_done: Completion counter 120 * @sc_seq_done: Completion counter
120 * @sc_sync: Request of explicit sync operation 121 * @sc_sync: Request of explicit sync operation
121 * @sc_interval: Timeout value of background construction 122 * @sc_interval: Timeout value of background construction
@@ -169,6 +170,7 @@ struct nilfs_sc_info {
169 wait_queue_head_t sc_wait_task; 170 wait_queue_head_t sc_wait_task;
170 171
171 __u32 sc_seq_request; 172 __u32 sc_seq_request;
173 __u32 sc_seq_accepted;
172 __u32 sc_seq_done; 174 __u32 sc_seq_done;
173 175
174 int sc_sync; 176 int sc_sync;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8173faee31e6..92579cc4c935 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -96,9 +96,6 @@ void nilfs_error(struct super_block *sb, const char *function,
96 if (!(sb->s_flags & MS_RDONLY)) { 96 if (!(sb->s_flags & MS_RDONLY)) {
97 struct the_nilfs *nilfs = sbi->s_nilfs; 97 struct the_nilfs *nilfs = sbi->s_nilfs;
98 98
99 if (!nilfs_test_opt(sbi, ERRORS_CONT))
100 nilfs_detach_segment_constructor(sbi);
101
102 down_write(&nilfs->ns_sem); 99 down_write(&nilfs->ns_sem);
103 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 100 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
104 nilfs->ns_mount_state |= NILFS_ERROR_FS; 101 nilfs->ns_mount_state |= NILFS_ERROR_FS;
@@ -301,7 +298,7 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
301 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 298 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
302 nilfs->ns_sbwtime[1] = t; 299 nilfs->ns_sbwtime[1] = t;
303 } 300 }
304 sbi->s_super->s_dirt = 0; 301 clear_nilfs_sb_dirty(nilfs);
305 return nilfs_sync_super(sbi, dupsb); 302 return nilfs_sync_super(sbi, dupsb);
306} 303}
307 304
@@ -345,7 +342,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
345 err = nilfs_construct_segment(sb); 342 err = nilfs_construct_segment(sb);
346 343
347 down_write(&nilfs->ns_sem); 344 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt) 345 if (nilfs_sb_dirty(nilfs))
349 nilfs_commit_super(sbi, 1); 346 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem); 347 up_write(&nilfs->ns_sem);
351 348
@@ -481,6 +478,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
481 seq_printf(seq, ",order=strict"); 478 seq_printf(seq, ",order=strict");
482 if (nilfs_test_opt(sbi, NORECOVERY)) 479 if (nilfs_test_opt(sbi, NORECOVERY))
483 seq_printf(seq, ",norecovery"); 480 seq_printf(seq, ",norecovery");
481 if (nilfs_test_opt(sbi, DISCARD))
482 seq_printf(seq, ",discard");
484 483
485 return 0; 484 return 0;
486} 485}
@@ -550,7 +549,7 @@ static const struct export_operations nilfs_export_ops = {
550enum { 549enum {
551 Opt_err_cont, Opt_err_panic, Opt_err_ro, 550 Opt_err_cont, Opt_err_panic, Opt_err_ro,
552 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 551 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
553 Opt_err, 552 Opt_discard, Opt_err,
554}; 553};
555 554
556static match_table_t tokens = { 555static match_table_t tokens = {
@@ -561,6 +560,7 @@ static match_table_t tokens = {
561 {Opt_snapshot, "cp=%u"}, 560 {Opt_snapshot, "cp=%u"},
562 {Opt_order, "order=%s"}, 561 {Opt_order, "order=%s"},
563 {Opt_norecovery, "norecovery"}, 562 {Opt_norecovery, "norecovery"},
563 {Opt_discard, "discard"},
564 {Opt_err, NULL} 564 {Opt_err, NULL}
565}; 565};
566 566
@@ -614,6 +614,9 @@ static int parse_options(char *options, struct super_block *sb)
614 case Opt_norecovery: 614 case Opt_norecovery:
615 nilfs_set_opt(sbi, NORECOVERY); 615 nilfs_set_opt(sbi, NORECOVERY);
616 break; 616 break;
617 case Opt_discard:
618 nilfs_set_opt(sbi, DISCARD);
619 break;
617 default: 620 default:
618 printk(KERN_ERR 621 printk(KERN_ERR
619 "NILFS: Unrecognized mount option \"%s\"\n", p); 622 "NILFS: Unrecognized mount option \"%s\"\n", p);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 6241e1722efc..92733d5651d2 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -646,6 +646,44 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
646 goto out; 646 goto out;
647} 647}
648 648
649int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
650 size_t nsegs)
651{
652 sector_t seg_start, seg_end;
653 sector_t start = 0, nblocks = 0;
654 unsigned int sects_per_block;
655 __u64 *sn;
656 int ret = 0;
657
658 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
659 bdev_logical_block_size(nilfs->ns_bdev);
660 for (sn = segnump; sn < segnump + nsegs; sn++) {
661 nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end);
662
663 if (!nblocks) {
664 start = seg_start;
665 nblocks = seg_end - seg_start + 1;
666 } else if (start + nblocks == seg_start) {
667 nblocks += seg_end - seg_start + 1;
668 } else {
669 ret = blkdev_issue_discard(nilfs->ns_bdev,
670 start * sects_per_block,
671 nblocks * sects_per_block,
672 GFP_NOFS,
673 DISCARD_FL_BARRIER);
674 if (ret < 0)
675 return ret;
676 nblocks = 0;
677 }
678 }
679 if (nblocks)
680 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block,
682 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER);
684 return ret;
685}
686
649int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 687int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
650{ 688{
651 struct inode *dat = nilfs_dat_inode(nilfs); 689 struct inode *dat = nilfs_dat_inode(nilfs);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 589786e33464..e9795f1724d7 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -38,6 +38,7 @@ enum {
38 the latest checkpoint was loaded */ 38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */ 40 THE_NILFS_GC_RUNNING, /* gc process is running */
41 THE_NILFS_SB_DIRTY, /* super block is dirty */
41}; 42};
42 43
43/** 44/**
@@ -197,6 +198,7 @@ THE_NILFS_FNS(INIT, init)
197THE_NILFS_FNS(LOADED, loaded) 198THE_NILFS_FNS(LOADED, loaded)
198THE_NILFS_FNS(DISCONTINUED, discontinued) 199THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running) 200THE_NILFS_FNS(GC_RUNNING, gc_running)
201THE_NILFS_FNS(SB_DIRTY, sb_dirty)
200 202
201/* Minimum interval of periodical update of superblocks (in seconds) */ 203/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 204#define NILFS_SB_FREQ 10
@@ -221,6 +223,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *);
221void put_nilfs(struct the_nilfs *); 223void put_nilfs(struct the_nilfs *);
222int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 224int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
223int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 225int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
226int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
224int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 227int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
225struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 228struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
226int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); 229int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a94e8bd8eb1f..472cdf29ef82 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -29,14 +29,12 @@
29#include <linux/init.h> /* module_init */ 29#include <linux/init.h> /* module_init */
30#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/kernel.h> /* roundup() */ 31#include <linux/kernel.h> /* roundup() */
32#include <linux/magic.h> /* superblock magic number */
33#include <linux/mount.h> /* mntget */
34#include <linux/namei.h> /* LOOKUP_FOLLOW */ 32#include <linux/namei.h> /* LOOKUP_FOLLOW */
35#include <linux/path.h> /* struct path */
36#include <linux/sched.h> /* struct user */ 33#include <linux/sched.h> /* struct user */
37#include <linux/slab.h> /* struct kmem_cache */ 34#include <linux/slab.h> /* struct kmem_cache */
38#include <linux/syscalls.h> 35#include <linux/syscalls.h>
39#include <linux/types.h> 36#include <linux/types.h>
37#include <linux/anon_inodes.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/poll.h> 39#include <linux/poll.h>
42#include <linux/wait.h> 40#include <linux/wait.h>
@@ -45,8 +43,6 @@
45 43
46#include <asm/ioctls.h> 44#include <asm/ioctls.h>
47 45
48static struct vfsmount *inotify_mnt __read_mostly;
49
50/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
51static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
52static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
@@ -645,9 +641,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
645{ 641{
646 struct fsnotify_group *group; 642 struct fsnotify_group *group;
647 struct user_struct *user; 643 struct user_struct *user;
648 struct file *filp; 644 int ret;
649 struct path path;
650 int fd, ret;
651 645
652 /* Check the IN_* constants for consistency. */ 646 /* Check the IN_* constants for consistency. */
653 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); 647 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
@@ -656,10 +650,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
656 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) 650 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
657 return -EINVAL; 651 return -EINVAL;
658 652
659 fd = get_unused_fd_flags(flags & O_CLOEXEC);
660 if (fd < 0)
661 return fd;
662
663 user = get_current_user(); 653 user = get_current_user();
664 if (unlikely(atomic_read(&user->inotify_devs) >= 654 if (unlikely(atomic_read(&user->inotify_devs) >=
665 inotify_max_user_instances)) { 655 inotify_max_user_instances)) {
@@ -676,27 +666,14 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
676 666
677 atomic_inc(&user->inotify_devs); 667 atomic_inc(&user->inotify_devs);
678 668
679 path.mnt = inotify_mnt; 669 ret = anon_inode_getfd("inotify", &inotify_fops, group,
680 path.dentry = inotify_mnt->mnt_root; 670 O_RDONLY | flags);
681 path_get(&path); 671 if (ret >= 0)
682 filp = alloc_file(&path, FMODE_READ, &inotify_fops); 672 return ret;
683 if (!filp)
684 goto Enfile;
685 673
686 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
687 filp->private_data = group;
688
689 fd_install(fd, filp);
690
691 return fd;
692
693Enfile:
694 ret = -ENFILE;
695 path_put(&path);
696 atomic_dec(&user->inotify_devs); 674 atomic_dec(&user->inotify_devs);
697out_free_uid: 675out_free_uid:
698 free_uid(user); 676 free_uid(user);
699 put_unused_fd(fd);
700 return ret; 677 return ret;
701} 678}
702 679
@@ -783,20 +760,6 @@ out:
783 return ret; 760 return ret;
784} 761}
785 762
786static int
787inotify_get_sb(struct file_system_type *fs_type, int flags,
788 const char *dev_name, void *data, struct vfsmount *mnt)
789{
790 return get_sb_pseudo(fs_type, "inotify", NULL,
791 INOTIFYFS_SUPER_MAGIC, mnt);
792}
793
794static struct file_system_type inotify_fs_type = {
795 .name = "inotifyfs",
796 .get_sb = inotify_get_sb,
797 .kill_sb = kill_anon_super,
798};
799
800/* 763/*
801 * inotify_user_setup - Our initialization function. Note that we cannnot return 764 * inotify_user_setup - Our initialization function. Note that we cannnot return
802 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 765 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
@@ -804,16 +767,6 @@ static struct file_system_type inotify_fs_type = {
804 */ 767 */
805static int __init inotify_user_setup(void) 768static int __init inotify_user_setup(void)
806{ 769{
807 int ret;
808
809 ret = register_filesystem(&inotify_fs_type);
810 if (unlikely(ret))
811 panic("inotify: register_filesystem returned %d!\n", ret);
812
813 inotify_mnt = kern_mount(&inotify_fs_type);
814 if (IS_ERR(inotify_mnt))
815 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
816
817 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
818 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
819 772
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 5a9e34475e37..9173e82a45d1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1545,7 +1545,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
1545 write_inode_now(bmp_vi, !datasync); 1545 write_inode_now(bmp_vi, !datasync);
1546 iput(bmp_vi); 1546 iput(bmp_vi);
1547 } 1547 }
1548 ret = ntfs_write_inode(vi, 1); 1548 ret = __ntfs_write_inode(vi, 1);
1549 write_inode_now(vi, !datasync); 1549 write_inode_now(vi, !datasync);
1550 err = sync_blockdev(vi->i_sb->s_bdev); 1550 err = sync_blockdev(vi->i_sb->s_bdev);
1551 if (unlikely(err && !ret)) 1551 if (unlikely(err && !ret))
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 43179ddd336f..b681c71d7069 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2182,7 +2182,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
2182 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2182 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
2183 BUG_ON(S_ISDIR(vi->i_mode)); 2183 BUG_ON(S_ISDIR(vi->i_mode));
2184 if (!datasync || !NInoNonResident(NTFS_I(vi))) 2184 if (!datasync || !NInoNonResident(NTFS_I(vi)))
2185 ret = ntfs_write_inode(vi, 1); 2185 ret = __ntfs_write_inode(vi, 1);
2186 write_inode_now(vi, !datasync); 2186 write_inode_now(vi, !datasync);
2187 /* 2187 /*
2188 * NOTE: If we were to use mapping->private_list (see ext2 and 2188 * NOTE: If we were to use mapping->private_list (see ext2 and
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index dc2505abb6d7..4b57fb1eac2a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2957,7 +2957,7 @@ out:
2957 * 2957 *
2958 * Return 0 on success and -errno on error. 2958 * Return 0 on success and -errno on error.
2959 */ 2959 */
2960int ntfs_write_inode(struct inode *vi, int sync) 2960int __ntfs_write_inode(struct inode *vi, int sync)
2961{ 2961{
2962 sle64 nt; 2962 sle64 nt;
2963 ntfs_inode *ni = NTFS_I(vi); 2963 ntfs_inode *ni = NTFS_I(vi);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 117eaf8032a3..9a113544605d 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi);
307 307
308extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); 308extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
309 309
310extern int ntfs_write_inode(struct inode *vi, int sync); 310extern int __ntfs_write_inode(struct inode *vi, int sync);
311 311
312static inline void ntfs_commit_inode(struct inode *vi) 312static inline void ntfs_commit_inode(struct inode *vi)
313{ 313{
314 if (!is_bad_inode(vi)) 314 if (!is_bad_inode(vi))
315 ntfs_write_inode(vi, 1); 315 __ntfs_write_inode(vi, 1);
316 return; 316 return;
317} 317}
318 318
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 80b04770e8e9..1cf39dfaee7a 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -39,6 +39,7 @@
39#include "dir.h" 39#include "dir.h"
40#include "debug.h" 40#include "debug.h"
41#include "index.h" 41#include "index.h"
42#include "inode.h"
42#include "aops.h" 43#include "aops.h"
43#include "layout.h" 44#include "layout.h"
44#include "malloc.h" 45#include "malloc.h"
@@ -2662,6 +2663,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
2662 return 0; 2663 return 0;
2663} 2664}
2664 2665
2666#ifdef NTFS_RW
2667static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
2668{
2669 return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
2670}
2671#endif
2672
2665/** 2673/**
2666 * The complete super operations. 2674 * The complete super operations.
2667 */ 2675 */
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 600d2d2ade11..791c0886c060 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -46,6 +46,7 @@ ocfs2_stackglue-objs := stackglue.o
46ocfs2_stack_o2cb-objs := stack_o2cb.o 46ocfs2_stack_o2cb-objs := stack_o2cb.o
47ocfs2_stack_user-objs := stack_user.o 47ocfs2_stack_user-objs := stack_user.o
48 48
49obj-$(CONFIG_OCFS2_FS) += dlmfs/
49# cluster/ is always needed when OCFS2_FS for masklog support 50# cluster/ is always needed when OCFS2_FS for masklog support
50obj-$(CONFIG_OCFS2_FS) += cluster/ 51obj-$(CONFIG_OCFS2_FS) += cluster/
51obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ 52obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d17bdc718f74..2bbe1ecc08c0 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1050,7 +1050,8 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1051 eb->h_blkno = cpu_to_le64(first_blkno); 1051 eb->h_blkno = cpu_to_le64(first_blkno);
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1054 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1055 eb->h_list.l_count = 1056 eb->h_list.l_count =
1056 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -6037,7 +6038,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6037 if (status < 0) 6038 if (status < 0)
6038 mlog_errno(status); 6039 mlog_errno(status);
6039 else 6040 else
6040 ocfs2_init_inode_steal_slot(osb); 6041 ocfs2_init_steal_slots(osb);
6041 6042
6042 mlog_exit(status); 6043 mlog_exit(status);
6043} 6044}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7e9df11260f4..4c2a6d282c4d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -577,8 +577,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
577 goto bail; 577 goto bail;
578 } 578 }
579 579
580 /* We should already CoW the refcounted extent. */ 580 /* We should already CoW the refcounted extent in case of create. */
581 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); 581 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
582
582 /* 583 /*
583 * get_more_blocks() expects us to describe a hole by clearing 584 * get_more_blocks() expects us to describe a hole by clearing
584 * the mapped bit on bh_result(). 585 * the mapped bit on bh_result().
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1cd2934de615..b39da877b12f 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -112,6 +112,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS),
115 define_mask(ERROR), 116 define_mask(ERROR),
116 define_mask(NOTICE), 117 define_mask(NOTICE),
117 define_mask(KTHREAD), 118 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 9b4d11726cf2..3dfddbec32f2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,6 +114,7 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */
117/* bits that are infrequently given and frequently matched in the high word */ 118/* bits that are infrequently given and frequently matched in the high word */
118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
119#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
@@ -194,9 +195,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
194 * previous token if args expands to nothing. 195 * previous token if args expands to nothing.
195 */ 196 */
196#define __mlog_printk(level, fmt, args...) \ 197#define __mlog_printk(level, fmt, args...) \
197 printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \ 198 printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
198 __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ 199 task_pid_nr(current), __mlog_cpu_guess, \
199 ##args) 200 __PRETTY_FUNCTION__, __LINE__ , ##args)
200 201
201#define mlog(mask, fmt, args...) do { \ 202#define mlog(mask, fmt, args...) do { \
202 u64 __m = MLOG_MASK_PREFIX | (mask); \ 203 u64 __m = MLOG_MASK_PREFIX | (mask); \
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 28c3ec238796..765d66c70989 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2439,7 +2439,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2440 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); 2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 190361375700..dcebf0d920fa 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7 7
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 344bcf90cbf4..b4f99de2caf3 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data)
310 mlog(0, "dlm thread running for %s...\n", dlm->name); 310 mlog(0, "dlm thread running for %s...\n", dlm->name);
311 311
312 while (!kthread_should_stop()) { 312 while (!kthread_should_stop()) {
313 if (dlm_joined(dlm)) { 313 if (dlm_domain_fully_joined(dlm)) {
314 status = dlm_do_recovery(dlm); 314 status = dlm_do_recovery(dlm);
315 if (status == -EAGAIN) { 315 if (status == -EAGAIN) {
316 /* do not sleep, recheck immediately. */ 316 /* do not sleep, recheck immediately. */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000000..df69b4856d0d
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4
5ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 02bf17808bdc..1b0de157a08c 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -43,24 +43,17 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/backing-dev.h> 45#include <linux/backing-dev.h>
46#include <linux/poll.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49 50#include "stackglue.h"
50#include "cluster/nodemanager.h"
51#include "cluster/heartbeat.h"
52#include "cluster/tcp.h"
53
54#include "dlmapi.h"
55
56#include "userdlm.h" 51#include "userdlm.h"
57
58#include "dlmfsver.h" 52#include "dlmfsver.h"
59 53
60#define MLOG_MASK_PREFIX ML_DLMFS 54#define MLOG_MASK_PREFIX ML_DLMFS
61#include "cluster/masklog.h" 55#include "cluster/masklog.h"
62 56
63#include "ocfs2_lockingver.h"
64 57
65static const struct super_operations dlmfs_ops; 58static const struct super_operations dlmfs_ops;
66static const struct file_operations dlmfs_file_operations; 59static const struct file_operations dlmfs_file_operations;
@@ -71,15 +64,46 @@ static struct kmem_cache *dlmfs_inode_cache;
71 64
72struct workqueue_struct *user_dlm_worker; 65struct workqueue_struct *user_dlm_worker;
73 66
67
68
74/* 69/*
75 * This is the userdlmfs locking protocol version. 70 * These are the ABI capabilities of dlmfs.
71 *
72 * Over time, dlmfs has added some features that were not part of the
73 * initial ABI. Unfortunately, some of these features are not detectable
74 * via standard usage. For example, Linux's default poll always returns
75 * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
76 * added poll support. Instead, we provide this list of new capabilities.
77 *
78 * Capabilities is a read-only attribute. We do it as a module parameter
79 * so we can discover it whether dlmfs is built in, loaded, or even not
80 * loaded.
76 * 81 *
77 * See fs/ocfs2/dlmglue.c for more details on locking versions. 82 * The ABI features are local to this machine's dlmfs mount. This is
83 * distinct from the locking protocol, which is concerned with inter-node
84 * interaction.
85 *
86 * Capabilities:
87 * - bast : POLLIN against the file descriptor of a held lock
88 * signifies a bast fired on the lock.
78 */ 89 */
79static const struct dlm_protocol_version user_locking_protocol = { 90#define DLMFS_CAPABILITIES "bast stackglue"
80 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, 91extern int param_set_dlmfs_capabilities(const char *val,
81 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, 92 struct kernel_param *kp)
82}; 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
95 return -EINVAL;
96}
97static int param_get_dlmfs_capabilities(char *buffer,
98 struct kernel_param *kp)
99{
100 return strlcpy(buffer, DLMFS_CAPABILITIES,
101 strlen(DLMFS_CAPABILITIES) + 1);
102}
103module_param_call(capabilities, param_set_dlmfs_capabilities,
104 param_get_dlmfs_capabilities, NULL, 0444);
105MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
106
83 107
84/* 108/*
85 * decodes a set of open flags into a valid lock level and a set of flags. 109 * decodes a set of open flags into a valid lock level and a set of flags.
@@ -179,13 +203,46 @@ static int dlmfs_file_release(struct inode *inode,
179 return 0; 203 return 0;
180} 204}
181 205
206/*
207 * We do ->setattr() just to override size changes. Our size is the size
208 * of the LVB and nothing else.
209 */
210static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
211{
212 int error;
213 struct inode *inode = dentry->d_inode;
214
215 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr);
217 if (!error)
218 error = inode_setattr(inode, attr);
219
220 return error;
221}
222
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
224{
225 int event = 0;
226 struct inode *inode = file->f_path.dentry->d_inode;
227 struct dlmfs_inode_private *ip = DLMFS_I(inode);
228
229 poll_wait(file, &ip->ip_lockres.l_event, wait);
230
231 spin_lock(&ip->ip_lockres.l_lock);
232 if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
233 event = POLLIN | POLLRDNORM;
234 spin_unlock(&ip->ip_lockres.l_lock);
235
236 return event;
237}
238
182static ssize_t dlmfs_file_read(struct file *filp, 239static ssize_t dlmfs_file_read(struct file *filp,
183 char __user *buf, 240 char __user *buf,
184 size_t count, 241 size_t count,
185 loff_t *ppos) 242 loff_t *ppos)
186{ 243{
187 int bytes_left; 244 int bytes_left;
188 ssize_t readlen; 245 ssize_t readlen, got;
189 char *lvb_buf; 246 char *lvb_buf;
190 struct inode *inode = filp->f_path.dentry->d_inode; 247 struct inode *inode = filp->f_path.dentry->d_inode;
191 248
@@ -211,9 +268,13 @@ static ssize_t dlmfs_file_read(struct file *filp,
211 if (!lvb_buf) 268 if (!lvb_buf)
212 return -ENOMEM; 269 return -ENOMEM;
213 270
214 user_dlm_read_lvb(inode, lvb_buf, readlen); 271 got = user_dlm_read_lvb(inode, lvb_buf, readlen);
215 bytes_left = __copy_to_user(buf, lvb_buf, readlen); 272 if (got) {
216 readlen -= bytes_left; 273 BUG_ON(got != readlen);
274 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
275 readlen -= bytes_left;
276 } else
277 readlen = 0;
217 278
218 kfree(lvb_buf); 279 kfree(lvb_buf);
219 280
@@ -272,7 +333,7 @@ static void dlmfs_init_once(void *foo)
272 struct dlmfs_inode_private *ip = 333 struct dlmfs_inode_private *ip =
273 (struct dlmfs_inode_private *) foo; 334 (struct dlmfs_inode_private *) foo;
274 335
275 ip->ip_dlm = NULL; 336 ip->ip_conn = NULL;
276 ip->ip_parent = NULL; 337 ip->ip_parent = NULL;
277 338
278 inode_init_once(&ip->ip_vfs_inode); 339 inode_init_once(&ip->ip_vfs_inode);
@@ -314,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode)
314 goto clear_fields; 375 goto clear_fields;
315 } 376 }
316 377
317 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); 378 mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
318 /* we must be a directory. If required, lets unregister the 379 /* we must be a directory. If required, lets unregister the
319 * dlm context now. */ 380 * dlm context now. */
320 if (ip->ip_dlm) 381 if (ip->ip_conn)
321 user_dlm_unregister_context(ip->ip_dlm); 382 user_dlm_unregister(ip->ip_conn);
322clear_fields: 383clear_fields:
323 ip->ip_parent = NULL; 384 ip->ip_parent = NULL;
324 ip->ip_dlm = NULL; 385 ip->ip_conn = NULL;
325} 386}
326 387
327static struct backing_dev_info dlmfs_backing_dev_info = { 388static struct backing_dev_info dlmfs_backing_dev_info = {
@@ -371,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
371 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
372 433
373 ip = DLMFS_I(inode); 434 ip = DLMFS_I(inode);
374 ip->ip_dlm = DLMFS_I(parent)->ip_dlm; 435 ip->ip_conn = DLMFS_I(parent)->ip_conn;
375 436
376 switch (mode & S_IFMT) { 437 switch (mode & S_IFMT) {
377 default: 438 default:
@@ -425,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir,
425 struct inode *inode = NULL; 486 struct inode *inode = NULL;
426 struct qstr *domain = &dentry->d_name; 487 struct qstr *domain = &dentry->d_name;
427 struct dlmfs_inode_private *ip; 488 struct dlmfs_inode_private *ip;
428 struct dlm_ctxt *dlm; 489 struct ocfs2_cluster_connection *conn;
429 struct dlm_protocol_version proto = user_locking_protocol;
430 490
431 mlog(0, "mkdir %.*s\n", domain->len, domain->name); 491 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
432 492
433 /* verify that we have a proper domain */ 493 /* verify that we have a proper domain */
434 if (domain->len >= O2NM_MAX_NAME_LEN) { 494 if (domain->len >= GROUP_NAME_MAX) {
435 status = -EINVAL; 495 status = -EINVAL;
436 mlog(ML_ERROR, "invalid domain name for directory.\n"); 496 mlog(ML_ERROR, "invalid domain name for directory.\n");
437 goto bail; 497 goto bail;
@@ -446,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir,
446 506
447 ip = DLMFS_I(inode); 507 ip = DLMFS_I(inode);
448 508
449 dlm = user_dlm_register_context(domain, &proto); 509 conn = user_dlm_register(domain);
450 if (IS_ERR(dlm)) { 510 if (IS_ERR(conn)) {
451 status = PTR_ERR(dlm); 511 status = PTR_ERR(conn);
452 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", 512 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
453 status, domain->len, domain->name); 513 status, domain->len, domain->name);
454 goto bail; 514 goto bail;
455 } 515 }
456 ip->ip_dlm = dlm; 516 ip->ip_conn = conn;
457 517
458 inc_nlink(dir); 518 inc_nlink(dir);
459 d_instantiate(dentry, inode); 519 d_instantiate(dentry, inode);
@@ -549,6 +609,7 @@ static int dlmfs_fill_super(struct super_block * sb,
549static const struct file_operations dlmfs_file_operations = { 609static const struct file_operations dlmfs_file_operations = {
550 .open = dlmfs_file_open, 610 .open = dlmfs_file_open,
551 .release = dlmfs_file_release, 611 .release = dlmfs_file_release,
612 .poll = dlmfs_file_poll,
552 .read = dlmfs_file_read, 613 .read = dlmfs_file_read,
553 .write = dlmfs_file_write, 614 .write = dlmfs_file_write,
554}; 615};
@@ -576,6 +637,7 @@ static const struct super_operations dlmfs_ops = {
576 637
577static const struct inode_operations dlmfs_file_inode_operations = { 638static const struct inode_operations dlmfs_file_inode_operations = {
578 .getattr = simple_getattr, 639 .getattr = simple_getattr,
640 .setattr = dlmfs_file_setattr,
579}; 641};
580 642
581static int dlmfs_get_sb(struct file_system_type *fs_type, 643static int dlmfs_get_sb(struct file_system_type *fs_type,
@@ -620,6 +682,7 @@ static int __init init_dlmfs_fs(void)
620 } 682 }
621 cleanup_worker = 1; 683 cleanup_worker = 1;
622 684
685 user_dlm_set_locking_protocol();
623 status = register_filesystem(&dlmfs_fs_type); 686 status = register_filesystem(&dlmfs_fs_type);
624bail: 687bail:
625 if (status) { 688 if (status) {
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
index a733b3321f83..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlmfs/dlmfsver.c
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
index f35eadbed25c..f35eadbed25c 100644
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ b/fs/ocfs2/dlmfs/dlmfsver.h
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 4cb1d3dae250..0499e3fb7bdb 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
34#include <linux/types.h> 34#include <linux/types.h>
35#include <linux/crc32.h> 35#include <linux/crc32.h>
36 36
37 37#include "ocfs2_lockingver.h"
38#include "cluster/nodemanager.h" 38#include "stackglue.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h" 39#include "userdlm.h"
45 40
46#define MLOG_MASK_PREFIX ML_DLMFS 41#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h" 42#include "cluster/masklog.h"
48 43
44
45static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
46{
47 return container_of(lksb, struct user_lock_res, l_lksb);
48}
49
49static inline int user_check_wait_flag(struct user_lock_res *lockres, 50static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag) 51 int flag)
51{ 52{
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
73} 74}
74 75
75/* I heart container_of... */ 76/* I heart container_of... */
76static inline struct dlm_ctxt * 77static inline struct ocfs2_cluster_connection *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) 78cluster_connection_from_user_lockres(struct user_lock_res *lockres)
78{ 79{
79 struct dlmfs_inode_private *ip; 80 struct dlmfs_inode_private *ip;
80 81
81 ip = container_of(lockres, 82 ip = container_of(lockres,
82 struct dlmfs_inode_private, 83 struct dlmfs_inode_private,
83 ip_lockres); 84 ip_lockres);
84 return ip->ip_dlm; 85 return ip->ip_conn;
85} 86}
86 87
87static struct inode * 88static struct inode *
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
103} 104}
104 105
105#define user_log_dlm_error(_func, _stat, _lockres) do { \ 106#define user_log_dlm_error(_func, _stat, _lockres) do { \
106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 107 mlog(ML_ERROR, "Dlm error %d while calling %s on " \
107 "resource %.*s: %s\n", dlm_errname(_stat), _func, \ 108 "resource %.*s\n", _stat, _func, \
108 _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ 109 _lockres->l_namelen, _lockres->l_name); \
109} while (0) 110} while (0)
110 111
111/* WARNING: This function lives in a world where the only three lock 112/* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
113 * lock types are added. */ 114 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level) 115static inline int user_highest_compat_lock_level(int level)
115{ 116{
116 int new_level = LKM_EXMODE; 117 int new_level = DLM_LOCK_EX;
117 118
118 if (level == LKM_EXMODE) 119 if (level == DLM_LOCK_EX)
119 new_level = LKM_NLMODE; 120 new_level = DLM_LOCK_NL;
120 else if (level == LKM_PRMODE) 121 else if (level == DLM_LOCK_PR)
121 new_level = LKM_PRMODE; 122 new_level = DLM_LOCK_PR;
122 return new_level; 123 return new_level;
123} 124}
124 125
125static void user_ast(void *opaque) 126static void user_ast(struct ocfs2_dlm_lksb *lksb)
126{ 127{
127 struct user_lock_res *lockres = opaque; 128 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
128 struct dlm_lockstatus *lksb; 129 int status;
129 130
130 mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, 131 mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
131 lockres->l_name); 132 lockres->l_namelen, lockres->l_name, lockres->l_level,
133 lockres->l_requested);
132 134
133 spin_lock(&lockres->l_lock); 135 spin_lock(&lockres->l_lock);
134 136
135 lksb = &(lockres->l_lksb); 137 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
136 if (lksb->status != DLM_NORMAL) { 138 if (status) {
137 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", 139 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
138 lksb->status, lockres->l_namelen, lockres->l_name); 140 status, lockres->l_namelen, lockres->l_name);
139 spin_unlock(&lockres->l_lock); 141 spin_unlock(&lockres->l_lock);
140 return; 142 return;
141 } 143 }
142 144
143 mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, 145 mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
144 "Lockres %.*s, requested ivmode. flags 0x%x\n", 146 "Lockres %.*s, requested ivmode. flags 0x%x\n",
145 lockres->l_namelen, lockres->l_name, lockres->l_flags); 147 lockres->l_namelen, lockres->l_name, lockres->l_flags);
146 148
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
148 if (lockres->l_requested < lockres->l_level) { 150 if (lockres->l_requested < lockres->l_level) {
149 if (lockres->l_requested <= 151 if (lockres->l_requested <=
150 user_highest_compat_lock_level(lockres->l_blocking)) { 152 user_highest_compat_lock_level(lockres->l_blocking)) {
151 lockres->l_blocking = LKM_NLMODE; 153 lockres->l_blocking = DLM_LOCK_NL;
152 lockres->l_flags &= ~USER_LOCK_BLOCKED; 154 lockres->l_flags &= ~USER_LOCK_BLOCKED;
153 } 155 }
154 } 156 }
155 157
156 lockres->l_level = lockres->l_requested; 158 lockres->l_level = lockres->l_requested;
157 lockres->l_requested = LKM_IVMODE; 159 lockres->l_requested = DLM_LOCK_IV;
158 lockres->l_flags |= USER_LOCK_ATTACHED; 160 lockres->l_flags |= USER_LOCK_ATTACHED;
159 lockres->l_flags &= ~USER_LOCK_BUSY; 161 lockres->l_flags &= ~USER_LOCK_BUSY;
160 162
@@ -193,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
193 return; 195 return;
194 196
195 switch (lockres->l_blocking) { 197 switch (lockres->l_blocking) {
196 case LKM_EXMODE: 198 case DLM_LOCK_EX:
197 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 199 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
198 queue = 1; 200 queue = 1;
199 break; 201 break;
200 case LKM_PRMODE: 202 case DLM_LOCK_PR:
201 if (!lockres->l_ex_holders) 203 if (!lockres->l_ex_holders)
202 queue = 1; 204 queue = 1;
203 break; 205 break;
@@ -209,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
209 __user_dlm_queue_lockres(lockres); 211 __user_dlm_queue_lockres(lockres);
210} 212}
211 213
212static void user_bast(void *opaque, int level) 214static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
213{ 215{
214 struct user_lock_res *lockres = opaque; 216 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
215 217
216 mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", 218 mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
217 lockres->l_namelen, lockres->l_name, level); 219 lockres->l_namelen, lockres->l_name, level, lockres->l_level);
218 220
219 spin_lock(&lockres->l_lock); 221 spin_lock(&lockres->l_lock);
220 lockres->l_flags |= USER_LOCK_BLOCKED; 222 lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -227,15 +229,15 @@ static void user_bast(void *opaque, int level)
227 wake_up(&lockres->l_event); 229 wake_up(&lockres->l_event);
228} 230}
229 231
230static void user_unlock_ast(void *opaque, enum dlm_status status) 232static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
231{ 233{
232 struct user_lock_res *lockres = opaque; 234 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
233 235
234 mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, 236 mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
235 lockres->l_name); 237 lockres->l_namelen, lockres->l_name, lockres->l_flags);
236 238
237 if (status != DLM_NORMAL && status != DLM_CANCELGRANT) 239 if (status)
238 mlog(ML_ERROR, "Dlm returns status %d\n", status); 240 mlog(ML_ERROR, "dlm returns status %d\n", status);
239 241
240 spin_lock(&lockres->l_lock); 242 spin_lock(&lockres->l_lock);
241 /* The teardown flag gets set early during the unlock process, 243 /* The teardown flag gets set early during the unlock process,
@@ -243,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
243 * for a concurrent cancel. */ 245 * for a concurrent cancel. */
244 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN 246 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
245 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { 247 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
246 lockres->l_level = LKM_IVMODE; 248 lockres->l_level = DLM_LOCK_IV;
247 } else if (status == DLM_CANCELGRANT) { 249 } else if (status == DLM_CANCELGRANT) {
248 /* We tried to cancel a convert request, but it was 250 /* We tried to cancel a convert request, but it was
249 * already granted. Don't clear the busy flag - the 251 * already granted. Don't clear the busy flag - the
@@ -254,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
254 } else { 256 } else {
255 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); 257 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
256 /* Cancel succeeded, we want to re-queue */ 258 /* Cancel succeeded, we want to re-queue */
257 lockres->l_requested = LKM_IVMODE; /* cancel an 259 lockres->l_requested = DLM_LOCK_IV; /* cancel an
258 * upconvert 260 * upconvert
259 * request. */ 261 * request. */
260 lockres->l_flags &= ~USER_LOCK_IN_CANCEL; 262 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +273,21 @@ out_noclear:
271 wake_up(&lockres->l_event); 273 wake_up(&lockres->l_event);
272} 274}
273 275
276/*
277 * This is the userdlmfs locking protocol version.
278 *
279 * See fs/ocfs2/dlmglue.c for more details on locking versions.
280 */
281static struct ocfs2_locking_protocol user_dlm_lproto = {
282 .lp_max_version = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285 },
286 .lp_lock_ast = user_ast,
287 .lp_blocking_ast = user_bast,
288 .lp_unlock_ast = user_unlock_ast,
289};
290
274static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) 291static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
275{ 292{
276 struct inode *inode; 293 struct inode *inode;
@@ -283,10 +300,10 @@ static void user_dlm_unblock_lock(struct work_struct *work)
283 int new_level, status; 300 int new_level, status;
284 struct user_lock_res *lockres = 301 struct user_lock_res *lockres =
285 container_of(work, struct user_lock_res, l_work); 302 container_of(work, struct user_lock_res, l_work);
286 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 303 struct ocfs2_cluster_connection *conn =
304 cluster_connection_from_user_lockres(lockres);
287 305
288 mlog(0, "processing lockres %.*s\n", lockres->l_namelen, 306 mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
289 lockres->l_name);
290 307
291 spin_lock(&lockres->l_lock); 308 spin_lock(&lockres->l_lock);
292 309
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
304 * flag, and finally we might get another bast which re-queues 321 * flag, and finally we might get another bast which re-queues
305 * us before our ast for the downconvert is called. */ 322 * us before our ast for the downconvert is called. */
306 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { 323 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
324 mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
325 lockres->l_namelen, lockres->l_name);
307 spin_unlock(&lockres->l_lock); 326 spin_unlock(&lockres->l_lock);
308 goto drop_ref; 327 goto drop_ref;
309 } 328 }
310 329
311 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 330 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
331 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
332 lockres->l_namelen, lockres->l_name);
312 spin_unlock(&lockres->l_lock); 333 spin_unlock(&lockres->l_lock);
313 goto drop_ref; 334 goto drop_ref;
314 } 335 }
315 336
316 if (lockres->l_flags & USER_LOCK_BUSY) { 337 if (lockres->l_flags & USER_LOCK_BUSY) {
317 if (lockres->l_flags & USER_LOCK_IN_CANCEL) { 338 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
339 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
340 lockres->l_namelen, lockres->l_name);
318 spin_unlock(&lockres->l_lock); 341 spin_unlock(&lockres->l_lock);
319 goto drop_ref; 342 goto drop_ref;
320 } 343 }
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(struct work_struct *work)
322 lockres->l_flags |= USER_LOCK_IN_CANCEL; 345 lockres->l_flags |= USER_LOCK_IN_CANCEL;
323 spin_unlock(&lockres->l_lock); 346 spin_unlock(&lockres->l_lock);
324 347
325 status = dlmunlock(dlm, 348 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
326 &lockres->l_lksb, 349 DLM_LKF_CANCEL);
327 LKM_CANCEL, 350 if (status)
328 user_unlock_ast, 351 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
329 lockres);
330 if (status != DLM_NORMAL)
331 user_log_dlm_error("dlmunlock", status, lockres);
332 goto drop_ref; 352 goto drop_ref;
333 } 353 }
334 354
335 /* If there are still incompat holders, we can exit safely 355 /* If there are still incompat holders, we can exit safely
336 * without worrying about re-queueing this lock as that will 356 * without worrying about re-queueing this lock as that will
337 * happen on the last call to user_cluster_unlock. */ 357 * happen on the last call to user_cluster_unlock. */
338 if ((lockres->l_blocking == LKM_EXMODE) 358 if ((lockres->l_blocking == DLM_LOCK_EX)
339 && (lockres->l_ex_holders || lockres->l_ro_holders)) { 359 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
340 spin_unlock(&lockres->l_lock); 360 spin_unlock(&lockres->l_lock);
341 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", 361 mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
342 lockres->l_ro_holders, lockres->l_ex_holders); 362 lockres->l_namelen, lockres->l_name,
363 lockres->l_ex_holders, lockres->l_ro_holders);
343 goto drop_ref; 364 goto drop_ref;
344 } 365 }
345 366
346 if ((lockres->l_blocking == LKM_PRMODE) 367 if ((lockres->l_blocking == DLM_LOCK_PR)
347 && lockres->l_ex_holders) { 368 && lockres->l_ex_holders) {
348 spin_unlock(&lockres->l_lock); 369 spin_unlock(&lockres->l_lock);
349 mlog(0, "can't downconvert for pr: ex = %u\n", 370 mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
350 lockres->l_ex_holders); 371 lockres->l_namelen, lockres->l_name,
372 lockres->l_ex_holders);
351 goto drop_ref; 373 goto drop_ref;
352 } 374 }
353 375
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
355 new_level = user_highest_compat_lock_level(lockres->l_blocking); 377 new_level = user_highest_compat_lock_level(lockres->l_blocking);
356 lockres->l_requested = new_level; 378 lockres->l_requested = new_level;
357 lockres->l_flags |= USER_LOCK_BUSY; 379 lockres->l_flags |= USER_LOCK_BUSY;
358 mlog(0, "Downconvert lock from %d to %d\n", 380 mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
359 lockres->l_level, new_level); 381 lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
360 spin_unlock(&lockres->l_lock); 382 spin_unlock(&lockres->l_lock);
361 383
362 /* need lock downconvert request now... */ 384 /* need lock downconvert request now... */
363 status = dlmlock(dlm, 385 status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
364 new_level, 386 DLM_LKF_CONVERT|DLM_LKF_VALBLK,
365 &lockres->l_lksb, 387 lockres->l_name,
366 LKM_CONVERT|LKM_VALBLK, 388 lockres->l_namelen);
367 lockres->l_name, 389 if (status) {
368 lockres->l_namelen, 390 user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
369 user_ast,
370 lockres,
371 user_bast);
372 if (status != DLM_NORMAL) {
373 user_log_dlm_error("dlmlock", status, lockres);
374 user_recover_from_dlm_error(lockres); 391 user_recover_from_dlm_error(lockres);
375 } 392 }
376 393
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
382 int level) 399 int level)
383{ 400{
384 switch(level) { 401 switch(level) {
385 case LKM_EXMODE: 402 case DLM_LOCK_EX:
386 lockres->l_ex_holders++; 403 lockres->l_ex_holders++;
387 break; 404 break;
388 case LKM_PRMODE: 405 case DLM_LOCK_PR:
389 lockres->l_ro_holders++; 406 lockres->l_ro_holders++;
390 break; 407 break;
391 default: 408 default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
410 int lkm_flags) 427 int lkm_flags)
411{ 428{
412 int status, local_flags; 429 int status, local_flags;
413 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 430 struct ocfs2_cluster_connection *conn =
431 cluster_connection_from_user_lockres(lockres);
414 432
415 if (level != LKM_EXMODE && 433 if (level != DLM_LOCK_EX &&
416 level != LKM_PRMODE) { 434 level != DLM_LOCK_PR) {
417 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 435 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
418 lockres->l_namelen, lockres->l_name); 436 lockres->l_namelen, lockres->l_name);
419 status = -EINVAL; 437 status = -EINVAL;
420 goto bail; 438 goto bail;
421 } 439 }
422 440
423 mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", 441 mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
424 lockres->l_namelen, lockres->l_name, 442 lockres->l_namelen, lockres->l_name, level, lkm_flags);
425 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
426 lkm_flags);
427 443
428again: 444again:
429 if (signal_pending(current)) { 445 if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
457 } 473 }
458 474
459 if (level > lockres->l_level) { 475 if (level > lockres->l_level) {
460 local_flags = lkm_flags | LKM_VALBLK; 476 local_flags = lkm_flags | DLM_LKF_VALBLK;
461 if (lockres->l_level != LKM_IVMODE) 477 if (lockres->l_level != DLM_LOCK_IV)
462 local_flags |= LKM_CONVERT; 478 local_flags |= DLM_LKF_CONVERT;
463 479
464 lockres->l_requested = level; 480 lockres->l_requested = level;
465 lockres->l_flags |= USER_LOCK_BUSY; 481 lockres->l_flags |= USER_LOCK_BUSY;
466 spin_unlock(&lockres->l_lock); 482 spin_unlock(&lockres->l_lock);
467 483
468 BUG_ON(level == LKM_IVMODE); 484 BUG_ON(level == DLM_LOCK_IV);
469 BUG_ON(level == LKM_NLMODE); 485 BUG_ON(level == DLM_LOCK_NL);
470 486
471 /* call dlm_lock to upgrade lock now */ 487 /* call dlm_lock to upgrade lock now */
472 status = dlmlock(dlm, 488 status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
473 level, 489 local_flags, lockres->l_name,
474 &lockres->l_lksb, 490 lockres->l_namelen);
475 local_flags, 491 if (status) {
476 lockres->l_name, 492 if ((lkm_flags & DLM_LKF_NOQUEUE) &&
477 lockres->l_namelen, 493 (status != -EAGAIN))
478 user_ast, 494 user_log_dlm_error("ocfs2_dlm_lock",
479 lockres, 495 status, lockres);
480 user_bast);
481 if (status != DLM_NORMAL) {
482 if ((lkm_flags & LKM_NOQUEUE) &&
483 (status == DLM_NOTQUEUED))
484 status = -EAGAIN;
485 else {
486 user_log_dlm_error("dlmlock", status, lockres);
487 status = -EINVAL;
488 }
489 user_recover_from_dlm_error(lockres); 496 user_recover_from_dlm_error(lockres);
490 goto bail; 497 goto bail;
491 } 498 }
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
506 int level) 513 int level)
507{ 514{
508 switch(level) { 515 switch(level) {
509 case LKM_EXMODE: 516 case DLM_LOCK_EX:
510 BUG_ON(!lockres->l_ex_holders); 517 BUG_ON(!lockres->l_ex_holders);
511 lockres->l_ex_holders--; 518 lockres->l_ex_holders--;
512 break; 519 break;
513 case LKM_PRMODE: 520 case DLM_LOCK_PR:
514 BUG_ON(!lockres->l_ro_holders); 521 BUG_ON(!lockres->l_ro_holders);
515 lockres->l_ro_holders--; 522 lockres->l_ro_holders--;
516 break; 523 break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
522void user_dlm_cluster_unlock(struct user_lock_res *lockres, 529void user_dlm_cluster_unlock(struct user_lock_res *lockres,
523 int level) 530 int level)
524{ 531{
525 if (level != LKM_EXMODE && 532 if (level != DLM_LOCK_EX &&
526 level != LKM_PRMODE) { 533 level != DLM_LOCK_PR) {
527 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 534 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
528 lockres->l_namelen, lockres->l_name); 535 lockres->l_namelen, lockres->l_name);
529 return; 536 return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
540 unsigned int len) 547 unsigned int len)
541{ 548{
542 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 549 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
543 char *lvb = lockres->l_lksb.lvb; 550 char *lvb;
544 551
545 BUG_ON(len > DLM_LVB_LEN); 552 BUG_ON(len > DLM_LVB_LEN);
546 553
547 spin_lock(&lockres->l_lock); 554 spin_lock(&lockres->l_lock);
548 555
549 BUG_ON(lockres->l_level < LKM_EXMODE); 556 BUG_ON(lockres->l_level < DLM_LOCK_EX);
557 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
550 memcpy(lvb, val, len); 558 memcpy(lvb, val, len);
551 559
552 spin_unlock(&lockres->l_lock); 560 spin_unlock(&lockres->l_lock);
553} 561}
554 562
555void user_dlm_read_lvb(struct inode *inode, 563ssize_t user_dlm_read_lvb(struct inode *inode,
556 char *val, 564 char *val,
557 unsigned int len) 565 unsigned int len)
558{ 566{
559 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 567 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
560 char *lvb = lockres->l_lksb.lvb; 568 char *lvb;
569 ssize_t ret = len;
561 570
562 BUG_ON(len > DLM_LVB_LEN); 571 BUG_ON(len > DLM_LVB_LEN);
563 572
564 spin_lock(&lockres->l_lock); 573 spin_lock(&lockres->l_lock);
565 574
566 BUG_ON(lockres->l_level < LKM_PRMODE); 575 BUG_ON(lockres->l_level < DLM_LOCK_PR);
567 memcpy(val, lvb, len); 576 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
577 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
578 memcpy(val, lvb, len);
579 } else
580 ret = 0;
568 581
569 spin_unlock(&lockres->l_lock); 582 spin_unlock(&lockres->l_lock);
583 return ret;
570} 584}
571 585
572void user_dlm_lock_res_init(struct user_lock_res *lockres, 586void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
576 590
577 spin_lock_init(&lockres->l_lock); 591 spin_lock_init(&lockres->l_lock);
578 init_waitqueue_head(&lockres->l_event); 592 init_waitqueue_head(&lockres->l_event);
579 lockres->l_level = LKM_IVMODE; 593 lockres->l_level = DLM_LOCK_IV;
580 lockres->l_requested = LKM_IVMODE; 594 lockres->l_requested = DLM_LOCK_IV;
581 lockres->l_blocking = LKM_IVMODE; 595 lockres->l_blocking = DLM_LOCK_IV;
582 596
583 /* should have been checked before getting here. */ 597 /* should have been checked before getting here. */
584 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); 598 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
592int user_dlm_destroy_lock(struct user_lock_res *lockres) 606int user_dlm_destroy_lock(struct user_lock_res *lockres)
593{ 607{
594 int status = -EBUSY; 608 int status = -EBUSY;
595 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 609 struct ocfs2_cluster_connection *conn =
610 cluster_connection_from_user_lockres(lockres);
596 611
597 mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); 612 mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
598 613
599 spin_lock(&lockres->l_lock); 614 spin_lock(&lockres->l_lock);
600 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 615 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
627 lockres->l_flags |= USER_LOCK_BUSY; 642 lockres->l_flags |= USER_LOCK_BUSY;
628 spin_unlock(&lockres->l_lock); 643 spin_unlock(&lockres->l_lock);
629 644
630 status = dlmunlock(dlm, 645 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
631 &lockres->l_lksb, 646 if (status) {
632 LKM_VALBLK, 647 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
633 user_unlock_ast,
634 lockres);
635 if (status != DLM_NORMAL) {
636 user_log_dlm_error("dlmunlock", status, lockres);
637 status = -EINVAL;
638 goto bail; 648 goto bail;
639 } 649 }
640 650
@@ -645,32 +655,34 @@ bail:
645 return status; 655 return status;
646} 656}
647 657
648struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 658static void user_dlm_recovery_handler_noop(int node_num,
649 struct dlm_protocol_version *proto) 659 void *recovery_data)
650{ 660{
651 struct dlm_ctxt *dlm; 661 /* We ignore recovery events */
652 u32 dlm_key; 662 return;
653 char *domain; 663}
654
655 domain = kmalloc(name->len + 1, GFP_NOFS);
656 if (!domain) {
657 mlog_errno(-ENOMEM);
658 return ERR_PTR(-ENOMEM);
659 }
660 664
661 dlm_key = crc32_le(0, name->name, name->len); 665void user_dlm_set_locking_protocol(void)
666{
667 ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
668}
662 669
663 snprintf(domain, name->len + 1, "%.*s", name->len, name->name); 670struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
671{
672 int rc;
673 struct ocfs2_cluster_connection *conn;
664 674
665 dlm = dlm_register_domain(domain, dlm_key, proto); 675 rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
666 if (IS_ERR(dlm)) 676 &user_dlm_lproto,
667 mlog_errno(PTR_ERR(dlm)); 677 user_dlm_recovery_handler_noop,
678 NULL, &conn);
679 if (rc)
680 mlog_errno(rc);
668 681
669 kfree(domain); 682 return rc ? ERR_PTR(rc) : conn;
670 return dlm;
671} 683}
672 684
673void user_dlm_unregister_context(struct dlm_ctxt *dlm) 685void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
674{ 686{
675 dlm_unregister_domain(dlm); 687 ocfs2_cluster_disconnect(conn, 0);
676} 688}
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61fa..3b42d79531d7 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
57 int l_level; 57 int l_level;
58 unsigned int l_ro_holders; 58 unsigned int l_ro_holders;
59 unsigned int l_ex_holders; 59 unsigned int l_ex_holders;
60 struct dlm_lockstatus l_lksb; 60 struct ocfs2_dlm_lksb l_lksb;
61 61
62 int l_requested; 62 int l_requested;
63 int l_blocking; 63 int l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
80void user_dlm_write_lvb(struct inode *inode, 80void user_dlm_write_lvb(struct inode *inode,
81 const char *val, 81 const char *val,
82 unsigned int len); 82 unsigned int len);
83void user_dlm_read_lvb(struct inode *inode, 83ssize_t user_dlm_read_lvb(struct inode *inode,
84 char *val, 84 char *val,
85 unsigned int len); 85 unsigned int len);
86struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 86struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
87 struct dlm_protocol_version *proto); 87void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
88void user_dlm_unregister_context(struct dlm_ctxt *dlm); 88void user_dlm_set_locking_protocol(void);
89 89
90struct dlmfs_inode_private { 90struct dlmfs_inode_private {
91 struct dlm_ctxt *ip_dlm; 91 struct ocfs2_cluster_connection *ip_conn;
92 92
93 struct user_lock_res ip_lockres; /* unused for directories. */ 93 struct user_lock_res ip_lockres; /* unused for directories. */
94 struct inode *ip_parent; 94 struct inode *ip_parent;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e044019cb3b1..8298608d4165 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
298} 298}
299 299
300static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
301{
302 return container_of(lksb, struct ocfs2_lock_res, l_lksb);
303}
304
300static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 305static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
301{ 306{
302 BUG_ON(!ocfs2_is_inode_lock(lockres)); 307 BUG_ON(!ocfs2_is_inode_lock(lockres));
@@ -927,6 +932,10 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
927 lockres->l_blocking = level; 932 lockres->l_blocking = level;
928 } 933 }
929 934
935 mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
936 lockres->l_name, level, lockres->l_level, lockres->l_blocking,
937 needs_downconvert);
938
930 if (needs_downconvert) 939 if (needs_downconvert)
931 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 940 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
932 941
@@ -1040,18 +1049,17 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
1040 return lockres->l_pending_gen; 1049 return lockres->l_pending_gen;
1041} 1050}
1042 1051
1043 1052static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
1044static void ocfs2_blocking_ast(void *opaque, int level)
1045{ 1053{
1046 struct ocfs2_lock_res *lockres = opaque; 1054 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1047 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1055 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1048 int needs_downconvert; 1056 int needs_downconvert;
1049 unsigned long flags; 1057 unsigned long flags;
1050 1058
1051 BUG_ON(level <= DLM_LOCK_NL); 1059 BUG_ON(level <= DLM_LOCK_NL);
1052 1060
1053 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 1061 mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
1054 lockres->l_name, level, lockres->l_level, 1062 "type %s\n", lockres->l_name, level, lockres->l_level,
1055 ocfs2_lock_type_string(lockres->l_type)); 1063 ocfs2_lock_type_string(lockres->l_type));
1056 1064
1057 /* 1065 /*
@@ -1072,9 +1080,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
1072 ocfs2_wake_downconvert_thread(osb); 1080 ocfs2_wake_downconvert_thread(osb);
1073} 1081}
1074 1082
1075static void ocfs2_locking_ast(void *opaque) 1083static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
1076{ 1084{
1077 struct ocfs2_lock_res *lockres = opaque; 1085 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1078 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1086 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1079 unsigned long flags; 1087 unsigned long flags;
1080 int status; 1088 int status;
@@ -1095,6 +1103,10 @@ static void ocfs2_locking_ast(void *opaque)
1095 return; 1103 return;
1096 } 1104 }
1097 1105
1106 mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
1107 "level %d => %d\n", lockres->l_name, lockres->l_action,
1108 lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
1109
1098 switch(lockres->l_action) { 1110 switch(lockres->l_action) {
1099 case OCFS2_AST_ATTACH: 1111 case OCFS2_AST_ATTACH:
1100 ocfs2_generic_handle_attach_action(lockres); 1112 ocfs2_generic_handle_attach_action(lockres);
@@ -1107,8 +1119,8 @@ static void ocfs2_locking_ast(void *opaque)
1107 ocfs2_generic_handle_downconvert_action(lockres); 1119 ocfs2_generic_handle_downconvert_action(lockres);
1108 break; 1120 break;
1109 default: 1121 default:
1110 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " 1122 mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
1111 "lockres flags = 0x%lx, unlock action: %u\n", 1123 "flags 0x%lx, unlock: %u\n",
1112 lockres->l_name, lockres->l_action, lockres->l_flags, 1124 lockres->l_name, lockres->l_action, lockres->l_flags,
1113 lockres->l_unlock_action); 1125 lockres->l_unlock_action);
1114 BUG(); 1126 BUG();
@@ -1134,6 +1146,88 @@ out:
1134 spin_unlock_irqrestore(&lockres->l_lock, flags); 1146 spin_unlock_irqrestore(&lockres->l_lock, flags);
1135} 1147}
1136 1148
1149static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1150{
1151 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1152 unsigned long flags;
1153
1154 mlog_entry_void();
1155
1156 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
1157 lockres->l_name, lockres->l_unlock_action);
1158
1159 spin_lock_irqsave(&lockres->l_lock, flags);
1160 if (error) {
1161 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
1162 "unlock_action %d\n", error, lockres->l_name,
1163 lockres->l_unlock_action);
1164 spin_unlock_irqrestore(&lockres->l_lock, flags);
1165 mlog_exit_void();
1166 return;
1167 }
1168
1169 switch(lockres->l_unlock_action) {
1170 case OCFS2_UNLOCK_CANCEL_CONVERT:
1171 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
1172 lockres->l_action = OCFS2_AST_INVALID;
1173 /* Downconvert thread may have requeued this lock, we
1174 * need to wake it. */
1175 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
1176 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
1177 break;
1178 case OCFS2_UNLOCK_DROP_LOCK:
1179 lockres->l_level = DLM_LOCK_IV;
1180 break;
1181 default:
1182 BUG();
1183 }
1184
1185 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1187 wake_up(&lockres->l_event);
1188 spin_unlock_irqrestore(&lockres->l_lock, flags);
1189
1190 mlog_exit_void();
1191}
1192
1193/*
1194 * This is the filesystem locking protocol. It provides the lock handling
1195 * hooks for the underlying DLM. It has a maximum version number.
1196 * The version number allows interoperability with systems running at
1197 * the same major number and an equal or smaller minor number.
1198 *
1199 * Whenever the filesystem does new things with locks (adds or removes a
1200 * lock, orders them differently, does different things underneath a lock),
1201 * the version must be changed. The protocol is negotiated when joining
1202 * the dlm domain. A node may join the domain if its major version is
1203 * identical to all other nodes and its minor version is greater than
1204 * or equal to all other nodes. When its minor version is greater than
1205 * the other nodes, it will run at the minor version specified by the
1206 * other nodes.
1207 *
1208 * If a locking change is made that will not be compatible with older
1209 * versions, the major number must be increased and the minor version set
1210 * to zero. If a change merely adds a behavior that can be disabled when
1211 * speaking to older versions, the minor version must be increased. If a
1212 * change adds a fully backwards compatible change (eg, LVB changes that
1213 * are just ignored by older versions), the version does not need to be
1214 * updated.
1215 */
1216static struct ocfs2_locking_protocol lproto = {
1217 .lp_max_version = {
1218 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
1219 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
1220 },
1221 .lp_lock_ast = ocfs2_locking_ast,
1222 .lp_blocking_ast = ocfs2_blocking_ast,
1223 .lp_unlock_ast = ocfs2_unlock_ast,
1224};
1225
1226void ocfs2_set_locking_protocol(void)
1227{
1228 ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
1229}
1230
1137static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 1231static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1138 int convert) 1232 int convert)
1139{ 1233{
@@ -1189,8 +1283,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1189 &lockres->l_lksb, 1283 &lockres->l_lksb,
1190 dlm_flags, 1284 dlm_flags,
1191 lockres->l_name, 1285 lockres->l_name,
1192 OCFS2_LOCK_ID_MAX_LEN - 1, 1286 OCFS2_LOCK_ID_MAX_LEN - 1);
1193 lockres);
1194 lockres_clear_pending(lockres, gen, osb); 1287 lockres_clear_pending(lockres, gen, osb);
1195 if (ret) { 1288 if (ret) {
1196 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1289 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1412,7 +1505,7 @@ again:
1412 BUG_ON(level == DLM_LOCK_IV); 1505 BUG_ON(level == DLM_LOCK_IV);
1413 BUG_ON(level == DLM_LOCK_NL); 1506 BUG_ON(level == DLM_LOCK_NL);
1414 1507
1415 mlog(0, "lock %s, convert from %d to level = %d\n", 1508 mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
1416 lockres->l_name, lockres->l_level, level); 1509 lockres->l_name, lockres->l_level, level);
1417 1510
1418 /* call dlm_lock to upgrade lock now */ 1511 /* call dlm_lock to upgrade lock now */
@@ -1421,8 +1514,7 @@ again:
1421 &lockres->l_lksb, 1514 &lockres->l_lksb,
1422 lkm_flags, 1515 lkm_flags,
1423 lockres->l_name, 1516 lockres->l_name,
1424 OCFS2_LOCK_ID_MAX_LEN - 1, 1517 OCFS2_LOCK_ID_MAX_LEN - 1);
1425 lockres);
1426 lockres_clear_pending(lockres, gen, osb); 1518 lockres_clear_pending(lockres, gen, osb);
1427 if (ret) { 1519 if (ret) {
1428 if (!(lkm_flags & DLM_LKF_NOQUEUE) || 1520 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1859,8 +1951,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1859 spin_unlock_irqrestore(&lockres->l_lock, flags); 1951 spin_unlock_irqrestore(&lockres->l_lock, flags);
1860 1952
1861 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, 1953 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1862 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1954 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
1863 lockres);
1864 if (ret) { 1955 if (ret) {
1865 if (!trylock || (ret != -EAGAIN)) { 1956 if (!trylock || (ret != -EAGAIN)) {
1866 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1957 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -2989,7 +3080,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2989 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 3080 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2990 osb->uuid_str, 3081 osb->uuid_str,
2991 strlen(osb->uuid_str), 3082 strlen(osb->uuid_str),
2992 ocfs2_do_node_down, osb, 3083 &lproto, ocfs2_do_node_down, osb,
2993 &conn); 3084 &conn);
2994 if (status) { 3085 if (status) {
2995 mlog_errno(status); 3086 mlog_errno(status);
@@ -3056,50 +3147,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3056 mlog_exit_void(); 3147 mlog_exit_void();
3057} 3148}
3058 3149
3059static void ocfs2_unlock_ast(void *opaque, int error)
3060{
3061 struct ocfs2_lock_res *lockres = opaque;
3062 unsigned long flags;
3063
3064 mlog_entry_void();
3065
3066 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
3067 lockres->l_unlock_action);
3068
3069 spin_lock_irqsave(&lockres->l_lock, flags);
3070 if (error) {
3071 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
3072 "unlock_action %d\n", error, lockres->l_name,
3073 lockres->l_unlock_action);
3074 spin_unlock_irqrestore(&lockres->l_lock, flags);
3075 mlog_exit_void();
3076 return;
3077 }
3078
3079 switch(lockres->l_unlock_action) {
3080 case OCFS2_UNLOCK_CANCEL_CONVERT:
3081 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
3082 lockres->l_action = OCFS2_AST_INVALID;
3083 /* Downconvert thread may have requeued this lock, we
3084 * need to wake it. */
3085 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
3086 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
3087 break;
3088 case OCFS2_UNLOCK_DROP_LOCK:
3089 lockres->l_level = DLM_LOCK_IV;
3090 break;
3091 default:
3092 BUG();
3093 }
3094
3095 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
3096 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
3097 wake_up(&lockres->l_event);
3098 spin_unlock_irqrestore(&lockres->l_lock, flags);
3099
3100 mlog_exit_void();
3101}
3102
3103static int ocfs2_drop_lock(struct ocfs2_super *osb, 3150static int ocfs2_drop_lock(struct ocfs2_super *osb,
3104 struct ocfs2_lock_res *lockres) 3151 struct ocfs2_lock_res *lockres)
3105{ 3152{
@@ -3167,8 +3214,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
3167 3214
3168 mlog(0, "lock %s\n", lockres->l_name); 3215 mlog(0, "lock %s\n", lockres->l_name);
3169 3216
3170 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, 3217 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
3171 lockres);
3172 if (ret) { 3218 if (ret) {
3173 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3219 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3174 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 3220 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3276,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
3276 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); 3322 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
3277 3323
3278 if (lockres->l_level <= new_level) { 3324 if (lockres->l_level <= new_level) {
3279 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", 3325 mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
3280 lockres->l_level, new_level); 3326 "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
3327 "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
3328 new_level, list_empty(&lockres->l_blocked_list),
3329 list_empty(&lockres->l_mask_waiters), lockres->l_type,
3330 lockres->l_flags, lockres->l_ro_holders,
3331 lockres->l_ex_holders, lockres->l_action,
3332 lockres->l_unlock_action, lockres->l_requested,
3333 lockres->l_blocking, lockres->l_pending_gen);
3281 BUG(); 3334 BUG();
3282 } 3335 }
3283 3336
3284 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", 3337 mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
3285 lockres->l_name, new_level, lockres->l_blocking); 3338 lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
3286 3339
3287 lockres->l_action = OCFS2_AST_DOWNCONVERT; 3340 lockres->l_action = OCFS2_AST_DOWNCONVERT;
3288 lockres->l_requested = new_level; 3341 lockres->l_requested = new_level;
@@ -3301,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3301 3354
3302 mlog_entry_void(); 3355 mlog_entry_void();
3303 3356
3357 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
3358 lockres->l_level, new_level);
3359
3304 if (lvb) 3360 if (lvb)
3305 dlm_flags |= DLM_LKF_VALBLK; 3361 dlm_flags |= DLM_LKF_VALBLK;
3306 3362
@@ -3309,8 +3365,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3309 &lockres->l_lksb, 3365 &lockres->l_lksb,
3310 dlm_flags, 3366 dlm_flags,
3311 lockres->l_name, 3367 lockres->l_name,
3312 OCFS2_LOCK_ID_MAX_LEN - 1, 3368 OCFS2_LOCK_ID_MAX_LEN - 1);
3313 lockres);
3314 lockres_clear_pending(lockres, generation, osb); 3369 lockres_clear_pending(lockres, generation, osb);
3315 if (ret) { 3370 if (ret) {
3316 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 3371 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3331,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3331 assert_spin_locked(&lockres->l_lock); 3386 assert_spin_locked(&lockres->l_lock);
3332 3387
3333 mlog_entry_void(); 3388 mlog_entry_void();
3334 mlog(0, "lock %s\n", lockres->l_name);
3335 3389
3336 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { 3390 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3337 /* If we're already trying to cancel a lock conversion 3391 /* If we're already trying to cancel a lock conversion
3338 * then just drop the spinlock and allow the caller to 3392 * then just drop the spinlock and allow the caller to
3339 * requeue this lock. */ 3393 * requeue this lock. */
3340 3394 mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
3341 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
3342 return 0; 3395 return 0;
3343 } 3396 }
3344 3397
@@ -3353,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3353 "lock %s, invalid flags: 0x%lx\n", 3406 "lock %s, invalid flags: 0x%lx\n",
3354 lockres->l_name, lockres->l_flags); 3407 lockres->l_name, lockres->l_flags);
3355 3408
3409 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3410
3356 return 1; 3411 return 1;
3357} 3412}
3358 3413
@@ -3362,16 +3417,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3362 int ret; 3417 int ret;
3363 3418
3364 mlog_entry_void(); 3419 mlog_entry_void();
3365 mlog(0, "lock %s\n", lockres->l_name);
3366 3420
3367 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, 3421 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
3368 DLM_LKF_CANCEL, lockres); 3422 DLM_LKF_CANCEL);
3369 if (ret) { 3423 if (ret) {
3370 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3424 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3371 ocfs2_recover_from_dlm_error(lockres, 0); 3425 ocfs2_recover_from_dlm_error(lockres, 0);
3372 } 3426 }
3373 3427
3374 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); 3428 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3375 3429
3376 mlog_exit(ret); 3430 mlog_exit(ret);
3377 return ret; 3431 return ret;
@@ -3428,8 +3482,11 @@ recheck:
3428 * at the same time they set OCFS2_DLM_BUSY. They must 3482 * at the same time they set OCFS2_DLM_BUSY. They must
3429 * clear OCFS2_DLM_PENDING after dlm_lock() returns. 3483 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
3430 */ 3484 */
3431 if (lockres->l_flags & OCFS2_LOCK_PENDING) 3485 if (lockres->l_flags & OCFS2_LOCK_PENDING) {
3486 mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
3487 lockres->l_name);
3432 goto leave_requeue; 3488 goto leave_requeue;
3489 }
3433 3490
3434 ctl->requeue = 1; 3491 ctl->requeue = 1;
3435 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3492 ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3461,6 +3518,7 @@ recheck:
3461 */ 3518 */
3462 if (lockres->l_level == DLM_LOCK_NL) { 3519 if (lockres->l_level == DLM_LOCK_NL) {
3463 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders); 3520 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
3521 mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
3464 lockres->l_blocking = DLM_LOCK_NL; 3522 lockres->l_blocking = DLM_LOCK_NL;
3465 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 3523 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
3466 spin_unlock_irqrestore(&lockres->l_lock, flags); 3524 spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3470,28 +3528,41 @@ recheck:
3470 /* if we're blocking an exclusive and we have *any* holders, 3528 /* if we're blocking an exclusive and we have *any* holders,
3471 * then requeue. */ 3529 * then requeue. */
3472 if ((lockres->l_blocking == DLM_LOCK_EX) 3530 if ((lockres->l_blocking == DLM_LOCK_EX)
3473 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3531 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
3532 mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
3533 lockres->l_name, lockres->l_ex_holders,
3534 lockres->l_ro_holders);
3474 goto leave_requeue; 3535 goto leave_requeue;
3536 }
3475 3537
3476 /* If it's a PR we're blocking, then only 3538 /* If it's a PR we're blocking, then only
3477 * requeue if we've got any EX holders */ 3539 * requeue if we've got any EX holders */
3478 if (lockres->l_blocking == DLM_LOCK_PR && 3540 if (lockres->l_blocking == DLM_LOCK_PR &&
3479 lockres->l_ex_holders) 3541 lockres->l_ex_holders) {
3542 mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
3543 lockres->l_name, lockres->l_ex_holders);
3480 goto leave_requeue; 3544 goto leave_requeue;
3545 }
3481 3546
3482 /* 3547 /*
3483 * Can we get a lock in this state if the holder counts are 3548 * Can we get a lock in this state if the holder counts are
3484 * zero? The meta data unblock code used to check this. 3549 * zero? The meta data unblock code used to check this.
3485 */ 3550 */
3486 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 3551 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
3487 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) 3552 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
3553 mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
3554 lockres->l_name);
3488 goto leave_requeue; 3555 goto leave_requeue;
3556 }
3489 3557
3490 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); 3558 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
3491 3559
3492 if (lockres->l_ops->check_downconvert 3560 if (lockres->l_ops->check_downconvert
3493 && !lockres->l_ops->check_downconvert(lockres, new_level)) 3561 && !lockres->l_ops->check_downconvert(lockres, new_level)) {
3562 mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
3563 lockres->l_name);
3494 goto leave_requeue; 3564 goto leave_requeue;
3565 }
3495 3566
3496 /* If we get here, then we know that there are no more 3567 /* If we get here, then we know that there are no more
3497 * incompatible holders (and anyone asking for an incompatible 3568 * incompatible holders (and anyone asking for an incompatible
@@ -3509,13 +3580,19 @@ recheck:
3509 3580
3510 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); 3581 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
3511 3582
3512 if (ctl->unblock_action == UNBLOCK_STOP_POST) 3583 if (ctl->unblock_action == UNBLOCK_STOP_POST) {
3584 mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
3585 lockres->l_name);
3513 goto leave; 3586 goto leave;
3587 }
3514 3588
3515 spin_lock_irqsave(&lockres->l_lock, flags); 3589 spin_lock_irqsave(&lockres->l_lock, flags);
3516 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) { 3590 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
3517 /* If this changed underneath us, then we can't drop 3591 /* If this changed underneath us, then we can't drop
3518 * it just yet. */ 3592 * it just yet. */
3593 mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
3594 "Recheck\n", lockres->l_name, blocking,
3595 lockres->l_blocking, level, lockres->l_level);
3519 goto recheck; 3596 goto recheck;
3520 } 3597 }
3521 3598
@@ -3910,45 +3987,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3910 ocfs2_cluster_unlock(osb, lockres, level); 3987 ocfs2_cluster_unlock(osb, lockres, level);
3911} 3988}
3912 3989
3913/*
3914 * This is the filesystem locking protocol. It provides the lock handling
3915 * hooks for the underlying DLM. It has a maximum version number.
3916 * The version number allows interoperability with systems running at
3917 * the same major number and an equal or smaller minor number.
3918 *
3919 * Whenever the filesystem does new things with locks (adds or removes a
3920 * lock, orders them differently, does different things underneath a lock),
3921 * the version must be changed. The protocol is negotiated when joining
3922 * the dlm domain. A node may join the domain if its major version is
3923 * identical to all other nodes and its minor version is greater than
3924 * or equal to all other nodes. When its minor version is greater than
3925 * the other nodes, it will run at the minor version specified by the
3926 * other nodes.
3927 *
3928 * If a locking change is made that will not be compatible with older
3929 * versions, the major number must be increased and the minor version set
3930 * to zero. If a change merely adds a behavior that can be disabled when
3931 * speaking to older versions, the minor version must be increased. If a
3932 * change adds a fully backwards compatible change (eg, LVB changes that
3933 * are just ignored by older versions), the version does not need to be
3934 * updated.
3935 */
3936static struct ocfs2_locking_protocol lproto = {
3937 .lp_max_version = {
3938 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3939 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3940 },
3941 .lp_lock_ast = ocfs2_locking_ast,
3942 .lp_blocking_ast = ocfs2_blocking_ast,
3943 .lp_unlock_ast = ocfs2_unlock_ast,
3944};
3945
3946void ocfs2_set_locking_protocol(void)
3947{
3948 ocfs2_stack_glue_set_locking_protocol(&lproto);
3949}
3950
3951
3952static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3990static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3953 struct ocfs2_lock_res *lockres) 3991 struct ocfs2_lock_res *lockres)
3954{ 3992{
@@ -3965,7 +4003,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3965 BUG_ON(!lockres); 4003 BUG_ON(!lockres);
3966 BUG_ON(!lockres->l_ops); 4004 BUG_ON(!lockres->l_ops);
3967 4005
3968 mlog(0, "lockres %s blocked.\n", lockres->l_name); 4006 mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
3969 4007
3970 /* Detect whether a lock has been marked as going away while 4008 /* Detect whether a lock has been marked as going away while
3971 * the downconvert thread was processing other things. A lock can 4009 * the downconvert thread was processing other things. A lock can
@@ -3988,7 +4026,7 @@ unqueue:
3988 } else 4026 } else
3989 ocfs2_schedule_blocked_lock(osb, lockres); 4027 ocfs2_schedule_blocked_lock(osb, lockres);
3990 4028
3991 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, 4029 mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
3992 ctl.requeue ? "yes" : "no"); 4030 ctl.requeue ? "yes" : "no");
3993 spin_unlock_irqrestore(&lockres->l_lock, flags); 4031 spin_unlock_irqrestore(&lockres->l_lock, flags);
3994 4032
@@ -4010,7 +4048,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
4010 /* Do not schedule a lock for downconvert when it's on 4048 /* Do not schedule a lock for downconvert when it's on
4011 * the way to destruction - any nodes wanting access 4049 * the way to destruction - any nodes wanting access
4012 * to the resource will get it soon. */ 4050 * to the resource will get it soon. */
4013 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", 4051 mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
4014 lockres->l_name, lockres->l_flags); 4052 lockres->l_name, lockres->l_flags);
4015 return; 4053 return;
4016 } 4054 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 558ce0312421..5b52547d6299 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -993,10 +993,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
993 } 993 }
994 994
995 if (size_change && attr->ia_size != i_size_read(inode)) { 995 if (size_change && attr->ia_size != i_size_read(inode)) {
996 if (attr->ia_size > sb->s_maxbytes) { 996 status = inode_newsize_ok(inode, attr->ia_size);
997 status = -EFBIG; 997 if (status)
998 goto bail_unlock; 998 goto bail_unlock;
999 }
1000 999
1001 if (i_size_read(inode) > attr->ia_size) { 1000 if (i_size_read(inode) > attr->ia_size) {
1002 if (ocfs2_should_order_data(inode)) { 1001 if (ocfs2_should_order_data(inode)) {
@@ -1836,6 +1835,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1836 &meta_level); 1835 &meta_level);
1837 if (has_refcount) 1836 if (has_refcount)
1838 *has_refcount = 1; 1837 *has_refcount = 1;
1838 if (direct_io)
1839 *direct_io = 0;
1839 } 1840 }
1840 1841
1841 if (ret < 0) { 1842 if (ret < 0) {
@@ -1859,10 +1860,6 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1859 break; 1860 break;
1860 } 1861 }
1861 1862
1862 if (has_refcount && *has_refcount == 1) {
1863 *direct_io = 0;
1864 break;
1865 }
1866 /* 1863 /*
1867 * Allowing concurrent direct writes means 1864 * Allowing concurrent direct writes means
1868 * i_size changes wouldn't be synchronized, so 1865 * i_size changes wouldn't be synchronized, so
@@ -2043,7 +2040,7 @@ out_dio:
2043 * async dio is going to do it in the future or an end_io after an 2040 * async dio is going to do it in the future or an end_io after an
2044 * error has already done it. 2041 * error has already done it.
2045 */ 2042 */
2046 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2043 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2047 rw_level = -1; 2044 rw_level = -1;
2048 have_alloc_sem = 0; 2045 have_alloc_sem = 0;
2049 } 2046 }
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fef..0cd5323bd3f0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
7 * 7 *
8 */ 8 */
9 9
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_PROTO_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_PROTO_H
12 12
13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
15 15
16#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ac10f83edb95..ca992d91f511 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,7 +476,7 @@ out_mutex:
476 476
477out: 477out:
478 if (!status) 478 if (!status)
479 ocfs2_init_inode_steal_slot(osb); 479 ocfs2_init_steal_slots(osb);
480 mlog_exit(status); 480 mlog_exit(status);
481 return status; 481 return status;
482} 482}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740f448041e2..1238b491db90 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -42,6 +42,7 @@
42 42
43#include "ocfs2_fs.h" 43#include "ocfs2_fs.h"
44#include "ocfs2_lockid.h" 44#include "ocfs2_lockid.h"
45#include "ocfs2_ioctl.h"
45 46
46/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
47#include "blockcheck.h" 48#include "blockcheck.h"
@@ -159,7 +160,7 @@ struct ocfs2_lock_res {
159 int l_level; 160 int l_level;
160 unsigned int l_ro_holders; 161 unsigned int l_ro_holders;
161 unsigned int l_ex_holders; 162 unsigned int l_ex_holders;
162 union ocfs2_dlm_lksb l_lksb; 163 struct ocfs2_dlm_lksb l_lksb;
163 164
164 /* used from AST/BAST funcs. */ 165 /* used from AST/BAST funcs. */
165 enum ocfs2_ast_action l_action; 166 enum ocfs2_ast_action l_action;
@@ -305,7 +306,9 @@ struct ocfs2_super
305 u32 s_next_generation; 306 u32 s_next_generation;
306 unsigned long osb_flags; 307 unsigned long osb_flags;
307 s16 s_inode_steal_slot; 308 s16 s_inode_steal_slot;
309 s16 s_meta_steal_slot;
308 atomic_t s_num_inodes_stolen; 310 atomic_t s_num_inodes_stolen;
311 atomic_t s_num_meta_stolen;
309 312
310 unsigned long s_mount_opt; 313 unsigned long s_mount_opt;
311 unsigned int s_atime_quantum; 314 unsigned int s_atime_quantum;
@@ -760,33 +763,6 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
760 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
761} 764}
762 765
763static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
764{
765 spin_lock(&osb->osb_lock);
766 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
767 spin_unlock(&osb->osb_lock);
768 atomic_set(&osb->s_num_inodes_stolen, 0);
769}
770
771static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
772 s16 slot)
773{
774 spin_lock(&osb->osb_lock);
775 osb->s_inode_steal_slot = slot;
776 spin_unlock(&osb->osb_lock);
777}
778
779static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
780{
781 s16 slot;
782
783 spin_lock(&osb->osb_lock);
784 slot = osb->s_inode_steal_slot;
785 spin_unlock(&osb->osb_lock);
786
787 return slot;
788}
789
790#define ocfs2_set_bit ext2_set_bit 766#define ocfs2_set_bit ext2_set_bit
791#define ocfs2_clear_bit ext2_clear_bit 767#define ocfs2_clear_bit ext2_clear_bit
792#define ocfs2_test_bit ext2_test_bit 768#define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7638a38c32bc..bb37218a7978 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -254,63 +254,6 @@
254 * refcount tree */ 254 * refcount tree */
255 255
256/* 256/*
257 * ioctl commands
258 */
259#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
260#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
261#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
262#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
263
264/*
265 * Space reservation / allocation / free ioctls and argument structure
266 * are designed to be compatible with XFS.
267 *
268 * ALLOCSP* and FREESP* are not and will never be supported, but are
269 * included here for completeness.
270 */
271struct ocfs2_space_resv {
272 __s16 l_type;
273 __s16 l_whence;
274 __s64 l_start;
275 __s64 l_len; /* len == 0 means until end of file */
276 __s32 l_sysid;
277 __u32 l_pid;
278 __s32 l_pad[4]; /* reserve area */
279};
280
281#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
282#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
283#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
284#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
285#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
286#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
287#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
288#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
289
290/* Used to pass group descriptor data when online resize is done */
291struct ocfs2_new_group_input {
292 __u64 group; /* Group descriptor's blkno. */
293 __u32 clusters; /* Total number of clusters in this group */
294 __u32 frees; /* Total free clusters in this group */
295 __u16 chain; /* Chain for this group */
296 __u16 reserved1;
297 __u32 reserved2;
298};
299
300#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
313/*
314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 257 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
315 */ 258 */
316#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 259#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000000..2d3420af1a83
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,79 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_ioctl.h
5 *
6 * Defines OCFS2 ioctls.
7 *
8 * Copyright (C) 2010 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_IOCTL_H
21#define OCFS2_IOCTL_H
22
23/*
24 * ioctl commands
25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
30
31/*
32 * Space reservation / allocation / free ioctls and argument structure
33 * are designed to be compatible with XFS.
34 *
35 * ALLOCSP* and FREESP* are not and will never be supported, but are
36 * included here for completeness.
37 */
38struct ocfs2_space_resv {
39 __s16 l_type;
40 __s16 l_whence;
41 __s64 l_start;
42 __s64 l_len; /* len == 0 means until end of file */
43 __s32 l_sysid;
44 __u32 l_pid;
45 __s32 l_pad[4]; /* reserve area */
46};
47
48#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
49#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
50#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
51#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
52#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
53#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
54#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
55#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
56
57/* Used to pass group descriptor data when online resize is done */
58struct ocfs2_new_group_input {
59 __u64 group; /* Group descriptor's blkno. */
60 __u32 clusters; /* Total number of clusters in this group */
61 __u32 frees; /* Total free clusters in this group */
62 __u16 chain; /* Chain for this group */
63 __u16 reserved1;
64 __u32 reserved2;
65};
66
67#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
68#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
69#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
70
71/* Used to pass 2 file names to reflink. */
72struct reflink_arguments {
73 __u64 old_path;
74 __u64 new_path;
75 __u64 preserve;
76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78
79#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0fff..2e45c8d2ea7e 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
23/* 23/*
24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for 24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for
25 * more details. 25 * more details.
26 *
27 * 1.0 - Initial locking version from ocfs2 1.4.
26 */ 28 */
27#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 29#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
28#define OCFS2_LOCKING_PROTOCOL_MINOR 0 30#define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 8ae65c9c020c..fb6aa7acf54b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -626,7 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
626 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 626 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
627 memset(rb, 0, inode->i_sb->s_blocksize); 627 memset(rb, 0, inode->i_sb->s_blocksize);
628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 629 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
632 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -1330,7 +1330,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1330 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1330 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1331 1331
1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1333 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1333 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1335 new_rb->rf_blkno = cpu_to_le64(blkno);
1336 new_rb->rf_cpos = cpu_to_le32(0); 1336 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1576,7 +1576,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1576 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1576 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1577 memset(new_rb, 0, sb->s_blocksize); 1577 memset(new_rb, 0, sb->s_blocksize);
1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1579 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1579 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1582 new_rb->rf_blkno = cpu_to_le64(blkno); 1582 new_rb->rf_blkno = cpu_to_le64(blkno);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 3038c92af493..7020e1253ffa 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -161,24 +161,23 @@ static int dlm_status_to_errno(enum dlm_status status)
161 161
162static void o2dlm_lock_ast_wrapper(void *astarg) 162static void o2dlm_lock_ast_wrapper(void *astarg)
163{ 163{
164 BUG_ON(o2cb_stack.sp_proto == NULL); 164 struct ocfs2_dlm_lksb *lksb = astarg;
165 165
166 o2cb_stack.sp_proto->lp_lock_ast(astarg); 166 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
167} 167}
168 168
169static void o2dlm_blocking_ast_wrapper(void *astarg, int level) 169static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
170{ 170{
171 BUG_ON(o2cb_stack.sp_proto == NULL); 171 struct ocfs2_dlm_lksb *lksb = astarg;
172 172
173 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); 173 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
174} 174}
175 175
176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) 176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
177{ 177{
178 struct ocfs2_dlm_lksb *lksb = astarg;
178 int error = dlm_status_to_errno(status); 179 int error = dlm_status_to_errno(status);
179 180
180 BUG_ON(o2cb_stack.sp_proto == NULL);
181
182 /* 181 /*
183 * In o2dlm, you can get both the lock_ast() for the lock being 182 * In o2dlm, you can get both the lock_ast() for the lock being
184 * granted and the unlock_ast() for the CANCEL failing. A 183 * granted and the unlock_ast() for the CANCEL failing. A
@@ -193,16 +192,15 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
193 if (status == DLM_CANCELGRANT) 192 if (status == DLM_CANCELGRANT)
194 return; 193 return;
195 194
196 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); 195 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
197} 196}
198 197
199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, 198static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
200 int mode, 199 int mode,
201 union ocfs2_dlm_lksb *lksb, 200 struct ocfs2_dlm_lksb *lksb,
202 u32 flags, 201 u32 flags,
203 void *name, 202 void *name,
204 unsigned int namelen, 203 unsigned int namelen)
205 void *astarg)
206{ 204{
207 enum dlm_status status; 205 enum dlm_status status;
208 int o2dlm_mode = mode_to_o2dlm(mode); 206 int o2dlm_mode = mode_to_o2dlm(mode);
@@ -211,28 +209,27 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
211 209
212 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, 210 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
213 o2dlm_flags, name, namelen, 211 o2dlm_flags, name, namelen,
214 o2dlm_lock_ast_wrapper, astarg, 212 o2dlm_lock_ast_wrapper, lksb,
215 o2dlm_blocking_ast_wrapper); 213 o2dlm_blocking_ast_wrapper);
216 ret = dlm_status_to_errno(status); 214 ret = dlm_status_to_errno(status);
217 return ret; 215 return ret;
218} 216}
219 217
220static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, 218static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
221 union ocfs2_dlm_lksb *lksb, 219 struct ocfs2_dlm_lksb *lksb,
222 u32 flags, 220 u32 flags)
223 void *astarg)
224{ 221{
225 enum dlm_status status; 222 enum dlm_status status;
226 int o2dlm_flags = flags_to_o2dlm(flags); 223 int o2dlm_flags = flags_to_o2dlm(flags);
227 int ret; 224 int ret;
228 225
229 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, 226 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
230 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); 227 o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
231 ret = dlm_status_to_errno(status); 228 ret = dlm_status_to_errno(status);
232 return ret; 229 return ret;
233} 230}
234 231
235static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 232static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
236{ 233{
237 return dlm_status_to_errno(lksb->lksb_o2dlm.status); 234 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
238} 235}
@@ -242,17 +239,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
242 * contents, it will zero out the LVB. Thus the caller can always trust 239 * contents, it will zero out the LVB. Thus the caller can always trust
243 * the contents. 240 * the contents.
244 */ 241 */
245static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 242static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
246{ 243{
247 return 1; 244 return 1;
248} 245}
249 246
250static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) 247static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
251{ 248{
252 return (void *)(lksb->lksb_o2dlm.lvb); 249 return (void *)(lksb->lksb_o2dlm.lvb);
253} 250}
254 251
255static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) 252static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256{ 253{
257 dlm_print_one_lock(lksb->lksb_o2dlm.lockid); 254 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
258} 255}
@@ -280,7 +277,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
280 struct dlm_protocol_version fs_version; 277 struct dlm_protocol_version fs_version;
281 278
282 BUG_ON(conn == NULL); 279 BUG_ON(conn == NULL);
283 BUG_ON(o2cb_stack.sp_proto == NULL); 280 BUG_ON(conn->cc_proto == NULL);
284 281
285 /* for now we only have one cluster/node, make sure we see it 282 /* for now we only have one cluster/node, make sure we see it
286 * in the heartbeat universe */ 283 * in the heartbeat universe */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index da78a2a334fd..5ae8812b2864 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -25,7 +25,6 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27 27
28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 28#include "stackglue.h"
30 29
31#include <linux/dlm_plock.h> 30#include <linux/dlm_plock.h>
@@ -63,8 +62,8 @@
63 * negotiated by the client. The client negotiates based on the maximum 62 * negotiated by the client. The client negotiates based on the maximum
64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major 63 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
65 * number from the "SETV" message must match 64 * number from the "SETV" message must match
66 * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number 65 * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
67 * must be less than or equal to ...->lp_max_version.pv_minor. 66 * must be less than or equal to ...sp_max_version.pv_minor.
68 * 67 *
69 * Once this information has been set, mounts will be allowed. From this 68 * Once this information has been set, mounts will be allowed. From this
70 * point on, the "DOWN" message can be sent for node down notification. 69 * point on, the "DOWN" message can be sent for node down notification.
@@ -401,7 +400,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
401 char *ptr = NULL; 400 char *ptr = NULL;
402 struct ocfs2_control_private *p = file->private_data; 401 struct ocfs2_control_private *p = file->private_data;
403 struct ocfs2_protocol_version *max = 402 struct ocfs2_protocol_version *max =
404 &ocfs2_user_plugin.sp_proto->lp_max_version; 403 &ocfs2_user_plugin.sp_max_proto;
405 404
406 if (ocfs2_control_get_handshake_state(file) != 405 if (ocfs2_control_get_handshake_state(file) !=
407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL) 406 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -664,18 +663,10 @@ static void ocfs2_control_exit(void)
664 -rc); 663 -rc);
665} 664}
666 665
667static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
668{
669 struct ocfs2_lock_res *res = astarg;
670 return &res->l_lksb.lksb_fsdlm;
671}
672
673static void fsdlm_lock_ast_wrapper(void *astarg) 666static void fsdlm_lock_ast_wrapper(void *astarg)
674{ 667{
675 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); 668 struct ocfs2_dlm_lksb *lksb = astarg;
676 int status = lksb->sb_status; 669 int status = lksb->lksb_fsdlm.sb_status;
677
678 BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
679 670
680 /* 671 /*
681 * For now we're punting on the issue of other non-standard errors 672 * For now we're punting on the issue of other non-standard errors
@@ -688,25 +679,24 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
688 */ 679 */
689 680
690 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) 681 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
691 ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); 682 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
692 else 683 else
693 ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); 684 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
694} 685}
695 686
696static void fsdlm_blocking_ast_wrapper(void *astarg, int level) 687static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
697{ 688{
698 BUG_ON(ocfs2_user_plugin.sp_proto == NULL); 689 struct ocfs2_dlm_lksb *lksb = astarg;
699 690
700 ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); 691 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
701} 692}
702 693
703static int user_dlm_lock(struct ocfs2_cluster_connection *conn, 694static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
704 int mode, 695 int mode,
705 union ocfs2_dlm_lksb *lksb, 696 struct ocfs2_dlm_lksb *lksb,
706 u32 flags, 697 u32 flags,
707 void *name, 698 void *name,
708 unsigned int namelen, 699 unsigned int namelen)
709 void *astarg)
710{ 700{
711 int ret; 701 int ret;
712 702
@@ -716,36 +706,35 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
716 706
717 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, 707 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
718 flags|DLM_LKF_NODLCKWT, name, namelen, 0, 708 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
719 fsdlm_lock_ast_wrapper, astarg, 709 fsdlm_lock_ast_wrapper, lksb,
720 fsdlm_blocking_ast_wrapper); 710 fsdlm_blocking_ast_wrapper);
721 return ret; 711 return ret;
722} 712}
723 713
724static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, 714static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
725 union ocfs2_dlm_lksb *lksb, 715 struct ocfs2_dlm_lksb *lksb,
726 u32 flags, 716 u32 flags)
727 void *astarg)
728{ 717{
729 int ret; 718 int ret;
730 719
731 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, 720 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
732 flags, &lksb->lksb_fsdlm, astarg); 721 flags, &lksb->lksb_fsdlm, lksb);
733 return ret; 722 return ret;
734} 723}
735 724
736static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 725static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
737{ 726{
738 return lksb->lksb_fsdlm.sb_status; 727 return lksb->lksb_fsdlm.sb_status;
739} 728}
740 729
741static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 730static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
742{ 731{
743 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; 732 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
744 733
745 return !invalid; 734 return !invalid;
746} 735}
747 736
748static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) 737static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
749{ 738{
750 if (!lksb->lksb_fsdlm.sb_lvbptr) 739 if (!lksb->lksb_fsdlm.sb_lvbptr)
751 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + 740 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -753,7 +742,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
753 return (void *)(lksb->lksb_fsdlm.sb_lvbptr); 742 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
754} 743}
755 744
756static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 745static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
757{ 746{
758} 747}
759 748
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index f3df0baa9a48..39abf89697ed 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -36,7 +36,7 @@
36#define OCFS2_STACK_PLUGIN_USER "user" 36#define OCFS2_STACK_PLUGIN_USER "user"
37#define OCFS2_MAX_HB_CTL_PATH 256 37#define OCFS2_MAX_HB_CTL_PATH 256
38 38
39static struct ocfs2_locking_protocol *lproto; 39static struct ocfs2_protocol_version locking_max_version;
40static DEFINE_SPINLOCK(ocfs2_stack_lock); 40static DEFINE_SPINLOCK(ocfs2_stack_lock);
41static LIST_HEAD(ocfs2_stack_list); 41static LIST_HEAD(ocfs2_stack_list);
42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; 42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
176 spin_lock(&ocfs2_stack_lock); 176 spin_lock(&ocfs2_stack_lock);
177 if (!ocfs2_stack_lookup(plugin->sp_name)) { 177 if (!ocfs2_stack_lookup(plugin->sp_name)) {
178 plugin->sp_count = 0; 178 plugin->sp_count = 0;
179 plugin->sp_proto = lproto; 179 plugin->sp_max_proto = locking_max_version;
180 list_add(&plugin->sp_list, &ocfs2_stack_list); 180 list_add(&plugin->sp_list, &ocfs2_stack_list);
181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", 181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
182 plugin->sp_name); 182 plugin->sp_name);
@@ -213,77 +213,76 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
213} 213}
214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); 214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
215 215
216void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) 216void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
217{ 217{
218 struct ocfs2_stack_plugin *p; 218 struct ocfs2_stack_plugin *p;
219 219
220 BUG_ON(proto == NULL);
221
222 spin_lock(&ocfs2_stack_lock); 220 spin_lock(&ocfs2_stack_lock);
223 BUG_ON(active_stack != NULL); 221 if (memcmp(max_proto, &locking_max_version,
222 sizeof(struct ocfs2_protocol_version))) {
223 BUG_ON(locking_max_version.pv_major != 0);
224 224
225 lproto = proto; 225 locking_max_version = *max_proto;
226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) { 226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
227 p->sp_proto = lproto; 227 p->sp_max_proto = locking_max_version;
228 }
228 } 229 }
229
230 spin_unlock(&ocfs2_stack_lock); 230 spin_unlock(&ocfs2_stack_lock);
231} 231}
232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); 232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
233 233
234 234
235/* 235/*
236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take 236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
237 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the 237 * for the ast and bast functions. They will pass the lksb to the ast
238 * underlying stack plugins need to pilfer the lksb off of the lock_res. 238 * and bast. The caller can wrap the lksb with their own structure to
239 * If some other structure needs to be passed as an astarg, the plugins 239 * get more information.
240 * will need to be given a different avenue to the lksb.
241 */ 240 */
242int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 241int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
243 int mode, 242 int mode,
244 union ocfs2_dlm_lksb *lksb, 243 struct ocfs2_dlm_lksb *lksb,
245 u32 flags, 244 u32 flags,
246 void *name, 245 void *name,
247 unsigned int namelen, 246 unsigned int namelen)
248 struct ocfs2_lock_res *astarg)
249{ 247{
250 BUG_ON(lproto == NULL); 248 if (!lksb->lksb_conn)
251 249 lksb->lksb_conn = conn;
250 else
251 BUG_ON(lksb->lksb_conn != conn);
252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, 252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
253 name, namelen, astarg); 253 name, namelen);
254} 254}
255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); 255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
256 256
257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
258 union ocfs2_dlm_lksb *lksb, 258 struct ocfs2_dlm_lksb *lksb,
259 u32 flags, 259 u32 flags)
260 struct ocfs2_lock_res *astarg)
261{ 260{
262 BUG_ON(lproto == NULL); 261 BUG_ON(lksb->lksb_conn == NULL);
263 262
264 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); 263 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
265} 264}
266EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); 265EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
267 266
268int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 267int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
269{ 268{
270 return active_stack->sp_ops->lock_status(lksb); 269 return active_stack->sp_ops->lock_status(lksb);
271} 270}
272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); 271EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
273 272
274int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 273int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
275{ 274{
276 return active_stack->sp_ops->lvb_valid(lksb); 275 return active_stack->sp_ops->lvb_valid(lksb);
277} 276}
278EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); 277EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
279 278
280void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) 279void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
281{ 280{
282 return active_stack->sp_ops->lock_lvb(lksb); 281 return active_stack->sp_ops->lock_lvb(lksb);
283} 282}
284EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); 283EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
285 284
286void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 285void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
287{ 286{
288 active_stack->sp_ops->dump_lksb(lksb); 287 active_stack->sp_ops->dump_lksb(lksb);
289} 288}
@@ -312,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock);
312int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
313 const char *group, 312 const char *group,
314 int grouplen, 313 int grouplen,
314 struct ocfs2_locking_protocol *lproto,
315 void (*recovery_handler)(int node_num, 315 void (*recovery_handler)(int node_num,
316 void *recovery_data), 316 void *recovery_data),
317 void *recovery_data, 317 void *recovery_data,
@@ -329,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name,
329 goto out; 329 goto out;
330 } 330 }
331 331
332 if (memcmp(&lproto->lp_max_version, &locking_max_version,
333 sizeof(struct ocfs2_protocol_version))) {
334 rc = -EINVAL;
335 goto out;
336 }
337
332 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), 338 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
333 GFP_KERNEL); 339 GFP_KERNEL);
334 if (!new_conn) { 340 if (!new_conn) {
@@ -341,6 +347,7 @@ int ocfs2_cluster_connect(const char *stack_name,
341 new_conn->cc_recovery_handler = recovery_handler; 347 new_conn->cc_recovery_handler = recovery_handler;
342 new_conn->cc_recovery_data = recovery_data; 348 new_conn->cc_recovery_data = recovery_data;
343 349
350 new_conn->cc_proto = lproto;
344 /* Start the new connection at our maximum compatibility level */ 351 /* Start the new connection at our maximum compatibility level */
345 new_conn->cc_version = lproto->lp_max_version; 352 new_conn->cc_version = lproto->lp_max_version;
346 353
@@ -366,6 +373,24 @@ out:
366} 373}
367EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); 374EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
368 375
376/* The caller will ensure all nodes have the same cluster stack */
377int ocfs2_cluster_connect_agnostic(const char *group,
378 int grouplen,
379 struct ocfs2_locking_protocol *lproto,
380 void (*recovery_handler)(int node_num,
381 void *recovery_data),
382 void *recovery_data,
383 struct ocfs2_cluster_connection **conn)
384{
385 char *stack_name = NULL;
386
387 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
390 recovery_handler, recovery_data, conn);
391}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393
369/* If hangup_pending is 0, the stack driver will be dropped */ 394/* If hangup_pending is 0, the stack driver will be dropped */
370int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 395int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
371 int hangup_pending) 396 int hangup_pending)
@@ -453,10 +478,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
453 ssize_t ret = 0; 478 ssize_t ret = 0;
454 479
455 spin_lock(&ocfs2_stack_lock); 480 spin_lock(&ocfs2_stack_lock);
456 if (lproto) 481 if (locking_max_version.pv_major)
457 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", 482 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
458 lproto->lp_max_version.pv_major, 483 locking_max_version.pv_major,
459 lproto->lp_max_version.pv_minor); 484 locking_max_version.pv_minor);
460 spin_unlock(&ocfs2_stack_lock); 485 spin_unlock(&ocfs2_stack_lock);
461 486
462 return ret; 487 return ret;
@@ -685,7 +710,10 @@ static int __init ocfs2_stack_glue_init(void)
685 710
686static void __exit ocfs2_stack_glue_exit(void) 711static void __exit ocfs2_stack_glue_exit(void)
687{ 712{
688 lproto = NULL; 713 memset(&locking_max_version, 0,
714 sizeof(struct ocfs2_protocol_version));
715 locking_max_version.pv_major = 0;
716 locking_max_version.pv_minor = 0;
689 ocfs2_sysfs_exit(); 717 ocfs2_sysfs_exit();
690 if (ocfs2_table_header) 718 if (ocfs2_table_header)
691 unregister_sysctl_table(ocfs2_table_header); 719 unregister_sysctl_table(ocfs2_table_header);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 03a44d60eac9..8ce7398ae1d2 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -56,17 +56,6 @@ struct ocfs2_protocol_version {
56}; 56};
57 57
58/* 58/*
59 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
60 */
61struct ocfs2_locking_protocol {
62 struct ocfs2_protocol_version lp_max_version;
63 void (*lp_lock_ast)(void *astarg);
64 void (*lp_blocking_ast)(void *astarg, int level);
65 void (*lp_unlock_ast)(void *astarg, int error);
66};
67
68
69/*
70 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only 59 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
71 * has a pointer to separately allocated lvb space. This struct exists only to 60 * has a pointer to separately allocated lvb space. This struct exists only to
72 * include in the lksb union to make space for a combined dlm_lksb and lvb. 61 * include in the lksb union to make space for a combined dlm_lksb and lvb.
@@ -81,12 +70,27 @@ struct fsdlm_lksb_plus_lvb {
81 * size of the union is known. Lock status structures are embedded in 70 * size of the union is known. Lock status structures are embedded in
82 * ocfs2 inodes. 71 * ocfs2 inodes.
83 */ 72 */
84union ocfs2_dlm_lksb { 73struct ocfs2_cluster_connection;
85 struct dlm_lockstatus lksb_o2dlm; 74struct ocfs2_dlm_lksb {
86 struct dlm_lksb lksb_fsdlm; 75 union {
87 struct fsdlm_lksb_plus_lvb padding; 76 struct dlm_lockstatus lksb_o2dlm;
77 struct dlm_lksb lksb_fsdlm;
78 struct fsdlm_lksb_plus_lvb padding;
79 };
80 struct ocfs2_cluster_connection *lksb_conn;
81};
82
83/*
84 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
85 */
86struct ocfs2_locking_protocol {
87 struct ocfs2_protocol_version lp_max_version;
88 void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
89 void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
90 void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
88}; 91};
89 92
93
90/* 94/*
91 * A cluster connection. Mostly opaque to ocfs2, the connection holds 95 * A cluster connection. Mostly opaque to ocfs2, the connection holds
92 * state for the underlying stack. ocfs2 does use cc_version to determine 96 * state for the underlying stack. ocfs2 does use cc_version to determine
@@ -96,6 +100,7 @@ struct ocfs2_cluster_connection {
96 char cc_name[GROUP_NAME_MAX]; 100 char cc_name[GROUP_NAME_MAX];
97 int cc_namelen; 101 int cc_namelen;
98 struct ocfs2_protocol_version cc_version; 102 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto;
99 void (*cc_recovery_handler)(int node_num, void *recovery_data); 104 void (*cc_recovery_handler)(int node_num, void *recovery_data);
100 void *cc_recovery_data; 105 void *cc_recovery_data;
101 void *cc_lockspace; 106 void *cc_lockspace;
@@ -155,27 +160,29 @@ struct ocfs2_stack_operations {
155 * 160 *
156 * ast and bast functions are not part of the call because the 161 * ast and bast functions are not part of the call because the
157 * stack will likely want to wrap ast and bast calls before passing 162 * stack will likely want to wrap ast and bast calls before passing
158 * them to stack->sp_proto. 163 * them to stack->sp_proto. There is no astarg. The lksb will
164 * be passed back to the ast and bast functions. The caller can
165 * use this to find their object.
159 */ 166 */
160 int (*dlm_lock)(struct ocfs2_cluster_connection *conn, 167 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
161 int mode, 168 int mode,
162 union ocfs2_dlm_lksb *lksb, 169 struct ocfs2_dlm_lksb *lksb,
163 u32 flags, 170 u32 flags,
164 void *name, 171 void *name,
165 unsigned int namelen, 172 unsigned int namelen);
166 void *astarg);
167 173
168 /* 174 /*
169 * Call the underlying dlm unlock function. The ->dlm_unlock() 175 * Call the underlying dlm unlock function. The ->dlm_unlock()
170 * function should convert the flags as appropriate. 176 * function should convert the flags as appropriate.
171 * 177 *
172 * The unlock ast is not passed, as the stack will want to wrap 178 * The unlock ast is not passed, as the stack will want to wrap
173 * it before calling stack->sp_proto->lp_unlock_ast(). 179 * it before calling stack->sp_proto->lp_unlock_ast(). There is
180 * no astarg. The lksb will be passed back to the unlock ast
181 * function. The caller can use this to find their object.
174 */ 182 */
175 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, 183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
176 union ocfs2_dlm_lksb *lksb, 184 struct ocfs2_dlm_lksb *lksb,
177 u32 flags, 185 u32 flags);
178 void *astarg);
179 186
180 /* 187 /*
181 * Return the status of the current lock status block. The fs 188 * Return the status of the current lock status block. The fs
@@ -183,17 +190,17 @@ struct ocfs2_stack_operations {
183 * callback pulls out the stack-specific lksb, converts the status 190 * callback pulls out the stack-specific lksb, converts the status
184 * to a proper errno, and returns it. 191 * to a proper errno, and returns it.
185 */ 192 */
186 int (*lock_status)(union ocfs2_dlm_lksb *lksb); 193 int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
187 194
188 /* 195 /*
189 * Return non-zero if the LVB is valid. 196 * Return non-zero if the LVB is valid.
190 */ 197 */
191 int (*lvb_valid)(union ocfs2_dlm_lksb *lksb); 198 int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
192 199
193 /* 200 /*
194 * Pull the lvb pointer off of the stack-specific lksb. 201 * Pull the lvb pointer off of the stack-specific lksb.
195 */ 202 */
196 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 203 void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
197 204
198 /* 205 /*
199 * Cluster-aware posix locks 206 * Cluster-aware posix locks
@@ -210,7 +217,7 @@ struct ocfs2_stack_operations {
210 * This is an optoinal debugging hook. If provided, the 217 * This is an optoinal debugging hook. If provided, the
211 * stack can dump debugging information about this lock. 218 * stack can dump debugging information about this lock.
212 */ 219 */
213 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); 220 void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
214}; 221};
215 222
216/* 223/*
@@ -226,7 +233,7 @@ struct ocfs2_stack_plugin {
226 /* These are managed by the stackglue code. */ 233 /* These are managed by the stackglue code. */
227 struct list_head sp_list; 234 struct list_head sp_list;
228 unsigned int sp_count; 235 unsigned int sp_count;
229 struct ocfs2_locking_protocol *sp_proto; 236 struct ocfs2_protocol_version sp_max_proto;
230}; 237};
231 238
232 239
@@ -234,10 +241,22 @@ struct ocfs2_stack_plugin {
234int ocfs2_cluster_connect(const char *stack_name, 241int ocfs2_cluster_connect(const char *stack_name,
235 const char *group, 242 const char *group,
236 int grouplen, 243 int grouplen,
244 struct ocfs2_locking_protocol *lproto,
237 void (*recovery_handler)(int node_num, 245 void (*recovery_handler)(int node_num,
238 void *recovery_data), 246 void *recovery_data),
239 void *recovery_data, 247 void *recovery_data,
240 struct ocfs2_cluster_connection **conn); 248 struct ocfs2_cluster_connection **conn);
249/*
250 * Used by callers that don't store their stack name. They must ensure
251 * all nodes have the same stack.
252 */
253int ocfs2_cluster_connect_agnostic(const char *group,
254 int grouplen,
255 struct ocfs2_locking_protocol *lproto,
256 void (*recovery_handler)(int node_num,
257 void *recovery_data),
258 void *recovery_data,
259 struct ocfs2_cluster_connection **conn);
241int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
242 int hangup_pending); 261 int hangup_pending);
243void ocfs2_cluster_hangup(const char *group, int grouplen); 262void ocfs2_cluster_hangup(const char *group, int grouplen);
@@ -246,26 +265,24 @@ int ocfs2_cluster_this_node(unsigned int *node);
246struct ocfs2_lock_res; 265struct ocfs2_lock_res;
247int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
248 int mode, 267 int mode,
249 union ocfs2_dlm_lksb *lksb, 268 struct ocfs2_dlm_lksb *lksb,
250 u32 flags, 269 u32 flags,
251 void *name, 270 void *name,
252 unsigned int namelen, 271 unsigned int namelen);
253 struct ocfs2_lock_res *astarg);
254int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 272int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
255 union ocfs2_dlm_lksb *lksb, 273 struct ocfs2_dlm_lksb *lksb,
256 u32 flags, 274 u32 flags);
257 struct ocfs2_lock_res *astarg);
258 275
259int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); 276int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
260int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb); 277int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
261void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 278void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
262void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 279void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
263 280
264int ocfs2_stack_supports_plocks(void); 281int ocfs2_stack_supports_plocks(void);
265int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, 282int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
266 struct file *file, int cmd, struct file_lock *fl); 283 struct file *file, int cmd, struct file_lock *fl);
267 284
268void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 285void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
269 286
270 287
271/* Used by stack plugins */ 288/* Used by stack plugins */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..c3c60bc3e072 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -51,7 +51,7 @@
51#define ALLOC_NEW_GROUP 0x1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2 52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
53 53
54#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -637,12 +637,113 @@ bail:
637 return status; 637 return status;
638} 638}
639 639
640static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
641{
642 spin_lock(&osb->osb_lock);
643 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
644 spin_unlock(&osb->osb_lock);
645 atomic_set(&osb->s_num_inodes_stolen, 0);
646}
647
648static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
649{
650 spin_lock(&osb->osb_lock);
651 osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
652 spin_unlock(&osb->osb_lock);
653 atomic_set(&osb->s_num_meta_stolen, 0);
654}
655
656void ocfs2_init_steal_slots(struct ocfs2_super *osb)
657{
658 ocfs2_init_inode_steal_slot(osb);
659 ocfs2_init_meta_steal_slot(osb);
660}
661
662static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
663{
664 spin_lock(&osb->osb_lock);
665 if (type == INODE_ALLOC_SYSTEM_INODE)
666 osb->s_inode_steal_slot = slot;
667 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
668 osb->s_meta_steal_slot = slot;
669 spin_unlock(&osb->osb_lock);
670}
671
672static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
673{
674 int slot = OCFS2_INVALID_SLOT;
675
676 spin_lock(&osb->osb_lock);
677 if (type == INODE_ALLOC_SYSTEM_INODE)
678 slot = osb->s_inode_steal_slot;
679 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
680 slot = osb->s_meta_steal_slot;
681 spin_unlock(&osb->osb_lock);
682
683 return slot;
684}
685
686static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
687{
688 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
689}
690
691static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
692{
693 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
694}
695
696static int ocfs2_steal_resource(struct ocfs2_super *osb,
697 struct ocfs2_alloc_context *ac,
698 int type)
699{
700 int i, status = -ENOSPC;
701 int slot = __ocfs2_get_steal_slot(osb, type);
702
703 /* Start to steal resource from the first slot after ours. */
704 if (slot == OCFS2_INVALID_SLOT)
705 slot = osb->slot_num + 1;
706
707 for (i = 0; i < osb->max_slots; i++, slot++) {
708 if (slot == osb->max_slots)
709 slot = 0;
710
711 if (slot == osb->slot_num)
712 continue;
713
714 status = ocfs2_reserve_suballoc_bits(osb, ac,
715 type,
716 (u32)slot, NULL,
717 NOT_ALLOC_NEW_GROUP);
718 if (status >= 0) {
719 __ocfs2_set_steal_slot(osb, slot, type);
720 break;
721 }
722
723 ocfs2_free_ac_resource(ac);
724 }
725
726 return status;
727}
728
729static int ocfs2_steal_inode(struct ocfs2_super *osb,
730 struct ocfs2_alloc_context *ac)
731{
732 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
733}
734
735static int ocfs2_steal_meta(struct ocfs2_super *osb,
736 struct ocfs2_alloc_context *ac)
737{
738 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
739}
740
640int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 741int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
641 int blocks, 742 int blocks,
642 struct ocfs2_alloc_context **ac) 743 struct ocfs2_alloc_context **ac)
643{ 744{
644 int status; 745 int status;
645 u32 slot; 746 int slot = ocfs2_get_meta_steal_slot(osb);
646 747
647 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 748 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
648 if (!(*ac)) { 749 if (!(*ac)) {
@@ -653,12 +754,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
653 754
654 (*ac)->ac_bits_wanted = blocks; 755 (*ac)->ac_bits_wanted = blocks;
655 (*ac)->ac_which = OCFS2_AC_USE_META; 756 (*ac)->ac_which = OCFS2_AC_USE_META;
656 slot = osb->slot_num;
657 (*ac)->ac_group_search = ocfs2_block_group_search; 757 (*ac)->ac_group_search = ocfs2_block_group_search;
658 758
759 if (slot != OCFS2_INVALID_SLOT &&
760 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
761 goto extent_steal;
762
763 atomic_set(&osb->s_num_meta_stolen, 0);
659 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 764 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
660 EXTENT_ALLOC_SYSTEM_INODE, 765 EXTENT_ALLOC_SYSTEM_INODE,
661 slot, NULL, ALLOC_NEW_GROUP); 766 (u32)osb->slot_num, NULL,
767 ALLOC_NEW_GROUP);
768
769
770 if (status >= 0) {
771 status = 0;
772 if (slot != OCFS2_INVALID_SLOT)
773 ocfs2_init_meta_steal_slot(osb);
774 goto bail;
775 } else if (status < 0 && status != -ENOSPC) {
776 mlog_errno(status);
777 goto bail;
778 }
779
780 ocfs2_free_ac_resource(*ac);
781
782extent_steal:
783 status = ocfs2_steal_meta(osb, *ac);
784 atomic_inc(&osb->s_num_meta_stolen);
662 if (status < 0) { 785 if (status < 0) {
663 if (status != -ENOSPC) 786 if (status != -ENOSPC)
664 mlog_errno(status); 787 mlog_errno(status);
@@ -685,43 +808,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
685 ac); 808 ac);
686} 809}
687 810
688static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
689 struct ocfs2_alloc_context *ac)
690{
691 int i, status = -ENOSPC;
692 s16 slot = ocfs2_get_inode_steal_slot(osb);
693
694 /* Start to steal inodes from the first slot after ours. */
695 if (slot == OCFS2_INVALID_SLOT)
696 slot = osb->slot_num + 1;
697
698 for (i = 0; i < osb->max_slots; i++, slot++) {
699 if (slot == osb->max_slots)
700 slot = 0;
701
702 if (slot == osb->slot_num)
703 continue;
704
705 status = ocfs2_reserve_suballoc_bits(osb, ac,
706 INODE_ALLOC_SYSTEM_INODE,
707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
709 if (status >= 0) {
710 ocfs2_set_inode_steal_slot(osb, slot);
711 break;
712 }
713
714 ocfs2_free_ac_resource(ac);
715 }
716
717 return status;
718}
719
720int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 811int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
721 struct ocfs2_alloc_context **ac) 812 struct ocfs2_alloc_context **ac)
722{ 813{
723 int status; 814 int status;
724 s16 slot = ocfs2_get_inode_steal_slot(osb); 815 int slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group; 816 u64 alloc_group;
726 817
727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 818 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +845,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
754 * need to check our slots to see whether there is some space for us. 845 * need to check our slots to see whether there is some space for us.
755 */ 846 */
756 if (slot != OCFS2_INVALID_SLOT && 847 if (slot != OCFS2_INVALID_SLOT &&
757 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) 848 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
758 goto inode_steal; 849 goto inode_steal;
759 850
760 atomic_set(&osb->s_num_inodes_stolen, 0); 851 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group; 852 alloc_group = osb->osb_inode_alloc_group;
762 status = ocfs2_reserve_suballoc_bits(osb, *ac, 853 status = ocfs2_reserve_suballoc_bits(osb, *ac,
763 INODE_ALLOC_SYSTEM_INODE, 854 INODE_ALLOC_SYSTEM_INODE,
764 osb->slot_num, 855 (u32)osb->slot_num,
765 &alloc_group, 856 &alloc_group,
766 ALLOC_NEW_GROUP | 857 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL); 858 ALLOC_GROUPS_FROM_GLOBAL);
@@ -789,7 +880,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
789 ocfs2_free_ac_resource(*ac); 880 ocfs2_free_ac_resource(*ac);
790 881
791inode_steal: 882inode_steal:
792 status = ocfs2_steal_inode_from_other_nodes(osb, *ac); 883 status = ocfs2_steal_inode(osb, *ac);
793 atomic_inc(&osb->s_num_inodes_stolen); 884 atomic_inc(&osb->s_num_inodes_stolen);
794 if (status < 0) { 885 if (status < 0) {
795 if (status != -ENOSPC) 886 if (status != -ENOSPC)
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a43164..fa60723c43e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,7 @@ struct ocfs2_alloc_context {
56 is the same as ~0 - unlimited */ 56 is the same as ~0 - unlimited */
57}; 57};
58 58
59void ocfs2_init_steal_slots(struct ocfs2_super *osb);
59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 60void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
60static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) 61static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
61{ 62{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 755cd49a5ef3..dee03197a494 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
69#include "xattr.h" 69#include "xattr.h"
70#include "quota.h" 70#include "quota.h"
71#include "refcounttree.h" 71#include "refcounttree.h"
72#include "suballoc.h"
72 73
73#include "buffer_head_io.h" 74#include "buffer_head_io.h"
74 75
@@ -301,9 +302,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
301 302
302 spin_lock(&osb->osb_lock); 303 spin_lock(&osb->osb_lock);
303 out += snprintf(buf + out, len - out, 304 out += snprintf(buf + out, len - out,
304 "%10s => Slot: %d NumStolen: %d\n", "Steal", 305 "%10s => InodeSlot: %d StolenInodes: %d, "
306 "MetaSlot: %d StolenMeta: %d\n", "Steal",
305 osb->s_inode_steal_slot, 307 osb->s_inode_steal_slot,
306 atomic_read(&osb->s_num_inodes_stolen)); 308 atomic_read(&osb->s_num_inodes_stolen),
309 osb->s_meta_steal_slot,
310 atomic_read(&osb->s_num_meta_stolen));
307 spin_unlock(&osb->osb_lock); 311 spin_unlock(&osb->osb_lock);
308 312
309 out += snprintf(buf + out, len - out, "OrphanScan => "); 313 out += snprintf(buf + out, len - out, "OrphanScan => ");
@@ -1997,7 +2001,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1997 osb->blocked_lock_count = 0; 2001 osb->blocked_lock_count = 0;
1998 spin_lock_init(&osb->osb_lock); 2002 spin_lock_init(&osb->osb_lock);
1999 spin_lock_init(&osb->osb_xattr_lock); 2003 spin_lock_init(&osb->osb_xattr_lock);
2000 ocfs2_init_inode_steal_slot(osb); 2004 ocfs2_init_steal_slots(osb);
2001 2005
2002 atomic_set(&osb->alloc_stats.moves, 0); 2006 atomic_set(&osb->alloc_stats.moves, 0);
2003 atomic_set(&osb->alloc_stats.local_data, 0); 2007 atomic_set(&osb->alloc_stats.local_data, 0);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8fc6fb071c6d..d1b0d386f6d1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -116,10 +116,11 @@ static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
116}; 116};
117 117
118struct ocfs2_xattr_info { 118struct ocfs2_xattr_info {
119 int name_index; 119 int xi_name_index;
120 const char *name; 120 const char *xi_name;
121 const void *value; 121 int xi_name_len;
122 size_t value_len; 122 const void *xi_value;
123 size_t xi_value_len;
123}; 124};
124 125
125struct ocfs2_xattr_search { 126struct ocfs2_xattr_search {
@@ -137,6 +138,115 @@ struct ocfs2_xattr_search {
137 int not_found; 138 int not_found;
138}; 139};
139 140
141/* Operations on struct ocfs2_xa_entry */
142struct ocfs2_xa_loc;
143struct ocfs2_xa_loc_operations {
144 /*
145 * Journal functions
146 */
147 int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
148 int type);
149 void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
150
151 /*
152 * Return a pointer to the appropriate buffer in loc->xl_storage
153 * at the given offset from loc->xl_header.
154 */
155 void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
156
157 /* Can we reuse the existing entry for the new value? */
158 int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
159 struct ocfs2_xattr_info *xi);
160
161 /* How much space is needed for the new value? */
162 int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
163 struct ocfs2_xattr_info *xi);
164
165 /*
166 * Return the offset of the first name+value pair. This is
167 * the start of our downward-filling free space.
168 */
169 int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
170
171 /*
172 * Remove the name+value at this location. Do whatever is
173 * appropriate with the remaining name+value pairs.
174 */
175 void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
176
177 /* Fill xl_entry with a new entry */
178 void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
179
180 /* Add name+value storage to an entry */
181 void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
182
183 /*
184 * Initialize the value buf's access and bh fields for this entry.
185 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
186 */
187 void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
188 struct ocfs2_xattr_value_buf *vb);
189};
190
191/*
192 * Describes an xattr entry location. This is a memory structure
193 * tracking the on-disk structure.
194 */
195struct ocfs2_xa_loc {
196 /* This xattr belongs to this inode */
197 struct inode *xl_inode;
198
199 /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
200 struct ocfs2_xattr_header *xl_header;
201
202 /* Bytes from xl_header to the end of the storage */
203 int xl_size;
204
205 /*
206 * The ocfs2_xattr_entry this location describes. If this is
207 * NULL, this location describes the on-disk structure where it
208 * would have been.
209 */
210 struct ocfs2_xattr_entry *xl_entry;
211
212 /*
213 * Internal housekeeping
214 */
215
216 /* Buffer(s) containing this entry */
217 void *xl_storage;
218
219 /* Operations on the storage backing this location */
220 const struct ocfs2_xa_loc_operations *xl_ops;
221};
222
223/*
224 * Convenience functions to calculate how much space is needed for a
225 * given name+value pair
226 */
227static int namevalue_size(int name_len, uint64_t value_len)
228{
229 if (value_len > OCFS2_XATTR_INLINE_SIZE)
230 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
231 else
232 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
233}
234
235static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
236{
237 return namevalue_size(xi->xi_name_len, xi->xi_value_len);
238}
239
240static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
241{
242 u64 value_len = le64_to_cpu(xe->xe_value_size);
243
244 BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
245 ocfs2_xattr_is_local(xe));
246 return namevalue_size(xe->xe_name_len, value_len);
247}
248
249
140static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, 250static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
141 struct ocfs2_xattr_header *xh, 251 struct ocfs2_xattr_header *xh,
142 int index, 252 int index,
@@ -212,14 +322,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
212 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); 322 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
213} 323}
214 324
215static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
216{
217 u16 len = sb->s_blocksize -
218 offsetof(struct ocfs2_xattr_header, xh_entries);
219
220 return len / sizeof(struct ocfs2_xattr_entry);
221}
222
223#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) 325#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
224#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) 326#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
225#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) 327#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
@@ -463,35 +565,22 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
463 return hash; 565 return hash;
464} 566}
465 567
466/* 568static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
467 * ocfs2_xattr_hash_entry()
468 *
469 * Compute the hash of an extended attribute.
470 */
471static void ocfs2_xattr_hash_entry(struct inode *inode,
472 struct ocfs2_xattr_header *header,
473 struct ocfs2_xattr_entry *entry)
474{ 569{
475 u32 hash = 0; 570 return namevalue_size(name_len, value_len) +
476 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); 571 sizeof(struct ocfs2_xattr_entry);
477
478 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
479 entry->xe_name_hash = cpu_to_le32(hash);
480
481 return;
482} 572}
483 573
484static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) 574static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
485{ 575{
486 int size = 0; 576 return namevalue_size_xi(xi) +
487 577 sizeof(struct ocfs2_xattr_entry);
488 if (value_len <= OCFS2_XATTR_INLINE_SIZE) 578}
489 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
490 else
491 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
492 size += sizeof(struct ocfs2_xattr_entry);
493 579
494 return size; 580static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
581{
582 return namevalue_size_xe(xe) +
583 sizeof(struct ocfs2_xattr_entry);
495} 584}
496 585
497int ocfs2_calc_security_init(struct inode *dir, 586int ocfs2_calc_security_init(struct inode *dir,
@@ -1308,452 +1397,897 @@ out:
1308 return ret; 1397 return ret;
1309} 1398}
1310 1399
1311static int ocfs2_xattr_cleanup(struct inode *inode, 1400static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
1312 handle_t *handle, 1401 int num_entries)
1313 struct ocfs2_xattr_info *xi,
1314 struct ocfs2_xattr_search *xs,
1315 struct ocfs2_xattr_value_buf *vb,
1316 size_t offs)
1317{ 1402{
1318 int ret = 0; 1403 int free_space;
1319 size_t name_len = strlen(xi->name);
1320 void *val = xs->base + offs;
1321 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1322 1404
1323 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1405 if (!needed_space)
1324 OCFS2_JOURNAL_ACCESS_WRITE); 1406 return 0;
1325 if (ret) {
1326 mlog_errno(ret);
1327 goto out;
1328 }
1329 /* Decrease xattr count */
1330 le16_add_cpu(&xs->header->xh_count, -1);
1331 /* Remove the xattr entry and tree root which has already be set*/
1332 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
1333 memset(val, 0, size);
1334 1407
1335 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1408 free_space = free_start -
1336 if (ret < 0) 1409 sizeof(struct ocfs2_xattr_header) -
1337 mlog_errno(ret); 1410 (num_entries * sizeof(struct ocfs2_xattr_entry)) -
1338out: 1411 OCFS2_XATTR_HEADER_GAP;
1339 return ret; 1412 if (free_space < 0)
1413 return -EIO;
1414 if (free_space < needed_space)
1415 return -ENOSPC;
1416
1417 return 0;
1340} 1418}
1341 1419
1342static int ocfs2_xattr_update_entry(struct inode *inode, 1420static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
1343 handle_t *handle, 1421 int type)
1344 struct ocfs2_xattr_info *xi,
1345 struct ocfs2_xattr_search *xs,
1346 struct ocfs2_xattr_value_buf *vb,
1347 size_t offs)
1348{ 1422{
1349 int ret; 1423 return loc->xl_ops->xlo_journal_access(handle, loc, type);
1424}
1350 1425
1351 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1426static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
1352 OCFS2_JOURNAL_ACCESS_WRITE); 1427{
1353 if (ret) { 1428 loc->xl_ops->xlo_journal_dirty(handle, loc);
1354 mlog_errno(ret); 1429}
1355 goto out;
1356 }
1357 1430
1358 xs->here->xe_name_offset = cpu_to_le16(offs); 1431/* Give a pointer into the storage for the given offset */
1359 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1432static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
1360 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) 1433{
1361 ocfs2_xattr_set_local(xs->here, 1); 1434 BUG_ON(offset >= loc->xl_size);
1362 else 1435 return loc->xl_ops->xlo_offset_pointer(loc, offset);
1363 ocfs2_xattr_set_local(xs->here, 0); 1436}
1364 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1365 1437
1366 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1438/*
1367 if (ret < 0) 1439 * Wipe the name+value pair and allow the storage to reclaim it. This
1368 mlog_errno(ret); 1440 * must be followed by either removal of the entry or a call to
1369out: 1441 * ocfs2_xa_add_namevalue().
1370 return ret; 1442 */
1443static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
1444{
1445 loc->xl_ops->xlo_wipe_namevalue(loc);
1371} 1446}
1372 1447
1373/* 1448/*
1374 * ocfs2_xattr_set_value_outside() 1449 * Find lowest offset to a name+value pair. This is the start of our
1375 * 1450 * downward-growing free space.
1376 * Set large size value in B tree.
1377 */ 1451 */
1378static int ocfs2_xattr_set_value_outside(struct inode *inode, 1452static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
1379 struct ocfs2_xattr_info *xi,
1380 struct ocfs2_xattr_search *xs,
1381 struct ocfs2_xattr_set_ctxt *ctxt,
1382 struct ocfs2_xattr_value_buf *vb,
1383 size_t offs)
1384{ 1453{
1385 size_t name_len = strlen(xi->name); 1454 return loc->xl_ops->xlo_get_free_start(loc);
1386 void *val = xs->base + offs; 1455}
1387 struct ocfs2_xattr_value_root *xv = NULL;
1388 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1389 int ret = 0;
1390 1456
1391 memset(val, 0, size); 1457/* Can we reuse loc->xl_entry for xi? */
1392 memcpy(val, xi->name, name_len); 1458static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
1393 xv = (struct ocfs2_xattr_value_root *) 1459 struct ocfs2_xattr_info *xi)
1394 (val + OCFS2_XATTR_SIZE(name_len)); 1460{
1395 xv->xr_clusters = 0; 1461 return loc->xl_ops->xlo_can_reuse(loc, xi);
1396 xv->xr_last_eb_blk = 0; 1462}
1397 xv->xr_list.l_tree_depth = 0; 1463
1398 xv->xr_list.l_count = cpu_to_le16(1); 1464/* How much free space is needed to set the new value */
1399 xv->xr_list.l_next_free_rec = 0; 1465static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
1400 vb->vb_xv = xv; 1466 struct ocfs2_xattr_info *xi)
1401 1467{
1402 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); 1468 return loc->xl_ops->xlo_check_space(loc, xi);
1403 if (ret < 0) { 1469}
1404 mlog_errno(ret); 1470
1405 return ret; 1471static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1472{
1473 loc->xl_ops->xlo_add_entry(loc, name_hash);
1474 loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
1475 /*
1476 * We can't leave the new entry's xe_name_offset at zero or
1477 * add_namevalue() will go nuts. We set it to the size of our
1478 * storage so that it can never be less than any other entry.
1479 */
1480 loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
1481}
1482
1483static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
1484 struct ocfs2_xattr_info *xi)
1485{
1486 int size = namevalue_size_xi(xi);
1487 int nameval_offset;
1488 char *nameval_buf;
1489
1490 loc->xl_ops->xlo_add_namevalue(loc, size);
1491 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1492 loc->xl_entry->xe_name_len = xi->xi_name_len;
1493 ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
1494 ocfs2_xattr_set_local(loc->xl_entry,
1495 xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
1496
1497 nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1498 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
1499 memset(nameval_buf, 0, size);
1500 memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
1501}
1502
1503static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
1504 struct ocfs2_xattr_value_buf *vb)
1505{
1506 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1507 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1508
1509 /* Value bufs are for value trees */
1510 BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
1511 BUG_ON(namevalue_size_xe(loc->xl_entry) !=
1512 (name_size + OCFS2_XATTR_ROOT_SIZE));
1513
1514 loc->xl_ops->xlo_fill_value_buf(loc, vb);
1515 vb->vb_xv =
1516 (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
1517 nameval_offset +
1518 name_size);
1519}
1520
1521static int ocfs2_xa_block_journal_access(handle_t *handle,
1522 struct ocfs2_xa_loc *loc, int type)
1523{
1524 struct buffer_head *bh = loc->xl_storage;
1525 ocfs2_journal_access_func access;
1526
1527 if (loc->xl_size == (bh->b_size -
1528 offsetof(struct ocfs2_xattr_block,
1529 xb_attrs.xb_header)))
1530 access = ocfs2_journal_access_xb;
1531 else
1532 access = ocfs2_journal_access_di;
1533 return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
1534}
1535
1536static void ocfs2_xa_block_journal_dirty(handle_t *handle,
1537 struct ocfs2_xa_loc *loc)
1538{
1539 struct buffer_head *bh = loc->xl_storage;
1540
1541 ocfs2_journal_dirty(handle, bh);
1542}
1543
1544static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
1545 int offset)
1546{
1547 return (char *)loc->xl_header + offset;
1548}
1549
1550static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
1551 struct ocfs2_xattr_info *xi)
1552{
1553 /*
1554 * Block storage is strict. If the sizes aren't exact, we will
1555 * remove the old one and reinsert the new.
1556 */
1557 return namevalue_size_xe(loc->xl_entry) ==
1558 namevalue_size_xi(xi);
1559}
1560
1561static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
1562{
1563 struct ocfs2_xattr_header *xh = loc->xl_header;
1564 int i, count = le16_to_cpu(xh->xh_count);
1565 int offset, free_start = loc->xl_size;
1566
1567 for (i = 0; i < count; i++) {
1568 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1569 if (offset < free_start)
1570 free_start = offset;
1406 } 1571 }
1407 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); 1572
1408 if (ret < 0) { 1573 return free_start;
1409 mlog_errno(ret); 1574}
1410 return ret; 1575
1576static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
1577 struct ocfs2_xattr_info *xi)
1578{
1579 int count = le16_to_cpu(loc->xl_header->xh_count);
1580 int free_start = ocfs2_xa_get_free_start(loc);
1581 int needed_space = ocfs2_xi_entry_usage(xi);
1582
1583 /*
1584 * Block storage will reclaim the original entry before inserting
1585 * the new value, so we only need the difference. If the new
1586 * entry is smaller than the old one, we don't need anything.
1587 */
1588 if (loc->xl_entry) {
1589 /* Don't need space if we're reusing! */
1590 if (ocfs2_xa_can_reuse_entry(loc, xi))
1591 needed_space = 0;
1592 else
1593 needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
1411 } 1594 }
1412 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, 1595 if (needed_space < 0)
1413 xi->value, xi->value_len); 1596 needed_space = 0;
1414 if (ret < 0) 1597 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1415 mlog_errno(ret); 1598}
1416 1599
1417 return ret; 1600/*
1601 * Block storage for xattrs keeps the name+value pairs compacted. When
1602 * we remove one, we have to shift any that preceded it towards the end.
1603 */
1604static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1605{
1606 int i, offset;
1607 int namevalue_offset, first_namevalue_offset, namevalue_size;
1608 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1609 struct ocfs2_xattr_header *xh = loc->xl_header;
1610 int count = le16_to_cpu(xh->xh_count);
1611
1612 namevalue_offset = le16_to_cpu(entry->xe_name_offset);
1613 namevalue_size = namevalue_size_xe(entry);
1614 first_namevalue_offset = ocfs2_xa_get_free_start(loc);
1615
1616 /* Shift the name+value pairs */
1617 memmove((char *)xh + first_namevalue_offset + namevalue_size,
1618 (char *)xh + first_namevalue_offset,
1619 namevalue_offset - first_namevalue_offset);
1620 memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
1621
1622 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset < namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size);
1628 }
1629
1630 /*
1631 * Note that we don't update xh_free_start or xh_name_value_len
1632 * because they're not used in block-stored xattrs.
1633 */
1634}
1635
1636static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1637{
1638 int count = le16_to_cpu(loc->xl_header->xh_count);
1639 loc->xl_entry = &(loc->xl_header->xh_entries[count]);
1640 le16_add_cpu(&loc->xl_header->xh_count, 1);
1641 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1642}
1643
1644static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1645{
1646 int free_start = ocfs2_xa_get_free_start(loc);
1647
1648 loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
1649}
1650
1651static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
1652 struct ocfs2_xattr_value_buf *vb)
1653{
1654 struct buffer_head *bh = loc->xl_storage;
1655
1656 if (loc->xl_size == (bh->b_size -
1657 offsetof(struct ocfs2_xattr_block,
1658 xb_attrs.xb_header)))
1659 vb->vb_access = ocfs2_journal_access_xb;
1660 else
1661 vb->vb_access = ocfs2_journal_access_di;
1662 vb->vb_bh = bh;
1418} 1663}
1419 1664
1420/* 1665/*
1421 * ocfs2_xattr_set_entry_local() 1666 * Operations for xattrs stored in blocks. This includes inline inode
1422 * 1667 * storage and unindexed ocfs2_xattr_blocks.
1423 * Set, replace or remove extended attribute in local.
1424 */ 1668 */
1425static void ocfs2_xattr_set_entry_local(struct inode *inode, 1669static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
1426 struct ocfs2_xattr_info *xi, 1670 .xlo_journal_access = ocfs2_xa_block_journal_access,
1427 struct ocfs2_xattr_search *xs, 1671 .xlo_journal_dirty = ocfs2_xa_block_journal_dirty,
1428 struct ocfs2_xattr_entry *last, 1672 .xlo_offset_pointer = ocfs2_xa_block_offset_pointer,
1429 size_t min_offs) 1673 .xlo_check_space = ocfs2_xa_block_check_space,
1674 .xlo_can_reuse = ocfs2_xa_block_can_reuse,
1675 .xlo_get_free_start = ocfs2_xa_block_get_free_start,
1676 .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue,
1677 .xlo_add_entry = ocfs2_xa_block_add_entry,
1678 .xlo_add_namevalue = ocfs2_xa_block_add_namevalue,
1679 .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf,
1680};
1681
1682static int ocfs2_xa_bucket_journal_access(handle_t *handle,
1683 struct ocfs2_xa_loc *loc, int type)
1430{ 1684{
1431 size_t name_len = strlen(xi->name); 1685 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1432 int i;
1433 1686
1434 if (xi->value && xs->not_found) { 1687 return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
1435 /* Insert the new xattr entry. */ 1688}
1436 le16_add_cpu(&xs->header->xh_count, 1); 1689
1437 ocfs2_xattr_set_type(last, xi->name_index); 1690static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
1438 ocfs2_xattr_set_local(last, 1); 1691 struct ocfs2_xa_loc *loc)
1439 last->xe_name_len = name_len; 1692{
1440 } else { 1693 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1441 void *first_val; 1694
1442 void *val; 1695 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
1443 size_t offs, size; 1696}
1444 1697
1445 first_val = xs->base + min_offs; 1698static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
1446 offs = le16_to_cpu(xs->here->xe_name_offset); 1699 int offset)
1447 val = xs->base + offs; 1700{
1448 1701 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1449 if (le64_to_cpu(xs->here->xe_value_size) > 1702 int block, block_offset;
1450 OCFS2_XATTR_INLINE_SIZE) 1703
1451 size = OCFS2_XATTR_SIZE(name_len) + 1704 /* The header is at the front of the bucket */
1452 OCFS2_XATTR_ROOT_SIZE; 1705 block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
1706 block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
1707
1708 return bucket_block(bucket, block) + block_offset;
1709}
1710
1711static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
1712 struct ocfs2_xattr_info *xi)
1713{
1714 return namevalue_size_xe(loc->xl_entry) >=
1715 namevalue_size_xi(xi);
1716}
1717
1718static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
1719{
1720 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1721 return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
1722}
1723
1724static int ocfs2_bucket_align_free_start(struct super_block *sb,
1725 int free_start, int size)
1726{
1727 /*
1728 * We need to make sure that the name+value pair fits within
1729 * one block.
1730 */
1731 if (((free_start - size) >> sb->s_blocksize_bits) !=
1732 ((free_start - 1) >> sb->s_blocksize_bits))
1733 free_start -= free_start % sb->s_blocksize;
1734
1735 return free_start;
1736}
1737
1738static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
1739 struct ocfs2_xattr_info *xi)
1740{
1741 int rc;
1742 int count = le16_to_cpu(loc->xl_header->xh_count);
1743 int free_start = ocfs2_xa_get_free_start(loc);
1744 int needed_space = ocfs2_xi_entry_usage(xi);
1745 int size = namevalue_size_xi(xi);
1746 struct super_block *sb = loc->xl_inode->i_sb;
1747
1748 /*
1749 * Bucket storage does not reclaim name+value pairs it cannot
1750 * reuse. They live as holes until the bucket fills, and then
1751 * the bucket is defragmented. However, the bucket can reclaim
1752 * the ocfs2_xattr_entry.
1753 */
1754 if (loc->xl_entry) {
1755 /* Don't need space if we're reusing! */
1756 if (ocfs2_xa_can_reuse_entry(loc, xi))
1757 needed_space = 0;
1453 else 1758 else
1454 size = OCFS2_XATTR_SIZE(name_len) + 1759 needed_space -= sizeof(struct ocfs2_xattr_entry);
1455 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 1760 }
1456 1761 BUG_ON(needed_space < 0);
1457 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1458 OCFS2_XATTR_SIZE(xi->value_len)) {
1459 /* The old and the new value have the
1460 same size. Just replace the value. */
1461 ocfs2_xattr_set_local(xs->here, 1);
1462 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1463 /* Clear value bytes. */
1464 memset(val + OCFS2_XATTR_SIZE(name_len),
1465 0,
1466 OCFS2_XATTR_SIZE(xi->value_len));
1467 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1468 xi->value,
1469 xi->value_len);
1470 return;
1471 }
1472 /* Remove the old name+value. */
1473 memmove(first_val + size, first_val, val - first_val);
1474 memset(first_val, 0, size);
1475 xs->here->xe_name_hash = 0;
1476 xs->here->xe_name_offset = 0;
1477 ocfs2_xattr_set_local(xs->here, 1);
1478 xs->here->xe_value_size = 0;
1479
1480 min_offs += size;
1481
1482 /* Adjust all value offsets. */
1483 last = xs->header->xh_entries;
1484 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1485 size_t o = le16_to_cpu(last->xe_name_offset);
1486
1487 if (o < offs)
1488 last->xe_name_offset = cpu_to_le16(o + size);
1489 last += 1;
1490 }
1491 1762
1492 if (!xi->value) { 1763 if (free_start < size) {
1493 /* Remove the old entry. */ 1764 if (needed_space)
1494 last -= 1; 1765 return -ENOSPC;
1495 memmove(xs->here, xs->here + 1, 1766 } else {
1496 (void *)last - (void *)xs->here); 1767 /*
1497 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 1768 * First we check if it would fit in the first place.
1498 le16_add_cpu(&xs->header->xh_count, -1); 1769 * Below, we align the free start to a block. This may
1499 } 1770 * slide us below the minimum gap. By checking unaligned
1771 * first, we avoid that error.
1772 */
1773 rc = ocfs2_xa_check_space_helper(needed_space, free_start,
1774 count);
1775 if (rc)
1776 return rc;
1777 free_start = ocfs2_bucket_align_free_start(sb, free_start,
1778 size);
1500 } 1779 }
1501 if (xi->value) { 1780 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1502 /* Insert the new name+value. */ 1781}
1503 size_t size = OCFS2_XATTR_SIZE(name_len) + 1782
1504 OCFS2_XATTR_SIZE(xi->value_len); 1783static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
1505 void *val = xs->base + min_offs - size; 1784{
1785 le16_add_cpu(&loc->xl_header->xh_name_value_len,
1786 -namevalue_size_xe(loc->xl_entry));
1787}
1506 1788
1507 xs->here->xe_name_offset = cpu_to_le16(min_offs - size); 1789static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1508 memset(val, 0, size); 1790{
1509 memcpy(val, xi->name, name_len); 1791 struct ocfs2_xattr_header *xh = loc->xl_header;
1510 memcpy(val + OCFS2_XATTR_SIZE(name_len), 1792 int count = le16_to_cpu(xh->xh_count);
1511 xi->value, 1793 int low = 0, high = count - 1, tmp;
1512 xi->value_len); 1794 struct ocfs2_xattr_entry *tmp_xe;
1513 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1795
1514 ocfs2_xattr_set_local(xs->here, 1); 1796 /*
1515 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1797 * We keep buckets sorted by name_hash, so we need to find
1798 * our insert place.
1799 */
1800 while (low <= high && count) {
1801 tmp = (low + high) / 2;
1802 tmp_xe = &xh->xh_entries[tmp];
1803
1804 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
1805 low = tmp + 1;
1806 else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
1807 high = tmp - 1;
1808 else {
1809 low = tmp;
1810 break;
1811 }
1516 } 1812 }
1517 1813
1518 return; 1814 if (low != count)
1815 memmove(&xh->xh_entries[low + 1],
1816 &xh->xh_entries[low],
1817 ((count - low) * sizeof(struct ocfs2_xattr_entry)));
1818
1819 le16_add_cpu(&xh->xh_count, 1);
1820 loc->xl_entry = &xh->xh_entries[low];
1821 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1822}
1823
1824static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1825{
1826 int free_start = ocfs2_xa_get_free_start(loc);
1827 struct ocfs2_xattr_header *xh = loc->xl_header;
1828 struct super_block *sb = loc->xl_inode->i_sb;
1829 int nameval_offset;
1830
1831 free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
1832 nameval_offset = free_start - size;
1833 loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
1834 xh->xh_free_start = cpu_to_le16(nameval_offset);
1835 le16_add_cpu(&xh->xh_name_value_len, size);
1836
1837}
1838
1839static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
1840 struct ocfs2_xattr_value_buf *vb)
1841{
1842 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1843 struct super_block *sb = loc->xl_inode->i_sb;
1844 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1845 int size = namevalue_size_xe(loc->xl_entry);
1846 int block_offset = nameval_offset >> sb->s_blocksize_bits;
1847
1848 /* Values are not allowed to straddle block boundaries */
1849 BUG_ON(block_offset !=
1850 ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
1851 /* We expect the bucket to be filled in */
1852 BUG_ON(!bucket->bu_bhs[block_offset]);
1853
1854 vb->vb_access = ocfs2_journal_access;
1855 vb->vb_bh = bucket->bu_bhs[block_offset];
1856}
1857
1858/* Operations for xattrs stored in buckets. */
1859static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
1860 .xlo_journal_access = ocfs2_xa_bucket_journal_access,
1861 .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty,
1862 .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer,
1863 .xlo_check_space = ocfs2_xa_bucket_check_space,
1864 .xlo_can_reuse = ocfs2_xa_bucket_can_reuse,
1865 .xlo_get_free_start = ocfs2_xa_bucket_get_free_start,
1866 .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue,
1867 .xlo_add_entry = ocfs2_xa_bucket_add_entry,
1868 .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue,
1869 .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf,
1870};
1871
1872static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
1873{
1874 struct ocfs2_xattr_value_buf vb;
1875
1876 if (ocfs2_xattr_is_local(loc->xl_entry))
1877 return 0;
1878
1879 ocfs2_xa_fill_value_buf(loc, &vb);
1880 return le32_to_cpu(vb.vb_xv->xr_clusters);
1881}
1882
1883static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
1884 struct ocfs2_xattr_set_ctxt *ctxt)
1885{
1886 int trunc_rc, access_rc;
1887 struct ocfs2_xattr_value_buf vb;
1888
1889 ocfs2_xa_fill_value_buf(loc, &vb);
1890 trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
1891 ctxt);
1892
1893 /*
1894 * The caller of ocfs2_xa_value_truncate() has already called
1895 * ocfs2_xa_journal_access on the loc. However, The truncate code
1896 * calls ocfs2_extend_trans(). This may commit the previous
1897 * transaction and open a new one. If this is a bucket, truncate
1898 * could leave only vb->vb_bh set up for journaling. Meanwhile,
1899 * the caller is expecting to dirty the entire bucket. So we must
1900 * reset the journal work. We do this even if truncate has failed,
1901 * as it could have failed after committing the extend.
1902 */
1903 access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
1904 OCFS2_JOURNAL_ACCESS_WRITE);
1905
1906 /* Errors in truncate take precedence */
1907 return trunc_rc ? trunc_rc : access_rc;
1908}
1909
1910static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
1911{
1912 int index, count;
1913 struct ocfs2_xattr_header *xh = loc->xl_header;
1914 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1915
1916 ocfs2_xa_wipe_namevalue(loc);
1917 loc->xl_entry = NULL;
1918
1919 le16_add_cpu(&xh->xh_count, -1);
1920 count = le16_to_cpu(xh->xh_count);
1921
1922 /*
1923 * Only zero out the entry if there are more remaining. This is
1924 * important for an empty bucket, as it keeps track of the
1925 * bucket's hash value. It doesn't hurt empty block storage.
1926 */
1927 if (count) {
1928 index = ((char *)entry - (char *)&xh->xh_entries) /
1929 sizeof(struct ocfs2_xattr_entry);
1930 memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
1931 (count - index) * sizeof(struct ocfs2_xattr_entry));
1932 memset(&xh->xh_entries[count], 0,
1933 sizeof(struct ocfs2_xattr_entry));
1934 }
1519} 1935}
1520 1936
1521/* 1937/*
1522 * ocfs2_xattr_set_entry() 1938 * If we have a problem adjusting the size of an external value during
1939 * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
1940 * in an intermediate state. For example, the value may be partially
1941 * truncated.
1942 *
1943 * If the value tree hasn't changed, the extend/truncate went nowhere.
1944 * We have nothing to do. The caller can treat it as a straight error.
1523 * 1945 *
1524 * Set extended attribute entry into inode or block. 1946 * If the value tree got partially truncated, we now have a corrupted
1947 * extended attribute. We're going to wipe its entry and leak the
1948 * clusters. Better to leak some storage than leave a corrupt entry.
1525 * 1949 *
1526 * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, 1950 * If the value tree grew, it obviously didn't grow enough for the
1527 * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(), 1951 * new entry. We're not going to try and reclaim those clusters either.
1528 * then set value in B tree with set_value_outside(). 1952 * If there was already an external value there (orig_clusters != 0),
1953 * the new clusters are attached safely and we can just leave the old
1954 * value in place. If there was no external value there, we remove
1955 * the entry.
1956 *
1957 * This way, the xattr block we store in the journal will be consistent.
1958 * If the size change broke because of the journal, no changes will hit
1959 * disk anyway.
1529 */ 1960 */
1530static int ocfs2_xattr_set_entry(struct inode *inode, 1961static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
1531 struct ocfs2_xattr_info *xi, 1962 const char *what,
1532 struct ocfs2_xattr_search *xs, 1963 unsigned int orig_clusters)
1533 struct ocfs2_xattr_set_ctxt *ctxt, 1964{
1534 int flag) 1965 unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
1535{ 1966 char *nameval_buf = ocfs2_xa_offset_pointer(loc,
1536 struct ocfs2_xattr_entry *last; 1967 le16_to_cpu(loc->xl_entry->xe_name_offset));
1537 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1968
1538 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1969 if (new_clusters < orig_clusters) {
1539 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1970 mlog(ML_ERROR,
1540 size_t size_l = 0; 1971 "Partial truncate while %s xattr %.*s. Leaking "
1541 handle_t *handle = ctxt->handle; 1972 "%u clusters and removing the entry\n",
1542 int free, i, ret; 1973 what, loc->xl_entry->xe_name_len, nameval_buf,
1543 struct ocfs2_xattr_info xi_l = { 1974 orig_clusters - new_clusters);
1544 .name_index = xi->name_index, 1975 ocfs2_xa_remove_entry(loc);
1545 .name = xi->name, 1976 } else if (!orig_clusters) {
1546 .value = xi->value, 1977 mlog(ML_ERROR,
1547 .value_len = xi->value_len, 1978 "Unable to allocate an external value for xattr "
1548 }; 1979 "%.*s safely. Leaking %u clusters and removing the "
1549 struct ocfs2_xattr_value_buf vb = { 1980 "entry\n",
1550 .vb_bh = xs->xattr_bh, 1981 loc->xl_entry->xe_name_len, nameval_buf,
1551 .vb_access = ocfs2_journal_access_di, 1982 new_clusters - orig_clusters);
1552 }; 1983 ocfs2_xa_remove_entry(loc);
1984 } else if (new_clusters > orig_clusters)
1985 mlog(ML_ERROR,
1986 "Unable to grow xattr %.*s safely. %u new clusters "
1987 "have been added, but the value will not be "
1988 "modified\n",
1989 loc->xl_entry->xe_name_len, nameval_buf,
1990 new_clusters - orig_clusters);
1991}
1992
1993static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
1994 struct ocfs2_xattr_set_ctxt *ctxt)
1995{
1996 int rc = 0;
1997 unsigned int orig_clusters;
1998
1999 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2000 orig_clusters = ocfs2_xa_value_clusters(loc);
2001 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2002 if (rc) {
2003 mlog_errno(rc);
2004 /*
2005 * Since this is remove, we can return 0 if
2006 * ocfs2_xa_cleanup_value_truncate() is going to
2007 * wipe the entry anyway. So we check the
2008 * cluster count as well.
2009 */
2010 if (orig_clusters != ocfs2_xa_value_clusters(loc))
2011 rc = 0;
2012 ocfs2_xa_cleanup_value_truncate(loc, "removing",
2013 orig_clusters);
2014 if (rc)
2015 goto out;
2016 }
2017 }
1553 2018
1554 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2019 ocfs2_xa_remove_entry(loc);
1555 BUG_ON(xs->xattr_bh == xs->inode_bh);
1556 vb.vb_access = ocfs2_journal_access_xb;
1557 } else
1558 BUG_ON(xs->xattr_bh != xs->inode_bh);
1559 2020
1560 /* Compute min_offs, last and free space. */ 2021out:
1561 last = xs->header->xh_entries; 2022 return rc;
2023}
1562 2024
1563 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { 2025static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
1564 size_t offs = le16_to_cpu(last->xe_name_offset); 2026{
1565 if (offs < min_offs) 2027 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1566 min_offs = offs; 2028 char *nameval_buf;
1567 last += 1;
1568 }
1569 2029
1570 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; 2030 nameval_buf = ocfs2_xa_offset_pointer(loc,
1571 if (free < 0) 2031 le16_to_cpu(loc->xl_entry->xe_name_offset));
1572 return -EIO; 2032 memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
2033}
1573 2034
1574 if (!xs->not_found) { 2035/*
1575 size_t size = 0; 2036 * Take an existing entry and make it ready for the new value. This
1576 if (ocfs2_xattr_is_local(xs->here)) 2037 * won't allocate space, but it may free space. It should be ready for
1577 size = OCFS2_XATTR_SIZE(name_len) + 2038 * ocfs2_xa_prepare_entry() to finish the work.
1578 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 2039 */
1579 else 2040static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
1580 size = OCFS2_XATTR_SIZE(name_len) + 2041 struct ocfs2_xattr_info *xi,
1581 OCFS2_XATTR_ROOT_SIZE; 2042 struct ocfs2_xattr_set_ctxt *ctxt)
1582 free += (size + sizeof(struct ocfs2_xattr_entry)); 2043{
1583 } 2044 int rc = 0;
1584 /* Check free space in inode or block */ 2045 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
1585 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2046 unsigned int orig_clusters;
1586 if (free < sizeof(struct ocfs2_xattr_entry) + 2047 char *nameval_buf;
1587 OCFS2_XATTR_SIZE(name_len) + 2048 int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
1588 OCFS2_XATTR_ROOT_SIZE) { 2049 int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
1589 ret = -ENOSPC; 2050
1590 goto out; 2051 BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
2052 name_size);
2053
2054 nameval_buf = ocfs2_xa_offset_pointer(loc,
2055 le16_to_cpu(loc->xl_entry->xe_name_offset));
2056 if (xe_local) {
2057 memset(nameval_buf + name_size, 0,
2058 namevalue_size_xe(loc->xl_entry) - name_size);
2059 if (!xi_local)
2060 ocfs2_xa_install_value_root(loc);
2061 } else {
2062 orig_clusters = ocfs2_xa_value_clusters(loc);
2063 if (xi_local) {
2064 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2065 if (rc < 0)
2066 mlog_errno(rc);
2067 else
2068 memset(nameval_buf + name_size, 0,
2069 namevalue_size_xe(loc->xl_entry) -
2070 name_size);
2071 } else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
2072 xi->xi_value_len) {
2073 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
2074 ctxt);
2075 if (rc < 0)
2076 mlog_errno(rc);
1591 } 2077 }
1592 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 2078
1593 xi_l.value = (void *)&def_xv; 2079 if (rc) {
1594 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; 2080 ocfs2_xa_cleanup_value_truncate(loc, "reusing",
1595 } else if (xi->value) { 2081 orig_clusters);
1596 if (free < sizeof(struct ocfs2_xattr_entry) +
1597 OCFS2_XATTR_SIZE(name_len) +
1598 OCFS2_XATTR_SIZE(xi->value_len)) {
1599 ret = -ENOSPC;
1600 goto out; 2082 goto out;
1601 } 2083 }
1602 } 2084 }
1603 2085
1604 if (!xs->not_found) { 2086 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1605 /* For existing extended attribute */ 2087 ocfs2_xattr_set_local(loc->xl_entry, xi_local);
1606 size_t size = OCFS2_XATTR_SIZE(name_len) +
1607 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1608 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1609 void *val = xs->base + offs;
1610 2088
1611 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 2089out:
1612 /* Replace existing local xattr with tree root */ 2090 return rc;
1613 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 2091}
1614 ctxt, &vb, offs);
1615 if (ret < 0)
1616 mlog_errno(ret);
1617 goto out;
1618 } else if (!ocfs2_xattr_is_local(xs->here)) {
1619 /* For existing xattr which has value outside */
1620 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1621 (val + OCFS2_XATTR_SIZE(name_len));
1622 2092
1623 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2093/*
1624 /* 2094 * Prepares loc->xl_entry to receive the new xattr. This includes
1625 * If new value need set outside also, 2095 * properly setting up the name+value pair region. If loc->xl_entry
1626 * first truncate old value to new value, 2096 * already exists, it will take care of modifying it appropriately.
1627 * then set new value with set_value_outside(). 2097 *
1628 */ 2098 * Note that this modifies the data. You did journal_access already,
1629 ret = ocfs2_xattr_value_truncate(inode, 2099 * right?
1630 &vb, 2100 */
1631 xi->value_len, 2101static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
1632 ctxt); 2102 struct ocfs2_xattr_info *xi,
1633 if (ret < 0) { 2103 u32 name_hash,
1634 mlog_errno(ret); 2104 struct ocfs2_xattr_set_ctxt *ctxt)
1635 goto out; 2105{
1636 } 2106 int rc = 0;
2107 unsigned int orig_clusters;
2108 __le64 orig_value_size = 0;
1637 2109
1638 ret = ocfs2_xattr_update_entry(inode, 2110 rc = ocfs2_xa_check_space(loc, xi);
1639 handle, 2111 if (rc)
1640 xi, 2112 goto out;
1641 xs,
1642 &vb,
1643 offs);
1644 if (ret < 0) {
1645 mlog_errno(ret);
1646 goto out;
1647 }
1648 2113
1649 ret = __ocfs2_xattr_set_value_outside(inode, 2114 if (loc->xl_entry) {
1650 handle, 2115 if (ocfs2_xa_can_reuse_entry(loc, xi)) {
1651 &vb, 2116 orig_value_size = loc->xl_entry->xe_value_size;
1652 xi->value, 2117 rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
1653 xi->value_len); 2118 if (rc)
1654 if (ret < 0) 2119 goto out;
1655 mlog_errno(ret); 2120 goto alloc_value;
2121 }
2122
2123 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2124 orig_clusters = ocfs2_xa_value_clusters(loc);
2125 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2126 if (rc) {
2127 mlog_errno(rc);
2128 ocfs2_xa_cleanup_value_truncate(loc,
2129 "overwriting",
2130 orig_clusters);
1656 goto out; 2131 goto out;
1657 } else {
1658 /*
1659 * If new value need set in local,
1660 * just trucate old value to zero.
1661 */
1662 ret = ocfs2_xattr_value_truncate(inode,
1663 &vb,
1664 0,
1665 ctxt);
1666 if (ret < 0)
1667 mlog_errno(ret);
1668 } 2132 }
1669 } 2133 }
2134 ocfs2_xa_wipe_namevalue(loc);
2135 } else
2136 ocfs2_xa_add_entry(loc, name_hash);
2137
2138 /*
2139 * If we get here, we have a blank entry. Fill it. We grow our
2140 * name+value pair back from the end.
2141 */
2142 ocfs2_xa_add_namevalue(loc, xi);
2143 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2144 ocfs2_xa_install_value_root(loc);
2145
2146alloc_value:
2147 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2148 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) {
2151 /*
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanuP-value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters);
2160 mlog_errno(rc);
2161 }
1670 } 2162 }
1671 2163
1672 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, 2164out:
2165 return rc;
2166}
2167
2168/*
2169 * Store the value portion of the name+value pair. This will skip
2170 * values that are stored externally. Their tree roots were set up
2171 * by ocfs2_xa_prepare_entry().
2172 */
2173static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
2174 struct ocfs2_xattr_info *xi,
2175 struct ocfs2_xattr_set_ctxt *ctxt)
2176{
2177 int rc = 0;
2178 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
2179 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
2180 char *nameval_buf;
2181 struct ocfs2_xattr_value_buf vb;
2182
2183 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
2184 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2185 ocfs2_xa_fill_value_buf(loc, &vb);
2186 rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
2187 ctxt->handle, &vb,
2188 xi->xi_value,
2189 xi->xi_value_len);
2190 } else
2191 memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
2192
2193 return rc;
2194}
2195
2196static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
2197 struct ocfs2_xattr_info *xi,
2198 struct ocfs2_xattr_set_ctxt *ctxt)
2199{
2200 int ret;
2201 u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
2202 xi->xi_name_len);
2203
2204 ret = ocfs2_xa_journal_access(ctxt->handle, loc,
1673 OCFS2_JOURNAL_ACCESS_WRITE); 2205 OCFS2_JOURNAL_ACCESS_WRITE);
1674 if (ret) { 2206 if (ret) {
1675 mlog_errno(ret); 2207 mlog_errno(ret);
1676 goto out; 2208 goto out;
1677 } 2209 }
1678 2210
1679 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1680 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1681 OCFS2_JOURNAL_ACCESS_WRITE);
1682 if (ret) {
1683 mlog_errno(ret);
1684 goto out;
1685 }
1686 }
1687
1688 /* 2211 /*
1689 * Set value in local, include set tree root in local. 2212 * From here on out, everything is going to modify the buffer a
1690 * This is the first step for value size >INLINE_SIZE. 2213 * little. Errors are going to leave the xattr header in a
2214 * sane state. Thus, even with errors we dirty the sucker.
1691 */ 2215 */
1692 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1693 2216
1694 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2217 /* Don't worry, we are never called with !xi_value and !xl_entry */
1695 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 2218 if (!xi->xi_value) {
1696 if (ret < 0) { 2219 ret = ocfs2_xa_remove(loc, ctxt);
1697 mlog_errno(ret); 2220 goto out_dirty;
1698 goto out;
1699 }
1700 } 2221 }
1701 2222
1702 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && 2223 ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
1703 (flag & OCFS2_INLINE_XATTR_FL)) { 2224 if (ret) {
1704 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2225 if (ret != -ENOSPC)
1705 unsigned int xattrsize = osb->s_xattr_inline_size; 2226 mlog_errno(ret);
1706 2227 goto out_dirty;
1707 /*
1708 * Adjust extent record count or inline data size
1709 * to reserve space for extended attribute.
1710 */
1711 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1712 struct ocfs2_inline_data *idata = &di->id2.i_data;
1713 le16_add_cpu(&idata->id_count, -xattrsize);
1714 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1715 struct ocfs2_extent_list *el = &di->id2.i_list;
1716 le16_add_cpu(&el->l_count, -(xattrsize /
1717 sizeof(struct ocfs2_extent_rec)));
1718 }
1719 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1720 } 2228 }
1721 /* Update xattr flag */
1722 spin_lock(&oi->ip_lock);
1723 oi->ip_dyn_features |= flag;
1724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1725 spin_unlock(&oi->ip_lock);
1726 2229
1727 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2230 ret = ocfs2_xa_store_value(loc, xi, ctxt);
1728 if (ret < 0) 2231 if (ret)
1729 mlog_errno(ret); 2232 mlog_errno(ret);
1730 2233
1731 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2234out_dirty:
1732 /* 2235 ocfs2_xa_journal_dirty(ctxt->handle, loc);
1733 * Set value outside in B tree.
1734 * This is the second step for value size > INLINE_SIZE.
1735 */
1736 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1737 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1738 &vb, offs);
1739 if (ret < 0) {
1740 int ret2;
1741 2236
1742 mlog_errno(ret);
1743 /*
1744 * If set value outside failed, we have to clean
1745 * the junk tree root we have already set in local.
1746 */
1747 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1748 xi, xs, &vb, offs);
1749 if (ret2 < 0)
1750 mlog_errno(ret2);
1751 }
1752 }
1753out: 2237out:
1754 return ret; 2238 return ret;
1755} 2239}
1756 2240
2241static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
2242 struct inode *inode,
2243 struct buffer_head *bh,
2244 struct ocfs2_xattr_entry *entry)
2245{
2246 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2247
2248 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
2249
2250 loc->xl_inode = inode;
2251 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2252 loc->xl_storage = bh;
2253 loc->xl_entry = entry;
2254 loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
2255 loc->xl_header =
2256 (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
2257 loc->xl_size);
2258}
2259
2260static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
2261 struct inode *inode,
2262 struct buffer_head *bh,
2263 struct ocfs2_xattr_entry *entry)
2264{
2265 struct ocfs2_xattr_block *xb =
2266 (struct ocfs2_xattr_block *)bh->b_data;
2267
2268 BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
2269
2270 loc->xl_inode = inode;
2271 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2272 loc->xl_storage = bh;
2273 loc->xl_header = &(xb->xb_attrs.xb_header);
2274 loc->xl_entry = entry;
2275 loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
2276 xb_attrs.xb_header);
2277}
2278
2279static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
2280 struct ocfs2_xattr_bucket *bucket,
2281 struct ocfs2_xattr_entry *entry)
2282{
2283 loc->xl_inode = bucket->bu_inode;
2284 loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
2285 loc->xl_storage = bucket;
2286 loc->xl_header = bucket_xh(bucket);
2287 loc->xl_entry = entry;
2288 loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
2289}
2290
1757/* 2291/*
1758 * In xattr remove, if it is stored outside and refcounted, we may have 2292 * In xattr remove, if it is stored outside and refcounted, we may have
1759 * the chance to split the refcount tree. So need the allocators. 2293 * the chance to split the refcount tree. So need the allocators.
@@ -2149,6 +2683,55 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
2149 return 0; 2683 return 0;
2150} 2684}
2151 2685
2686static int ocfs2_xattr_ibody_init(struct inode *inode,
2687 struct buffer_head *di_bh,
2688 struct ocfs2_xattr_set_ctxt *ctxt)
2689{
2690 int ret;
2691 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2692 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2693 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2694 unsigned int xattrsize = osb->s_xattr_inline_size;
2695
2696 if (!ocfs2_xattr_has_space_inline(inode, di)) {
2697 ret = -ENOSPC;
2698 goto out;
2699 }
2700
2701 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
2702 OCFS2_JOURNAL_ACCESS_WRITE);
2703 if (ret) {
2704 mlog_errno(ret);
2705 goto out;
2706 }
2707
2708 /*
2709 * Adjust extent record count or inline data size
2710 * to reserve space for extended attribute.
2711 */
2712 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2713 struct ocfs2_inline_data *idata = &di->id2.i_data;
2714 le16_add_cpu(&idata->id_count, -xattrsize);
2715 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
2716 struct ocfs2_extent_list *el = &di->id2.i_list;
2717 le16_add_cpu(&el->l_count, -(xattrsize /
2718 sizeof(struct ocfs2_extent_rec)));
2719 }
2720 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
2721
2722 spin_lock(&oi->ip_lock);
2723 oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock);
2726
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730
2731out:
2732 return ret;
2733}
2734
2152/* 2735/*
2153 * ocfs2_xattr_ibody_set() 2736 * ocfs2_xattr_ibody_set()
2154 * 2737 *
@@ -2160,9 +2743,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2160 struct ocfs2_xattr_search *xs, 2743 struct ocfs2_xattr_search *xs,
2161 struct ocfs2_xattr_set_ctxt *ctxt) 2744 struct ocfs2_xattr_set_ctxt *ctxt)
2162{ 2745{
2746 int ret;
2163 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2747 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2164 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2748 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2165 int ret; 2749 struct ocfs2_xa_loc loc;
2166 2750
2167 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) 2751 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
2168 return -ENOSPC; 2752 return -ENOSPC;
@@ -2175,8 +2759,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2175 } 2759 }
2176 } 2760 }
2177 2761
2178 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2762 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2179 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2763 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
2764 if (ret) {
2765 if (ret != -ENOSPC)
2766 mlog_errno(ret);
2767 goto out;
2768 }
2769 }
2770
2771 ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
2772 xs->not_found ? NULL : xs->here);
2773 ret = ocfs2_xa_set(&loc, xi, ctxt);
2774 if (ret) {
2775 if (ret != -ENOSPC)
2776 mlog_errno(ret);
2777 goto out;
2778 }
2779 xs->here = loc.xl_entry;
2780
2180out: 2781out:
2181 up_write(&oi->ip_alloc_sem); 2782 up_write(&oi->ip_alloc_sem);
2182 2783
@@ -2236,12 +2837,11 @@ cleanup:
2236 return ret; 2837 return ret;
2237} 2838}
2238 2839
2239static int ocfs2_create_xattr_block(handle_t *handle, 2840static int ocfs2_create_xattr_block(struct inode *inode,
2240 struct inode *inode,
2241 struct buffer_head *inode_bh, 2841 struct buffer_head *inode_bh,
2242 struct ocfs2_alloc_context *meta_ac, 2842 struct ocfs2_xattr_set_ctxt *ctxt,
2243 struct buffer_head **ret_bh, 2843 int indexed,
2244 int indexed) 2844 struct buffer_head **ret_bh)
2245{ 2845{
2246 int ret; 2846 int ret;
2247 u16 suballoc_bit_start; 2847 u16 suballoc_bit_start;
@@ -2252,14 +2852,14 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2252 struct buffer_head *new_bh = NULL; 2852 struct buffer_head *new_bh = NULL;
2253 struct ocfs2_xattr_block *xblk; 2853 struct ocfs2_xattr_block *xblk;
2254 2854
2255 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, 2855 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2256 OCFS2_JOURNAL_ACCESS_CREATE); 2856 inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
2257 if (ret < 0) { 2857 if (ret < 0) {
2258 mlog_errno(ret); 2858 mlog_errno(ret);
2259 goto end; 2859 goto end;
2260 } 2860 }
2261 2861
2262 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
2263 &suballoc_bit_start, &num_got, 2863 &suballoc_bit_start, &num_got,
2264 &first_blkno); 2864 &first_blkno);
2265 if (ret < 0) { 2865 if (ret < 0) {
@@ -2270,7 +2870,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2270 new_bh = sb_getblk(inode->i_sb, first_blkno); 2870 new_bh = sb_getblk(inode->i_sb, first_blkno);
2271 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); 2871 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2272 2872
2273 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), 2873 ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
2274 new_bh, 2874 new_bh,
2275 OCFS2_JOURNAL_ACCESS_CREATE); 2875 OCFS2_JOURNAL_ACCESS_CREATE);
2276 if (ret < 0) { 2876 if (ret < 0) {
@@ -2282,11 +2882,10 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2282 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2882 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2283 memset(xblk, 0, inode->i_sb->s_blocksize); 2883 memset(xblk, 0, inode->i_sb->s_blocksize);
2284 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2285 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); 2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2286 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2287 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2288 xblk->xb_blkno = cpu_to_le64(first_blkno); 2888 xblk->xb_blkno = cpu_to_le64(first_blkno);
2289
2290 if (indexed) { 2889 if (indexed) {
2291 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2292 xr->xt_clusters = cpu_to_le32(1); 2891 xr->xt_clusters = cpu_to_le32(1);
@@ -2297,14 +2896,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2297 xr->xt_list.l_next_free_rec = cpu_to_le16(1); 2896 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2298 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); 2897 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2299 } 2898 }
2899 ocfs2_journal_dirty(ctxt->handle, new_bh);
2300 2900
2301 ret = ocfs2_journal_dirty(handle, new_bh); 2901 /* Add it to the inode */
2302 if (ret < 0) {
2303 mlog_errno(ret);
2304 goto end;
2305 }
2306 di->i_xattr_loc = cpu_to_le64(first_blkno); 2902 di->i_xattr_loc = cpu_to_le64(first_blkno);
2307 ocfs2_journal_dirty(handle, inode_bh); 2903
2904 spin_lock(&OCFS2_I(inode)->ip_lock);
2905 OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
2906 di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
2907 spin_unlock(&OCFS2_I(inode)->ip_lock);
2908
2909 ocfs2_journal_dirty(ctxt->handle, inode_bh);
2308 2910
2309 *ret_bh = new_bh; 2911 *ret_bh = new_bh;
2310 new_bh = NULL; 2912 new_bh = NULL;
@@ -2326,13 +2928,13 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2326 struct ocfs2_xattr_set_ctxt *ctxt) 2928 struct ocfs2_xattr_set_ctxt *ctxt)
2327{ 2929{
2328 struct buffer_head *new_bh = NULL; 2930 struct buffer_head *new_bh = NULL;
2329 handle_t *handle = ctxt->handle;
2330 struct ocfs2_xattr_block *xblk = NULL; 2931 struct ocfs2_xattr_block *xblk = NULL;
2331 int ret; 2932 int ret;
2933 struct ocfs2_xa_loc loc;
2332 2934
2333 if (!xs->xattr_bh) { 2935 if (!xs->xattr_bh) {
2334 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, 2936 ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
2335 ctxt->meta_ac, &new_bh, 0); 2937 0, &new_bh);
2336 if (ret) { 2938 if (ret) {
2337 mlog_errno(ret); 2939 mlog_errno(ret);
2338 goto end; 2940 goto end;
@@ -2348,21 +2950,25 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2348 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2349 2951
2350 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
2351 /* Set extended attribute into external block */ 2953 ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
2352 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2954 xs->not_found ? NULL : xs->here);
2353 OCFS2_HAS_XATTR_FL);
2354 if (!ret || ret != -ENOSPC)
2355 goto end;
2356 2955
2357 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2956 ret = ocfs2_xa_set(&loc, xi, ctxt);
2358 if (ret) 2957 if (!ret)
2958 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC)
2359 goto end; 2960 goto end;
2961 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
2963 if (ret)
2964 goto end;
2965 }
2360 } 2966 }
2361 2967
2362 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); 2968 if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
2969 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
2363 2970
2364end: 2971end:
2365
2366 return ret; 2972 return ret;
2367} 2973}
2368 2974
@@ -2371,7 +2977,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2371 struct ocfs2_xattr_info *xi, 2977 struct ocfs2_xattr_info *xi,
2372 struct ocfs2_xattr_search *xs) 2978 struct ocfs2_xattr_search *xs)
2373{ 2979{
2374 u64 value_size;
2375 struct ocfs2_xattr_entry *last; 2980 struct ocfs2_xattr_entry *last;
2376 int free, i; 2981 int free, i;
2377 size_t min_offs = xs->end - xs->base; 2982 size_t min_offs = xs->end - xs->base;
@@ -2394,13 +2999,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2394 2999
2395 BUG_ON(!xs->not_found); 3000 BUG_ON(!xs->not_found);
2396 3001
2397 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3002 if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
2398 value_size = OCFS2_XATTR_ROOT_SIZE;
2399 else
2400 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2401
2402 if (free >= sizeof(struct ocfs2_xattr_entry) +
2403 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2404 return 1; 3003 return 1;
2405 3004
2406 return 0; 3005 return 0;
@@ -2424,7 +3023,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2424 char *base = NULL; 3023 char *base = NULL;
2425 int name_offset, name_len = 0; 3024 int name_offset, name_len = 0;
2426 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, 3025 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2427 xi->value_len); 3026 xi->xi_value_len);
2428 u64 value_size; 3027 u64 value_size;
2429 3028
2430 /* 3029 /*
@@ -2432,14 +3031,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2432 * No matter whether we replace an old one or add a new one, 3031 * No matter whether we replace an old one or add a new one,
2433 * we need this for writing. 3032 * we need this for writing.
2434 */ 3033 */
2435 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3034 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2436 credits += new_clusters * 3035 credits += new_clusters *
2437 ocfs2_clusters_to_blocks(inode->i_sb, 1); 3036 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2438 3037
2439 if (xis->not_found && xbs->not_found) { 3038 if (xis->not_found && xbs->not_found) {
2440 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3039 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2441 3040
2442 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3041 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2443 clusters_add += new_clusters; 3042 clusters_add += new_clusters;
2444 credits += ocfs2_calc_extend_credits(inode->i_sb, 3043 credits += ocfs2_calc_extend_credits(inode->i_sb,
2445 &def_xv.xv.xr_list, 3044 &def_xv.xv.xr_list,
@@ -2484,7 +3083,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2484 * The credits for removing the value tree will be extended 3083 * The credits for removing the value tree will be extended
2485 * by ocfs2_remove_extent itself. 3084 * by ocfs2_remove_extent itself.
2486 */ 3085 */
2487 if (!xi->value) { 3086 if (!xi->xi_value) {
2488 if (!ocfs2_xattr_is_local(xe)) 3087 if (!ocfs2_xattr_is_local(xe))
2489 credits += ocfs2_remove_extent_credits(inode->i_sb); 3088 credits += ocfs2_remove_extent_credits(inode->i_sb);
2490 3089
@@ -2514,7 +3113,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2514 } 3113 }
2515 } 3114 }
2516 3115
2517 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3116 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2518 /* the new values will be stored outside. */ 3117 /* the new values will be stored outside. */
2519 u32 old_clusters = 0; 3118 u32 old_clusters = 0;
2520 3119
@@ -2547,9 +3146,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2547 * value, we don't need any allocation, otherwise we have 3146 * value, we don't need any allocation, otherwise we have
2548 * to guess metadata allocation. 3147 * to guess metadata allocation.
2549 */ 3148 */
2550 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || 3149 if ((ocfs2_xattr_is_local(xe) &&
3150 (value_size >= xi->xi_value_len)) ||
2551 (!ocfs2_xattr_is_local(xe) && 3151 (!ocfs2_xattr_is_local(xe) &&
2552 OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) 3152 OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
2553 goto out; 3153 goto out;
2554 } 3154 }
2555 3155
@@ -2639,7 +3239,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2639 3239
2640 meta_add += extra_meta; 3240 meta_add += extra_meta;
2641 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 3241 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2642 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 3242 "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
2643 3243
2644 if (meta_add) { 3244 if (meta_add) {
2645 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, 3245 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2679,7 +3279,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2679{ 3279{
2680 int ret = 0, credits, old_found; 3280 int ret = 0, credits, old_found;
2681 3281
2682 if (!xi->value) { 3282 if (!xi->xi_value) {
2683 /* Remove existing extended attribute */ 3283 /* Remove existing extended attribute */
2684 if (!xis->not_found) 3284 if (!xis->not_found)
2685 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); 3285 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2693,8 +3293,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2693 * If succeed and that extended attribute existing in 3293 * If succeed and that extended attribute existing in
2694 * external block, then we will remove it. 3294 * external block, then we will remove it.
2695 */ 3295 */
2696 xi->value = NULL; 3296 xi->xi_value = NULL;
2697 xi->value_len = 0; 3297 xi->xi_value_len = 0;
2698 3298
2699 old_found = xis->not_found; 3299 old_found = xis->not_found;
2700 xis->not_found = -ENODATA; 3300 xis->not_found = -ENODATA;
@@ -2722,8 +3322,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2722 } else if (ret == -ENOSPC) { 3322 } else if (ret == -ENOSPC) {
2723 if (di->i_xattr_loc && !xbs->xattr_bh) { 3323 if (di->i_xattr_loc && !xbs->xattr_bh) {
2724 ret = ocfs2_xattr_block_find(inode, 3324 ret = ocfs2_xattr_block_find(inode,
2725 xi->name_index, 3325 xi->xi_name_index,
2726 xi->name, xbs); 3326 xi->xi_name, xbs);
2727 if (ret) 3327 if (ret)
2728 goto out; 3328 goto out;
2729 3329
@@ -2762,8 +3362,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2762 * If succeed and that extended attribute 3362 * If succeed and that extended attribute
2763 * existing in inode, we will remove it. 3363 * existing in inode, we will remove it.
2764 */ 3364 */
2765 xi->value = NULL; 3365 xi->xi_value = NULL;
2766 xi->value_len = 0; 3366 xi->xi_value_len = 0;
2767 xbs->not_found = -ENODATA; 3367 xbs->not_found = -ENODATA;
2768 ret = ocfs2_calc_xattr_set_need(inode, 3368 ret = ocfs2_calc_xattr_set_need(inode,
2769 di, 3369 di,
@@ -2829,10 +3429,11 @@ int ocfs2_xattr_set_handle(handle_t *handle,
2829 int ret; 3429 int ret;
2830 3430
2831 struct ocfs2_xattr_info xi = { 3431 struct ocfs2_xattr_info xi = {
2832 .name_index = name_index, 3432 .xi_name_index = name_index,
2833 .name = name, 3433 .xi_name = name,
2834 .value = value, 3434 .xi_name_len = strlen(name),
2835 .value_len = value_len, 3435 .xi_value = value,
3436 .xi_value_len = value_len,
2836 }; 3437 };
2837 3438
2838 struct ocfs2_xattr_search xis = { 3439 struct ocfs2_xattr_search xis = {
@@ -2912,10 +3513,11 @@ int ocfs2_xattr_set(struct inode *inode,
2912 struct ocfs2_refcount_tree *ref_tree = NULL; 3513 struct ocfs2_refcount_tree *ref_tree = NULL;
2913 3514
2914 struct ocfs2_xattr_info xi = { 3515 struct ocfs2_xattr_info xi = {
2915 .name_index = name_index, 3516 .xi_name_index = name_index,
2916 .name = name, 3517 .xi_name = name,
2917 .value = value, 3518 .xi_name_len = strlen(name),
2918 .value_len = value_len, 3519 .xi_value = value,
3520 .xi_value_len = value_len,
2919 }; 3521 };
2920 3522
2921 struct ocfs2_xattr_search xis = { 3523 struct ocfs2_xattr_search xis = {
@@ -3759,7 +4361,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3759 struct ocfs2_xattr_bucket *bucket) 4361 struct ocfs2_xattr_bucket *bucket)
3760{ 4362{
3761 int ret, i; 4363 int ret, i;
3762 size_t end, offset, len, value_len; 4364 size_t end, offset, len;
3763 struct ocfs2_xattr_header *xh; 4365 struct ocfs2_xattr_header *xh;
3764 char *entries, *buf, *bucket_buf = NULL; 4366 char *entries, *buf, *bucket_buf = NULL;
3765 u64 blkno = bucket_blkno(bucket); 4367 u64 blkno = bucket_blkno(bucket);
@@ -3813,12 +4415,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3813 end = OCFS2_XATTR_BUCKET_SIZE; 4415 end = OCFS2_XATTR_BUCKET_SIZE;
3814 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { 4416 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
3815 offset = le16_to_cpu(xe->xe_name_offset); 4417 offset = le16_to_cpu(xe->xe_name_offset);
3816 if (ocfs2_xattr_is_local(xe)) 4418 len = namevalue_size_xe(xe);
3817 value_len = OCFS2_XATTR_SIZE(
3818 le64_to_cpu(xe->xe_value_size));
3819 else
3820 value_len = OCFS2_XATTR_ROOT_SIZE;
3821 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
3822 4419
3823 /* 4420 /*
3824 * We must make sure that the name/value pair 4421 * We must make sure that the name/value pair
@@ -4007,7 +4604,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4007 int new_bucket_head) 4604 int new_bucket_head)
4008{ 4605{
4009 int ret, i; 4606 int ret, i;
4010 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 4607 int count, start, len, name_value_len = 0, name_offset = 0;
4011 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; 4608 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
4012 struct ocfs2_xattr_header *xh; 4609 struct ocfs2_xattr_header *xh;
4013 struct ocfs2_xattr_entry *xe; 4610 struct ocfs2_xattr_entry *xe;
@@ -4098,13 +4695,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4098 name_value_len = 0; 4695 name_value_len = 0;
4099 for (i = 0; i < start; i++) { 4696 for (i = 0; i < start; i++) {
4100 xe = &xh->xh_entries[i]; 4697 xe = &xh->xh_entries[i];
4101 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); 4698 name_value_len += namevalue_size_xe(xe);
4102 if (ocfs2_xattr_is_local(xe))
4103 xe_len +=
4104 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4105 else
4106 xe_len += OCFS2_XATTR_ROOT_SIZE;
4107 name_value_len += xe_len;
4108 if (le16_to_cpu(xe->xe_name_offset) < name_offset) 4699 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
4109 name_offset = le16_to_cpu(xe->xe_name_offset); 4700 name_offset = le16_to_cpu(xe->xe_name_offset);
4110 } 4701 }
@@ -4134,12 +4725,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4134 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); 4725 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4135 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 4726 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4136 xe = &xh->xh_entries[i]; 4727 xe = &xh->xh_entries[i];
4137 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
4138 if (ocfs2_xattr_is_local(xe))
4139 xe_len +=
4140 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4141 else
4142 xe_len += OCFS2_XATTR_ROOT_SIZE;
4143 if (le16_to_cpu(xe->xe_name_offset) < 4728 if (le16_to_cpu(xe->xe_name_offset) <
4144 le16_to_cpu(xh->xh_free_start)) 4729 le16_to_cpu(xh->xh_free_start))
4145 xh->xh_free_start = xe->xe_name_offset; 4730 xh->xh_free_start = xe->xe_name_offset;
@@ -4751,195 +5336,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
4751} 5336}
4752 5337
4753/* 5338/*
4754 * Handle the normal xattr set, including replace, delete and new.
4755 *
4756 * Note: "local" indicates the real data's locality. So we can't
4757 * just its bucket locality by its length.
4758 */
4759static void ocfs2_xattr_set_entry_normal(struct inode *inode,
4760 struct ocfs2_xattr_info *xi,
4761 struct ocfs2_xattr_search *xs,
4762 u32 name_hash,
4763 int local)
4764{
4765 struct ocfs2_xattr_entry *last, *xe;
4766 int name_len = strlen(xi->name);
4767 struct ocfs2_xattr_header *xh = xs->header;
4768 u16 count = le16_to_cpu(xh->xh_count), start;
4769 size_t blocksize = inode->i_sb->s_blocksize;
4770 char *val;
4771 size_t offs, size, new_size;
4772
4773 last = &xh->xh_entries[count];
4774 if (!xs->not_found) {
4775 xe = xs->here;
4776 offs = le16_to_cpu(xe->xe_name_offset);
4777 if (ocfs2_xattr_is_local(xe))
4778 size = OCFS2_XATTR_SIZE(name_len) +
4779 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4780 else
4781 size = OCFS2_XATTR_SIZE(name_len) +
4782 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4783
4784 /*
4785 * If the new value will be stored outside, xi->value has been
4786 * initalized as an empty ocfs2_xattr_value_root, and the same
4787 * goes with xi->value_len, so we can set new_size safely here.
4788 * See ocfs2_xattr_set_in_bucket.
4789 */
4790 new_size = OCFS2_XATTR_SIZE(name_len) +
4791 OCFS2_XATTR_SIZE(xi->value_len);
4792
4793 le16_add_cpu(&xh->xh_name_value_len, -size);
4794 if (xi->value) {
4795 if (new_size > size)
4796 goto set_new_name_value;
4797
4798 /* Now replace the old value with new one. */
4799 if (local)
4800 xe->xe_value_size = cpu_to_le64(xi->value_len);
4801 else
4802 xe->xe_value_size = 0;
4803
4804 val = ocfs2_xattr_bucket_get_val(inode,
4805 xs->bucket, offs);
4806 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
4807 size - OCFS2_XATTR_SIZE(name_len));
4808 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
4809 memcpy(val + OCFS2_XATTR_SIZE(name_len),
4810 xi->value, xi->value_len);
4811
4812 le16_add_cpu(&xh->xh_name_value_len, new_size);
4813 ocfs2_xattr_set_local(xe, local);
4814 return;
4815 } else {
4816 /*
4817 * Remove the old entry if there is more than one.
4818 * We don't remove the last entry so that we can
4819 * use it to indicate the hash value of the empty
4820 * bucket.
4821 */
4822 last -= 1;
4823 le16_add_cpu(&xh->xh_count, -1);
4824 if (xh->xh_count) {
4825 memmove(xe, xe + 1,
4826 (void *)last - (void *)xe);
4827 memset(last, 0,
4828 sizeof(struct ocfs2_xattr_entry));
4829 } else
4830 xh->xh_free_start =
4831 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4832
4833 return;
4834 }
4835 } else {
4836 /* find a new entry for insert. */
4837 int low = 0, high = count - 1, tmp;
4838 struct ocfs2_xattr_entry *tmp_xe;
4839
4840 while (low <= high && count) {
4841 tmp = (low + high) / 2;
4842 tmp_xe = &xh->xh_entries[tmp];
4843
4844 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
4845 low = tmp + 1;
4846 else if (name_hash <
4847 le32_to_cpu(tmp_xe->xe_name_hash))
4848 high = tmp - 1;
4849 else {
4850 low = tmp;
4851 break;
4852 }
4853 }
4854
4855 xe = &xh->xh_entries[low];
4856 if (low != count)
4857 memmove(xe + 1, xe, (void *)last - (void *)xe);
4858
4859 le16_add_cpu(&xh->xh_count, 1);
4860 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4861 xe->xe_name_hash = cpu_to_le32(name_hash);
4862 xe->xe_name_len = name_len;
4863 ocfs2_xattr_set_type(xe, xi->name_index);
4864 }
4865
4866set_new_name_value:
4867 /* Insert the new name+value. */
4868 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4869
4870 /*
4871 * We must make sure that the name/value pair
4872 * exists in the same block.
4873 */
4874 offs = le16_to_cpu(xh->xh_free_start);
4875 start = offs - size;
4876
4877 if (start >> inode->i_sb->s_blocksize_bits !=
4878 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4879 offs = offs - offs % blocksize;
4880 xh->xh_free_start = cpu_to_le16(offs);
4881 }
4882
4883 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4884 xe->xe_name_offset = cpu_to_le16(offs - size);
4885
4886 memset(val, 0, size);
4887 memcpy(val, xi->name, name_len);
4888 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4889
4890 xe->xe_value_size = cpu_to_le64(xi->value_len);
4891 ocfs2_xattr_set_local(xe, local);
4892 xs->here = xe;
4893 le16_add_cpu(&xh->xh_free_start, -size);
4894 le16_add_cpu(&xh->xh_name_value_len, size);
4895
4896 return;
4897}
4898
4899/*
4900 * Set the xattr entry in the specified bucket.
4901 * The bucket is indicated by xs->bucket and it should have the enough
4902 * space for the xattr insertion.
4903 */
4904static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4905 handle_t *handle,
4906 struct ocfs2_xattr_info *xi,
4907 struct ocfs2_xattr_search *xs,
4908 u32 name_hash,
4909 int local)
4910{
4911 int ret;
4912 u64 blkno;
4913
4914 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4915 (unsigned long)xi->value_len, xi->name_index,
4916 (unsigned long long)bucket_blkno(xs->bucket));
4917
4918 if (!xs->bucket->bu_bhs[1]) {
4919 blkno = bucket_blkno(xs->bucket);
4920 ocfs2_xattr_bucket_relse(xs->bucket);
4921 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4922 if (ret) {
4923 mlog_errno(ret);
4924 goto out;
4925 }
4926 }
4927
4928 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4929 OCFS2_JOURNAL_ACCESS_WRITE);
4930 if (ret < 0) {
4931 mlog_errno(ret);
4932 goto out;
4933 }
4934
4935 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4936 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4937
4938out:
4939 return ret;
4940}
4941
4942/*
4943 * Truncate the specified xe_off entry in xattr bucket. 5339 * Truncate the specified xe_off entry in xattr bucket.
4944 * bucket is indicated by header_bh and len is the new length. 5340 * bucket is indicated by header_bh and len is the new length.
4945 * Both the ocfs2_xattr_value_root and the entry will be updated here. 5341 * Both the ocfs2_xattr_value_root and the entry will be updated here.
@@ -5009,66 +5405,6 @@ out:
5009 return ret; 5405 return ret;
5010} 5406}
5011 5407
5012static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
5013 struct ocfs2_xattr_search *xs,
5014 int len,
5015 struct ocfs2_xattr_set_ctxt *ctxt)
5016{
5017 int ret, offset;
5018 struct ocfs2_xattr_entry *xe = xs->here;
5019 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
5020
5021 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
5022
5023 offset = xe - xh->xh_entries;
5024 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
5025 offset, len, ctxt);
5026 if (ret)
5027 mlog_errno(ret);
5028
5029 return ret;
5030}
5031
5032static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
5033 handle_t *handle,
5034 struct ocfs2_xattr_search *xs,
5035 char *val,
5036 int value_len)
5037{
5038 int ret, offset, block_off;
5039 struct ocfs2_xattr_value_root *xv;
5040 struct ocfs2_xattr_entry *xe = xs->here;
5041 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5042 void *base;
5043 struct ocfs2_xattr_value_buf vb = {
5044 .vb_access = ocfs2_journal_access,
5045 };
5046
5047 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
5048
5049 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
5050 xe - xh->xh_entries,
5051 &block_off,
5052 &offset);
5053 if (ret) {
5054 mlog_errno(ret);
5055 goto out;
5056 }
5057
5058 base = bucket_block(xs->bucket, block_off);
5059 xv = (struct ocfs2_xattr_value_root *)(base + offset +
5060 OCFS2_XATTR_SIZE(xe->xe_name_len));
5061
5062 vb.vb_xv = xv;
5063 vb.vb_bh = xs->bucket->bu_bhs[block_off];
5064 ret = __ocfs2_xattr_set_value_outside(inode, handle,
5065 &vb, val, value_len);
5066 if (ret)
5067 mlog_errno(ret);
5068out:
5069 return ret;
5070}
5071
5072static int ocfs2_rm_xattr_cluster(struct inode *inode, 5408static int ocfs2_rm_xattr_cluster(struct inode *inode,
5073 struct buffer_head *root_bh, 5409 struct buffer_head *root_bh,
5074 u64 blkno, 5410 u64 blkno,
@@ -5167,128 +5503,6 @@ out:
5167 return ret; 5503 return ret;
5168} 5504}
5169 5505
5170static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
5171 handle_t *handle,
5172 struct ocfs2_xattr_search *xs)
5173{
5174 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5175 struct ocfs2_xattr_entry *last = &xh->xh_entries[
5176 le16_to_cpu(xh->xh_count) - 1];
5177 int ret = 0;
5178
5179 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
5180 OCFS2_JOURNAL_ACCESS_WRITE);
5181 if (ret) {
5182 mlog_errno(ret);
5183 return;
5184 }
5185
5186 /* Remove the old entry. */
5187 memmove(xs->here, xs->here + 1,
5188 (void *)last - (void *)xs->here);
5189 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
5190 le16_add_cpu(&xh->xh_count, -1);
5191
5192 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
5193}
5194
5195/*
5196 * Set the xattr name/value in the bucket specified in xs.
5197 *
5198 * As the new value in xi may be stored in the bucket or in an outside cluster,
5199 * we divide the whole process into 3 steps:
5200 * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
5201 * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
5202 * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
5203 * 4. If the clusters for the new outside value can't be allocated, we need
5204 * to free the xattr we allocated in set.
5205 */
5206static int ocfs2_xattr_set_in_bucket(struct inode *inode,
5207 struct ocfs2_xattr_info *xi,
5208 struct ocfs2_xattr_search *xs,
5209 struct ocfs2_xattr_set_ctxt *ctxt)
5210{
5211 int ret, local = 1;
5212 size_t value_len;
5213 char *val = (char *)xi->value;
5214 struct ocfs2_xattr_entry *xe = xs->here;
5215 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
5216 strlen(xi->name));
5217
5218 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
5219 /*
5220 * We need to truncate the xattr storage first.
5221 *
5222 * If both the old and new value are stored to
5223 * outside block, we only need to truncate
5224 * the storage and then set the value outside.
5225 *
5226 * If the new value should be stored within block,
5227 * we should free all the outside block first and
5228 * the modification to the xattr block will be done
5229 * by following steps.
5230 */
5231 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
5232 value_len = xi->value_len;
5233 else
5234 value_len = 0;
5235
5236 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5237 value_len,
5238 ctxt);
5239 if (ret)
5240 goto out;
5241
5242 if (value_len)
5243 goto set_value_outside;
5244 }
5245
5246 value_len = xi->value_len;
5247 /* So we have to handle the inside block change now. */
5248 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
5249 /*
5250 * If the new value will be stored outside of block,
5251 * initalize a new empty value root and insert it first.
5252 */
5253 local = 0;
5254 xi->value = &def_xv;
5255 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
5256 }
5257
5258 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
5259 name_hash, local);
5260 if (ret) {
5261 mlog_errno(ret);
5262 goto out;
5263 }
5264
5265 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
5266 goto out;
5267
5268 /* allocate the space now for the outside block storage. */
5269 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5270 value_len, ctxt);
5271 if (ret) {
5272 mlog_errno(ret);
5273
5274 if (xs->not_found) {
5275 /*
5276 * We can't allocate enough clusters for outside
5277 * storage and we have allocated xattr already,
5278 * so need to remove it.
5279 */
5280 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
5281 }
5282 goto out;
5283 }
5284
5285set_value_outside:
5286 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5287 xs, val, value_len);
5288out:
5289 return ret;
5290}
5291
5292/* 5506/*
5293 * check whether the xattr bucket is filled up with the same hash value. 5507 * check whether the xattr bucket is filled up with the same hash value.
5294 * If we want to insert the xattr with the same hash, return -ENOSPC. 5508 * If we want to insert the xattr with the same hash, return -ENOSPC.
@@ -5317,156 +5531,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
5317 return 0; 5531 return 0;
5318} 5532}
5319 5533
5320static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5534/*
5321 struct ocfs2_xattr_info *xi, 5535 * Try to set the entry in the current bucket. If we fail, the caller
5322 struct ocfs2_xattr_search *xs, 5536 * will handle getting us another bucket.
5323 struct ocfs2_xattr_set_ctxt *ctxt) 5537 */
5538static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5539 struct ocfs2_xattr_info *xi,
5540 struct ocfs2_xattr_search *xs,
5541 struct ocfs2_xattr_set_ctxt *ctxt)
5324{ 5542{
5325 struct ocfs2_xattr_header *xh; 5543 int ret;
5326 struct ocfs2_xattr_entry *xe; 5544 struct ocfs2_xa_loc loc;
5327 u16 count, header_size, xh_free_start;
5328 int free, max_free, need, old;
5329 size_t value_size = 0, name_len = strlen(xi->name);
5330 size_t blocksize = inode->i_sb->s_blocksize;
5331 int ret, allocation = 0;
5332
5333 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
5334
5335try_again:
5336 xh = xs->header;
5337 count = le16_to_cpu(xh->xh_count);
5338 xh_free_start = le16_to_cpu(xh->xh_free_start);
5339 header_size = sizeof(struct ocfs2_xattr_header) +
5340 count * sizeof(struct ocfs2_xattr_entry);
5341 max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
5342 le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
5343
5344 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
5345 "of %u which exceed block size\n",
5346 (unsigned long long)bucket_blkno(xs->bucket),
5347 header_size);
5348 5545
5349 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5546 mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
5350 value_size = OCFS2_XATTR_ROOT_SIZE;
5351 else if (xi->value)
5352 value_size = OCFS2_XATTR_SIZE(xi->value_len);
5353 5547
5354 if (xs->not_found) 5548 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
5355 need = sizeof(struct ocfs2_xattr_entry) + 5549 xs->not_found ? NULL : xs->here);
5356 OCFS2_XATTR_SIZE(name_len) + value_size; 5550 ret = ocfs2_xa_set(&loc, xi, ctxt);
5357 else { 5551 if (!ret) {
5358 need = value_size + OCFS2_XATTR_SIZE(name_len); 5552 xs->here = loc.xl_entry;
5553 goto out;
5554 }
5555 if (ret != -ENOSPC) {
5556 mlog_errno(ret);
5557 goto out;
5558 }
5359 5559
5360 /* 5560 /* Ok, we need space. Let's try defragmenting the bucket. */
5361 * We only replace the old value if the new length is smaller 5561 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5362 * than the old one. Otherwise we will allocate new space in the 5562 xs->bucket);
5363 * bucket to store it. 5563 if (ret) {
5364 */ 5564 mlog_errno(ret);
5365 xe = xs->here; 5565 goto out;
5366 if (ocfs2_xattr_is_local(xe)) 5566 }
5367 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
5368 else
5369 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
5370 5567
5371 if (old >= value_size) 5568 ret = ocfs2_xa_set(&loc, xi, ctxt);
5372 need = 0; 5569 if (!ret) {
5570 xs->here = loc.xl_entry;
5571 goto out;
5373 } 5572 }
5573 if (ret != -ENOSPC)
5574 mlog_errno(ret);
5374 5575
5375 free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
5376 /*
5377 * We need to make sure the new name/value pair
5378 * can exist in the same block.
5379 */
5380 if (xh_free_start % blocksize < need)
5381 free -= xh_free_start % blocksize;
5382
5383 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
5384 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
5385 " %u\n", xs->not_found,
5386 (unsigned long long)bucket_blkno(xs->bucket),
5387 free, need, max_free, le16_to_cpu(xh->xh_free_start),
5388 le16_to_cpu(xh->xh_name_value_len));
5389
5390 if (free < need ||
5391 (xs->not_found &&
5392 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
5393 if (need <= max_free &&
5394 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
5395 /*
5396 * We can create the space by defragment. Since only the
5397 * name/value will be moved, the xe shouldn't be changed
5398 * in xs.
5399 */
5400 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5401 xs->bucket);
5402 if (ret) {
5403 mlog_errno(ret);
5404 goto out;
5405 }
5406 5576
5407 xh_free_start = le16_to_cpu(xh->xh_free_start); 5577out:
5408 free = xh_free_start - header_size 5578 mlog_exit(ret);
5409 - OCFS2_XATTR_HEADER_GAP; 5579 return ret;
5410 if (xh_free_start % blocksize < need) 5580}
5411 free -= xh_free_start % blocksize;
5412 5581
5413 if (free >= need) 5582static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5414 goto xattr_set; 5583 struct ocfs2_xattr_info *xi,
5584 struct ocfs2_xattr_search *xs,
5585 struct ocfs2_xattr_set_ctxt *ctxt)
5586{
5587 int ret;
5415 5588
5416 mlog(0, "Can't get enough space for xattr insert by " 5589 mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
5417 "defragment. Need %u bytes, but we have %d, so "
5418 "allocate new bucket for it.\n", need, free);
5419 }
5420 5590
5421 /* 5591 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5422 * We have to add new buckets or clusters and one 5592 if (!ret)
5423 * allocation should leave us enough space for insert. 5593 goto out;
5424 */ 5594 if (ret != -ENOSPC) {
5425 BUG_ON(allocation); 5595 mlog_errno(ret);
5596 goto out;
5597 }
5426 5598
5427 /* 5599 /* Ack, need more space. Let's try to get another bucket! */
5428 * We do not allow for overlapping ranges between buckets. And
5429 * the maximum number of collisions we will allow for then is
5430 * one bucket's worth, so check it here whether we need to
5431 * add a new bucket for the insert.
5432 */
5433 ret = ocfs2_check_xattr_bucket_collision(inode,
5434 xs->bucket,
5435 xi->name);
5436 if (ret) {
5437 mlog_errno(ret);
5438 goto out;
5439 }
5440 5600
5441 ret = ocfs2_add_new_xattr_bucket(inode, 5601 /*
5442 xs->xattr_bh, 5602 * We do not allow for overlapping ranges between buckets. And
5603 * the maximum number of collisions we will allow for then is
5604 * one bucket's worth, so check it here whether we need to
5605 * add a new bucket for the insert.
5606 */
5607 ret = ocfs2_check_xattr_bucket_collision(inode,
5443 xs->bucket, 5608 xs->bucket,
5444 ctxt); 5609 xi->xi_name);
5445 if (ret) { 5610 if (ret) {
5446 mlog_errno(ret); 5611 mlog_errno(ret);
5447 goto out; 5612 goto out;
5448 } 5613 }
5449 5614
5450 /* 5615 ret = ocfs2_add_new_xattr_bucket(inode,
5451 * ocfs2_add_new_xattr_bucket() will have updated 5616 xs->xattr_bh,
5452 * xs->bucket if it moved, but it will not have updated 5617 xs->bucket,
5453 * any of the other search fields. Thus, we drop it and 5618 ctxt);
5454 * re-search. Everything should be cached, so it'll be 5619 if (ret) {
5455 * quick. 5620 mlog_errno(ret);
5456 */ 5621 goto out;
5457 ocfs2_xattr_bucket_relse(xs->bucket);
5458 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5459 xi->name_index,
5460 xi->name, xs);
5461 if (ret && ret != -ENODATA)
5462 goto out;
5463 xs->not_found = ret;
5464 allocation = 1;
5465 goto try_again;
5466 } 5622 }
5467 5623
5468xattr_set: 5624 /*
5469 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); 5625 * ocfs2_add_new_xattr_bucket() will have updated
5626 * xs->bucket if it moved, but it will not have updated
5627 * any of the other search fields. Thus, we drop it and
5628 * re-search. Everything should be cached, so it'll be
5629 * quick.
5630 */
5631 ocfs2_xattr_bucket_relse(xs->bucket);
5632 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5633 xi->xi_name_index,
5634 xi->xi_name, xs);
5635 if (ret && ret != -ENODATA)
5636 goto out;
5637 xs->not_found = ret;
5638
5639 /* Ok, we have a new bucket, let's try again */
5640 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5641 if (ret && (ret != -ENOSPC))
5642 mlog_errno(ret);
5643
5470out: 5644out:
5471 mlog_exit(ret); 5645 mlog_exit(ret);
5472 return ret; 5646 return ret;
@@ -5678,7 +5852,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5678 * refcount tree, and make the original extent become 3. So we will need 5852 * refcount tree, and make the original extent become 3. So we will need
5679 * 2 * cluster more extent recs at most. 5853 * 2 * cluster more extent recs at most.
5680 */ 5854 */
5681 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { 5855 if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
5682 5856
5683 ret = ocfs2_refcounted_xattr_delete_need(inode, 5857 ret = ocfs2_refcounted_xattr_delete_need(inode,
5684 &(*ref_tree)->rf_ci, 5858 &(*ref_tree)->rf_ci,
@@ -6354,9 +6528,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6354 int indexed) 6528 int indexed)
6355{ 6529{
6356 int ret; 6530 int ret;
6357 handle_t *handle;
6358 struct ocfs2_alloc_context *meta_ac; 6531 struct ocfs2_alloc_context *meta_ac;
6359 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6533 struct ocfs2_xattr_set_ctxt ctxt = {
6534 .meta_ac = meta_ac,
6535 };
6360 6536
6361 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6537 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
6362 if (ret < 0) { 6538 if (ret < 0) {
@@ -6364,21 +6540,21 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6364 return ret; 6540 return ret;
6365 } 6541 }
6366 6542
6367 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); 6543 ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6368 if (IS_ERR(handle)) { 6544 if (IS_ERR(ctxt.handle)) {
6369 ret = PTR_ERR(handle); 6545 ret = PTR_ERR(ctxt.handle);
6370 mlog_errno(ret); 6546 mlog_errno(ret);
6371 goto out; 6547 goto out;
6372 } 6548 }
6373 6549
6374 mlog(0, "create new xattr block for inode %llu, index = %d\n", 6550 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6375 (unsigned long long)fe_bh->b_blocknr, indexed); 6551 (unsigned long long)fe_bh->b_blocknr, indexed);
6376 ret = ocfs2_create_xattr_block(handle, inode, fe_bh, 6552 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
6377 meta_ac, ret_bh, indexed); 6553 ret_bh);
6378 if (ret) 6554 if (ret)
6379 mlog_errno(ret); 6555 mlog_errno(ret);
6380 6556
6381 ocfs2_commit_trans(osb, handle); 6557 ocfs2_commit_trans(osb, ctxt.handle);
6382out: 6558out:
6383 ocfs2_free_alloc_context(meta_ac); 6559 ocfs2_free_alloc_context(meta_ac);
6384 return ret; 6560 return ret;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index f3b7c1541f3a..75d9b5ba1d45 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -11,6 +11,7 @@
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/writeback.h>
14#include <linux/crc-itu-t.h> 15#include <linux/crc-itu-t.h>
15#include "omfs.h" 16#include "omfs.h"
16 17
@@ -89,7 +90,7 @@ static void omfs_update_checksums(struct omfs_inode *oi)
89 oi->i_head.h_check_xor = xor; 90 oi->i_head.h_check_xor = xor;
90} 91}
91 92
92static int omfs_write_inode(struct inode *inode, int wait) 93static int __omfs_write_inode(struct inode *inode, int wait)
93{ 94{
94 struct omfs_inode *oi; 95 struct omfs_inode *oi;
95 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); 96 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
@@ -162,9 +163,14 @@ out:
162 return ret; 163 return ret;
163} 164}
164 165
166static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc)
167{
168 return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
169}
170
165int omfs_sync_inode(struct inode *inode) 171int omfs_sync_inode(struct inode *inode)
166{ 172{
167 return omfs_write_inode(inode, 1); 173 return __omfs_write_inode(inode, 1);
168} 174}
169 175
170/* 176/*
diff --git a/fs/open.c b/fs/open.c
index 040cef72bc00..e0b2d88b0380 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -271,7 +271,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
271 * Make sure that there are no leases. get_write_access() protects 271 * Make sure that there are no leases. get_write_access() protects
272 * against the truncate racing with a lease-granting setlease(). 272 * against the truncate racing with a lease-granting setlease().
273 */ 273 */
274 error = break_lease(inode, FMODE_WRITE); 274 error = break_lease(inode, O_WRONLY);
275 if (error) 275 if (error)
276 goto put_write_and_out; 276 goto put_write_and_out;
277 277
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 64bc8998ac9a..e8865c11777f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -412,9 +412,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
412 pdev = part_to_dev(p); 412 pdev = part_to_dev(p);
413 413
414 p->start_sect = start; 414 p->start_sect = start;
415 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); 415 p->alignment_offset =
416 p->discard_alignment = queue_sector_discard_alignment(disk->queue, 416 queue_limit_alignment_offset(&disk->queue->limits, start);
417 start); 417 p->discard_alignment =
418 queue_limit_discard_alignment(&disk->queue->limits, start);
418 p->nr_sects = len; 419 p->nr_sects = len;
419 p->partno = partno; 420 p->partno = partno;
420 p->policy = get_disk_ro(disk); 421 p->policy = get_disk_ro(disk);
diff --git a/fs/pnode.c b/fs/pnode.c
index 8d5f392ec3d3..5cc564a83149 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -86,7 +86,7 @@ static int do_make_slave(struct vfsmount *mnt)
86 86
87 /* 87 /*
88 * slave 'mnt' to a peer mount that has the 88 * slave 'mnt' to a peer mount that has the
89 * same root dentry. If none is available than 89 * same root dentry. If none is available then
90 * slave it to anything that is available. 90 * slave it to anything that is available.
91 */ 91 */
92 while ((peer_mnt = next_peer(peer_mnt)) != mnt && 92 while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
@@ -147,6 +147,11 @@ void change_mnt_propagation(struct vfsmount *mnt, int type)
147 * get the next mount in the propagation tree. 147 * get the next mount in the propagation tree.
148 * @m: the mount seen last 148 * @m: the mount seen last
149 * @origin: the original mount from where the tree walk initiated 149 * @origin: the original mount from where the tree walk initiated
150 *
151 * Note that peer groups form contiguous segments of slave lists.
152 * We rely on that in get_source() to be able to find out if
153 * vfsmount found while iterating with propagation_next() is
154 * a peer of one we'd found earlier.
150 */ 155 */
151static struct vfsmount *propagation_next(struct vfsmount *m, 156static struct vfsmount *propagation_next(struct vfsmount *m,
152 struct vfsmount *origin) 157 struct vfsmount *origin)
@@ -186,10 +191,6 @@ static struct vfsmount *get_source(struct vfsmount *dest,
186{ 191{
187 struct vfsmount *p_last_src = NULL; 192 struct vfsmount *p_last_src = NULL;
188 struct vfsmount *p_last_dest = NULL; 193 struct vfsmount *p_last_dest = NULL;
189 *type = CL_PROPAGATION;
190
191 if (IS_MNT_SHARED(dest))
192 *type |= CL_MAKE_SHARED;
193 194
194 while (last_dest != dest->mnt_master) { 195 while (last_dest != dest->mnt_master) {
195 p_last_dest = last_dest; 196 p_last_dest = last_dest;
@@ -202,13 +203,18 @@ static struct vfsmount *get_source(struct vfsmount *dest,
202 do { 203 do {
203 p_last_dest = next_peer(p_last_dest); 204 p_last_dest = next_peer(p_last_dest);
204 } while (IS_MNT_NEW(p_last_dest)); 205 } while (IS_MNT_NEW(p_last_dest));
206 /* is that a peer of the earlier? */
207 if (dest == p_last_dest) {
208 *type = CL_MAKE_SHARED;
209 return p_last_src;
210 }
205 } 211 }
206 212 /* slave of the earlier, then */
207 if (dest != p_last_dest) { 213 *type = CL_SLAVE;
208 *type |= CL_SLAVE; 214 /* beginning of peer group among the slaves? */
209 return last_src; 215 if (IS_MNT_SHARED(dest))
210 } else 216 *type |= CL_MAKE_SHARED;
211 return p_last_src; 217 return last_src;
212} 218}
213 219
214/* 220/*
diff --git a/fs/pnode.h b/fs/pnode.h
index 958665d662af..1ea4ae1efcd3 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -21,12 +21,11 @@
21#define CL_SLAVE 0x02 21#define CL_SLAVE 0x02
22#define CL_COPY_ALL 0x04 22#define CL_COPY_ALL 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PROPAGATION 0x10 24#define CL_PRIVATE 0x10
25#define CL_PRIVATE 0x20
26 25
27static inline void set_mnt_shared(struct vfsmount *mnt) 26static inline void set_mnt_shared(struct vfsmount *mnt)
28{ 27{
29 mnt->mnt_flags &= ~MNT_PNODE_MASK; 28 mnt->mnt_flags &= ~MNT_SHARED_MASK;
30 mnt->mnt_flags |= MNT_SHARED; 29 mnt->mnt_flags |= MNT_SHARED;
31} 30}
32 31
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 13b5d0708175..18e20feee251 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -270,7 +270,9 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
270 blocked = p->blocked; 270 blocked = p->blocked;
271 collect_sigign_sigcatch(p, &ignored, &caught); 271 collect_sigign_sigcatch(p, &ignored, &caught);
272 num_threads = atomic_read(&p->signal->count); 272 num_threads = atomic_read(&p->signal->count);
273 rcu_read_lock(); /* FIXME: is this correct? */
273 qsize = atomic_read(&__task_cred(p)->user->sigpending); 274 qsize = atomic_read(&__task_cred(p)->user->sigpending);
275 rcu_read_unlock();
274 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 276 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
275 unlock_task_sighand(p, &flags); 277 unlock_task_sighand(p, &flags);
276 } 278 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e42bbd843ed1..a7310841c831 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -647,17 +647,11 @@ static int mounts_release(struct inode *inode, struct file *file)
647static unsigned mounts_poll(struct file *file, poll_table *wait) 647static unsigned mounts_poll(struct file *file, poll_table *wait)
648{ 648{
649 struct proc_mounts *p = file->private_data; 649 struct proc_mounts *p = file->private_data;
650 struct mnt_namespace *ns = p->ns;
651 unsigned res = POLLIN | POLLRDNORM; 650 unsigned res = POLLIN | POLLRDNORM;
652 651
653 poll_wait(file, &ns->poll, wait); 652 poll_wait(file, &p->ns->poll, wait);
654 653 if (mnt_had_events(p))
655 spin_lock(&vfsmount_lock);
656 if (p->event != ns->event) {
657 p->event = ns->event;
658 res |= POLLERR | POLLPRI; 654 res |= POLLERR | POLLPRI;
659 }
660 spin_unlock(&vfsmount_lock);
661 655
662 return res; 656 return res;
663} 657}
@@ -1095,8 +1089,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1095 if (!capable(CAP_AUDIT_CONTROL)) 1089 if (!capable(CAP_AUDIT_CONTROL))
1096 return -EPERM; 1090 return -EPERM;
1097 1091
1098 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) 1092 rcu_read_lock();
1093 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1094 rcu_read_unlock();
1099 return -EPERM; 1095 return -EPERM;
1096 }
1097 rcu_read_unlock();
1100 1098
1101 if (count >= PAGE_SIZE) 1099 if (count >= PAGE_SIZE)
1102 count = PAGE_SIZE - 1; 1100 count = PAGE_SIZE - 1;
@@ -2369,16 +2367,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2369{ 2367{
2370 struct pid_namespace *ns = dentry->d_sb->s_fs_info; 2368 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2371 pid_t tgid = task_tgid_nr_ns(current, ns); 2369 pid_t tgid = task_tgid_nr_ns(current, ns);
2372 char tmp[PROC_NUMBUF]; 2370 char *name = ERR_PTR(-ENOENT);
2373 if (!tgid) 2371 if (tgid) {
2374 return ERR_PTR(-ENOENT); 2372 name = __getname();
2375 sprintf(tmp, "%d", task_tgid_nr_ns(current, ns)); 2373 if (!name)
2376 return ERR_PTR(vfs_follow_link(nd,tmp)); 2374 name = ERR_PTR(-ENOMEM);
2375 else
2376 sprintf(name, "%d", tgid);
2377 }
2378 nd_set_link(nd, name);
2379 return NULL;
2380}
2381
2382static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2383 void *cookie)
2384{
2385 char *s = nd_get_link(nd);
2386 if (!IS_ERR(s))
2387 __putname(s);
2377} 2388}
2378 2389
2379static const struct inode_operations proc_self_inode_operations = { 2390static const struct inode_operations proc_self_inode_operations = {
2380 .readlink = proc_self_readlink, 2391 .readlink = proc_self_readlink,
2381 .follow_link = proc_self_follow_link, 2392 .follow_link = proc_self_follow_link,
2393 .put_link = proc_self_put_link,
2382}; 2394};
2383 2395
2384/* 2396/*
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 480cb1065eec..9580abeadeb3 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -662,6 +662,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
662 } 662 }
663 return ent; 663 return ent;
664} 664}
665EXPORT_SYMBOL(proc_symlink);
665 666
666struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, 667struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
667 struct proc_dir_entry *parent) 668 struct proc_dir_entry *parent)
@@ -700,6 +701,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
700{ 701{
701 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); 702 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
702} 703}
704EXPORT_SYMBOL(proc_mkdir);
703 705
704struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, 706struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
705 struct proc_dir_entry *parent) 707 struct proc_dir_entry *parent)
@@ -728,6 +730,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
728 } 730 }
729 return ent; 731 return ent;
730} 732}
733EXPORT_SYMBOL(create_proc_entry);
731 734
732struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, 735struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
733 struct proc_dir_entry *parent, 736 struct proc_dir_entry *parent,
@@ -762,6 +765,7 @@ out_free:
762out: 765out:
763 return NULL; 766 return NULL;
764} 767}
768EXPORT_SYMBOL(proc_create_data);
765 769
766static void free_proc_entry(struct proc_dir_entry *de) 770static void free_proc_entry(struct proc_dir_entry *de)
767{ 771{
@@ -853,3 +857,4 @@ continue_removing:
853 de->parent->name, de->name, de->subdir->name); 857 de->parent->name, de->name, de->subdir->name);
854 pde_put(de); 858 pde_put(de);
855} 859}
860EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca78346d3f0..cfe90a48a6e8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,37 +12,37 @@
12#include <linux/poll.h> 12#include <linux/poll.h>
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/syslog.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io.h> 18#include <asm/io.h>
18 19
19extern wait_queue_head_t log_wait; 20extern wait_queue_head_t log_wait;
20 21
21extern int do_syslog(int type, char __user *bug, int count);
22
23static int kmsg_open(struct inode * inode, struct file * file) 22static int kmsg_open(struct inode * inode, struct file * file)
24{ 23{
25 return do_syslog(1,NULL,0); 24 return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
26} 25}
27 26
28static int kmsg_release(struct inode * inode, struct file * file) 27static int kmsg_release(struct inode * inode, struct file * file)
29{ 28{
30 (void) do_syslog(0,NULL,0); 29 (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
31 return 0; 30 return 0;
32} 31}
33 32
34static ssize_t kmsg_read(struct file *file, char __user *buf, 33static ssize_t kmsg_read(struct file *file, char __user *buf,
35 size_t count, loff_t *ppos) 34 size_t count, loff_t *ppos)
36{ 35{
37 if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) 36 if ((file->f_flags & O_NONBLOCK) &&
37 !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
38 return -EAGAIN; 38 return -EAGAIN;
39 return do_syslog(2, buf, count); 39 return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
40} 40}
41 41
42static unsigned int kmsg_poll(struct file *file, poll_table *wait) 42static unsigned int kmsg_poll(struct file *file, poll_table *wait)
43{ 43{
44 poll_wait(file, &log_wait, wait); 44 poll_wait(file, &log_wait, wait);
45 if (do_syslog(9, NULL, 0)) 45 if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
46 return POLLIN | POLLRDNORM; 46 return POLLIN | POLLRDNORM;
47 return 0; 47 return 0;
48} 48}
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 123257bb356b..f8650dce74fb 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -10,16 +10,19 @@
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/stat.h> 11#include <linux/stat.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/of.h>
14#include <linux/module.h>
13#include <asm/prom.h> 15#include <asm/prom.h>
14#include <asm/uaccess.h> 16#include <asm/uaccess.h>
15#include "internal.h" 17#include "internal.h"
16 18
17#ifndef HAVE_ARCH_DEVTREE_FIXUPS
18static inline void set_node_proc_entry(struct device_node *np, 19static inline void set_node_proc_entry(struct device_node *np,
19 struct proc_dir_entry *de) 20 struct proc_dir_entry *de)
20{ 21{
21} 22#ifdef HAVE_ARCH_DEVTREE_FIXUPS
23 np->pde = de;
22#endif 24#endif
25}
23 26
24static struct proc_dir_entry *proc_device_tree; 27static struct proc_dir_entry *proc_device_tree;
25 28
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b080b791d9e3..757c069f2a65 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -220,9 +220,3 @@ void pid_ns_release_proc(struct pid_namespace *ns)
220{ 220{
221 mntput(ns->proc_mnt); 221 mntput(ns->proc_mnt);
222} 222}
223
224EXPORT_SYMBOL(proc_symlink);
225EXPORT_SYMBOL(proc_mkdir);
226EXPORT_SYMBOL(create_proc_entry);
227EXPORT_SYMBOL(proc_create_data);
228EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9087b10209e6..0d651f980a8d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1497,9 +1497,11 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1497 1497
1498 args.objectid = key->on_disk_key.k_objectid; 1498 args.objectid = key->on_disk_key.k_objectid;
1499 args.dirid = key->on_disk_key.k_dir_id; 1499 args.dirid = key->on_disk_key.k_dir_id;
1500 reiserfs_write_unlock(s);
1500 inode = iget5_locked(s, key->on_disk_key.k_objectid, 1501 inode = iget5_locked(s, key->on_disk_key.k_objectid,
1501 reiserfs_find_actor, reiserfs_init_locked_inode, 1502 reiserfs_find_actor, reiserfs_init_locked_inode,
1502 (void *)(&args)); 1503 (void *)(&args));
1504 reiserfs_write_lock(s);
1503 if (!inode) 1505 if (!inode)
1504 return ERR_PTR(-ENOMEM); 1506 return ERR_PTR(-ENOMEM);
1505 1507
@@ -1613,7 +1615,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1613** to properly mark inodes for datasync and such, but only actually 1615** to properly mark inodes for datasync and such, but only actually
1614** does something when called for a synchronous update. 1616** does something when called for a synchronous update.
1615*/ 1617*/
1616int reiserfs_write_inode(struct inode *inode, int do_sync) 1618int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1617{ 1619{
1618 struct reiserfs_transaction_handle th; 1620 struct reiserfs_transaction_handle th;
1619 int jbegin_count = 1; 1621 int jbegin_count = 1;
@@ -1625,7 +1627,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
1625 ** inode needs to reach disk for safety, and they can safely be 1627 ** inode needs to reach disk for safety, and they can safely be
1626 ** ignored because the altered inode has already been logged. 1628 ** ignored because the altered inode has already been logged.
1627 */ 1629 */
1628 if (do_sync && !(current->flags & PF_MEMALLOC)) { 1630 if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
1629 reiserfs_write_lock(inode->i_sb); 1631 reiserfs_write_lock(inode->i_sb);
1630 if (!journal_begin(&th, inode->i_sb, jbegin_count)) { 1632 if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1631 reiserfs_update_sd(&th, inode); 1633 reiserfs_update_sd(&th, inode);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eae7d9dbf3ff..5afd554efad3 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -674,7 +674,6 @@ struct list_head *seq_list_start(struct list_head *head, loff_t pos)
674 674
675 return NULL; 675 return NULL;
676} 676}
677
678EXPORT_SYMBOL(seq_list_start); 677EXPORT_SYMBOL(seq_list_start);
679 678
680struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) 679struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
@@ -684,7 +683,6 @@ struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
684 683
685 return seq_list_start(head, pos - 1); 684 return seq_list_start(head, pos - 1);
686} 685}
687
688EXPORT_SYMBOL(seq_list_start_head); 686EXPORT_SYMBOL(seq_list_start_head);
689 687
690struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) 688struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
@@ -695,5 +693,131 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
695 ++*ppos; 693 ++*ppos;
696 return lh == head ? NULL : lh; 694 return lh == head ? NULL : lh;
697} 695}
698
699EXPORT_SYMBOL(seq_list_next); 696EXPORT_SYMBOL(seq_list_next);
697
698/**
699 * seq_hlist_start - start an iteration of a hlist
700 * @head: the head of the hlist
701 * @pos: the start position of the sequence
702 *
703 * Called at seq_file->op->start().
704 */
705struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
706{
707 struct hlist_node *node;
708
709 hlist_for_each(node, head)
710 if (pos-- == 0)
711 return node;
712 return NULL;
713}
714EXPORT_SYMBOL(seq_hlist_start);
715
716/**
717 * seq_hlist_start_head - start an iteration of a hlist
718 * @head: the head of the hlist
719 * @pos: the start position of the sequence
720 *
721 * Called at seq_file->op->start(). Call this function if you want to
722 * print a header at the top of the output.
723 */
724struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
725{
726 if (!pos)
727 return SEQ_START_TOKEN;
728
729 return seq_hlist_start(head, pos - 1);
730}
731EXPORT_SYMBOL(seq_hlist_start_head);
732
733/**
734 * seq_hlist_next - move to the next position of the hlist
735 * @v: the current iterator
736 * @head: the head of the hlist
737 * @pos: the current posision
738 *
739 * Called at seq_file->op->next().
740 */
741struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
742 loff_t *ppos)
743{
744 struct hlist_node *node = v;
745
746 ++*ppos;
747 if (v == SEQ_START_TOKEN)
748 return head->first;
749 else
750 return node->next;
751}
752EXPORT_SYMBOL(seq_hlist_next);
753
754/**
755 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
756 * @head: the head of the hlist
757 * @pos: the start position of the sequence
758 *
759 * Called at seq_file->op->start().
760 *
761 * This list-traversal primitive may safely run concurrently with
762 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
763 * as long as the traversal is guarded by rcu_read_lock().
764 */
765struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
766 loff_t pos)
767{
768 struct hlist_node *node;
769
770 __hlist_for_each_rcu(node, head)
771 if (pos-- == 0)
772 return node;
773 return NULL;
774}
775EXPORT_SYMBOL(seq_hlist_start_rcu);
776
777/**
778 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
779 * @head: the head of the hlist
780 * @pos: the start position of the sequence
781 *
782 * Called at seq_file->op->start(). Call this function if you want to
783 * print a header at the top of the output.
784 *
785 * This list-traversal primitive may safely run concurrently with
786 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
787 * as long as the traversal is guarded by rcu_read_lock().
788 */
789struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
790 loff_t pos)
791{
792 if (!pos)
793 return SEQ_START_TOKEN;
794
795 return seq_hlist_start_rcu(head, pos - 1);
796}
797EXPORT_SYMBOL(seq_hlist_start_head_rcu);
798
799/**
800 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
801 * @v: the current iterator
802 * @head: the head of the hlist
803 * @pos: the current posision
804 *
805 * Called at seq_file->op->next().
806 *
807 * This list-traversal primitive may safely run concurrently with
808 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
809 * as long as the traversal is guarded by rcu_read_lock().
810 */
811struct hlist_node *seq_hlist_next_rcu(void *v,
812 struct hlist_head *head,
813 loff_t *ppos)
814{
815 struct hlist_node *node = v;
816
817 ++*ppos;
818 if (v == SEQ_START_TOKEN)
819 return rcu_dereference(head->first);
820 else
821 return rcu_dereference(node->next);
822}
823EXPORT_SYMBOL(seq_hlist_next_rcu);
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 70e3244fa30f..df8a19ef870d 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2a7960310349..1cb0d81b164b 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -29,15 +29,14 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/vfs.h> 30#include <linux/vfs.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/mutex.h>
33#include <linux/string.h> 32#include <linux/string.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35#include <linux/zlib.h>
36 34
37#include "squashfs_fs.h" 35#include "squashfs_fs.h"
38#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
39#include "squashfs_fs_i.h" 37#include "squashfs_fs_i.h"
40#include "squashfs.h" 38#include "squashfs.h"
39#include "decompressor.h"
41 40
42/* 41/*
43 * Read the metadata block length, this is stored in the first two 42 * Read the metadata block length, this is stored in the first two
@@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
153 } 152 }
154 153
155 if (compressed) { 154 if (compressed) {
156 int zlib_err = 0, zlib_init = 0; 155 length = squashfs_decompress(msblk, buffer, bh, b, offset,
157 156 length, srclength, pages);
158 /* 157 if (length < 0)
159 * Uncompress block. 158 goto read_failure;
160 */
161
162 mutex_lock(&msblk->read_data_mutex);
163
164 msblk->stream.avail_out = 0;
165 msblk->stream.avail_in = 0;
166
167 bytes = length;
168 do {
169 if (msblk->stream.avail_in == 0 && k < b) {
170 avail = min(bytes, msblk->devblksize - offset);
171 bytes -= avail;
172 wait_on_buffer(bh[k]);
173 if (!buffer_uptodate(bh[k]))
174 goto release_mutex;
175
176 if (avail == 0) {
177 offset = 0;
178 put_bh(bh[k++]);
179 continue;
180 }
181
182 msblk->stream.next_in = bh[k]->b_data + offset;
183 msblk->stream.avail_in = avail;
184 offset = 0;
185 }
186
187 if (msblk->stream.avail_out == 0 && page < pages) {
188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 }
191
192 if (!zlib_init) {
193 zlib_err = zlib_inflateInit(&msblk->stream);
194 if (zlib_err != Z_OK) {
195 ERROR("zlib_inflateInit returned"
196 " unexpected result 0x%x,"
197 " srclength %d\n", zlib_err,
198 srclength);
199 goto release_mutex;
200 }
201 zlib_init = 1;
202 }
203
204 zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
205
206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK);
209
210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate error, data probably corrupt\n");
212 goto release_mutex;
213 }
214
215 zlib_err = zlib_inflateEnd(&msblk->stream);
216 if (zlib_err != Z_OK) {
217 ERROR("zlib_inflate error, data probably corrupt\n");
218 goto release_mutex;
219 }
220 length = msblk->stream.total_out;
221 mutex_unlock(&msblk->read_data_mutex);
222 } else { 159 } else {
223 /* 160 /*
224 * Block is uncompressed. 161 * Block is uncompressed.
@@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
255 kfree(bh); 192 kfree(bh);
256 return length; 193 return length;
257 194
258release_mutex:
259 mutex_unlock(&msblk->read_data_mutex);
260
261block_release: 195block_release:
262 for (; k < b; k++) 196 for (; k < b; k++)
263 put_bh(bh[k]); 197 put_bh(bh[k]);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 40c98fa6b5d6..57314bee9059 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -51,7 +51,6 @@
51#include <linux/sched.h> 51#include <linux/sched.h>
52#include <linux/spinlock.h> 52#include <linux/spinlock.h>
53#include <linux/wait.h> 53#include <linux/wait.h>
54#include <linux/zlib.h>
55#include <linux/pagemap.h> 54#include <linux/pagemap.h>
56 55
57#include "squashfs_fs.h" 56#include "squashfs_fs.h"
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
new file mode 100644
index 000000000000..157478da6ac9
--- /dev/null
+++ b/fs/squashfs/decompressor.c
@@ -0,0 +1,68 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * decompressor.c
22 */
23
24#include <linux/types.h>
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27
28#include "squashfs_fs.h"
29#include "squashfs_fs_sb.h"
30#include "squashfs_fs_i.h"
31#include "decompressor.h"
32#include "squashfs.h"
33
34/*
35 * This file (and decompressor.h) implements a decompressor framework for
36 * Squashfs, allowing multiple decompressors to be easily supported
37 */
38
39static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
41};
42
43static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
45};
46
47static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
48 NULL, NULL, NULL, 0, "unknown", 0
49};
50
51static const struct squashfs_decompressor *decompressor[] = {
52 &squashfs_zlib_comp_ops,
53 &squashfs_lzma_unsupported_comp_ops,
54 &squashfs_lzo_unsupported_comp_ops,
55 &squashfs_unknown_comp_ops
56};
57
58
59const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
60{
61 int i;
62
63 for (i = 0; decompressor[i]->id; i++)
64 if (id == decompressor[i]->id)
65 break;
66
67 return decompressor[i];
68}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
new file mode 100644
index 000000000000..7425f80783f6
--- /dev/null
+++ b/fs/squashfs/decompressor.h
@@ -0,0 +1,55 @@
1#ifndef DECOMPRESSOR_H
2#define DECOMPRESSOR_H
3/*
4 * Squashfs - a compressed read only filesystem for Linux
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * decompressor.h
24 */
25
26struct squashfs_decompressor {
27 void *(*init)(struct squashfs_sb_info *);
28 void (*free)(void *);
29 int (*decompress)(struct squashfs_sb_info *, void **,
30 struct buffer_head **, int, int, int, int, int);
31 int id;
32 char *name;
33 int supported;
34};
35
36static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
37{
38 return msblk->decompressor->init(msblk);
39}
40
41static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
42 void *s)
43{
44 if (msblk->decompressor)
45 msblk->decompressor->free(s);
46}
47
48static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
49 void **buffer, struct buffer_head **bh, int b, int offset, int length,
50 int srclength, int pages)
51{
52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
53 length, srclength, pages);
54}
55#endif
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 566b0eaed868..12b933ac6585 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -30,7 +30,6 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/zlib.h>
34 33
35#include "squashfs_fs.h" 34#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h" 35#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 2b1b8fe5e037..7f93d5a9ee05 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -39,7 +39,6 @@
39#include <linux/vfs.h> 39#include <linux/vfs.h>
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h>
43#include <linux/slab.h> 42#include <linux/slab.h>
44 43
45#include "squashfs_fs.h" 44#include "squashfs_fs.h"
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 717767d831df..a25c5060bdcb 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,7 +47,6 @@
47#include <linux/string.h> 47#include <linux/string.h>
48#include <linux/pagemap.h> 48#include <linux/pagemap.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/zlib.h>
51 50
52#include "squashfs_fs.h" 51#include "squashfs_fs.h"
53#include "squashfs_fs_sb.h" 52#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index b5a2c15bbbc7..7c90bbd6879d 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -36,7 +36,6 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/vfs.h> 37#include <linux/vfs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/zlib.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 3795b837ba28..b7f64bcd2b70 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -34,7 +34,6 @@
34#include <linux/fs.h> 34#include <linux/fs.h>
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/zlib.h>
38 37
39#include "squashfs_fs.h" 38#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 39#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 9101dbde39ec..49daaf669e41 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,7 +40,6 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/zlib.h>
44 43
45#include "squashfs_fs.h" 44#include "squashfs_fs.h"
46#include "squashfs_fs_sb.h" 45#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 9e398653b22b..5266bd8ad932 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,7 +57,6 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/zlib.h>
61 60
62#include "squashfs_fs.h" 61#include "squashfs_fs.h"
63#include "squashfs_fs_sb.h" 62#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 0e9feb6adf7e..fe2587af5512 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
51 u64, int); 51 u64, int);
52extern int squashfs_read_table(struct super_block *, void *, u64, int); 52extern int squashfs_read_table(struct super_block *, void *, u64, int);
53 53
54/* decompressor.c */
55extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
56
54/* export.c */ 57/* export.c */
55extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, 58extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
56 unsigned int); 59 unsigned int);
@@ -71,7 +74,7 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
71extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
72 75
73/* 76/*
74 * Inodes and files operations 77 * Inodes, files and decompressor operations
75 */ 78 */
76 79
77/* dir.c */ 80/* dir.c */
@@ -88,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops;
88 91
89/* symlink.c */ 92/* symlink.c */
90extern const struct address_space_operations squashfs_symlink_aops; 93extern const struct address_space_operations squashfs_symlink_aops;
94
95/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 283daafc568e..79024245ea00 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,8 +183,6 @@
183#define SQUASHFS_MAX_FILE_SIZE (1LL << \ 183#define SQUASHFS_MAX_FILE_SIZE (1LL << \
184 (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) 184 (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
185 185
186#define SQUASHFS_MARKER_BYTE 0xff
187
188/* meta index cache */ 186/* meta index cache */
189#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) 187#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
190#define SQUASHFS_META_ENTRIES 127 188#define SQUASHFS_META_ENTRIES 127
@@ -211,7 +209,9 @@ struct meta_index {
211/* 209/*
212 * definitions for structures on disk 210 * definitions for structures on disk
213 */ 211 */
214#define ZLIB_COMPRESSION 1 212#define ZLIB_COMPRESSION 1
213#define LZMA_COMPRESSION 2
214#define LZO_COMPRESSION 3
215 215
216struct squashfs_super_block { 216struct squashfs_super_block {
217 __le32 s_magic; 217 __le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index c8c65614dd1c..2e77dc547e25 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -52,25 +52,25 @@ struct squashfs_cache_entry {
52}; 52};
53 53
54struct squashfs_sb_info { 54struct squashfs_sb_info {
55 int devblksize; 55 const struct squashfs_decompressor *decompressor;
56 int devblksize_log2; 56 int devblksize;
57 struct squashfs_cache *block_cache; 57 int devblksize_log2;
58 struct squashfs_cache *fragment_cache; 58 struct squashfs_cache *block_cache;
59 struct squashfs_cache *read_page; 59 struct squashfs_cache *fragment_cache;
60 int next_meta_index; 60 struct squashfs_cache *read_page;
61 __le64 *id_table; 61 int next_meta_index;
62 __le64 *fragment_index; 62 __le64 *id_table;
63 unsigned int *fragment_index_2; 63 __le64 *fragment_index;
64 struct mutex read_data_mutex; 64 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 65 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 66 struct meta_index *meta_index;
67 z_stream stream; 67 void *stream;
68 __le64 *inode_lookup_table; 68 __le64 *inode_lookup_table;
69 u64 inode_table; 69 u64 inode_table;
70 u64 directory_table; 70 u64 directory_table;
71 unsigned int block_size; 71 unsigned int block_size;
72 unsigned short block_log; 72 unsigned short block_log;
73 long long bytes_used; 73 long long bytes_used;
74 unsigned int inodes; 74 unsigned int inodes;
75}; 75};
76#endif 76#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6c197ef53add..3550aec2f655 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,34 +35,41 @@
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/zlib.h>
39#include <linux/magic.h> 38#include <linux/magic.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
44#include "squashfs.h" 43#include "squashfs.h"
44#include "decompressor.h"
45 45
46static struct file_system_type squashfs_fs_type; 46static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 47static const struct super_operations squashfs_super_ops;
48 48
49static int supported_squashfs_filesystem(short major, short minor, short comp) 49static const struct squashfs_decompressor *supported_squashfs_filesystem(short
50 major, short minor, short id)
50{ 51{
52 const struct squashfs_decompressor *decompressor;
53
51 if (major < SQUASHFS_MAJOR) { 54 if (major < SQUASHFS_MAJOR) {
52 ERROR("Major/Minor mismatch, older Squashfs %d.%d " 55 ERROR("Major/Minor mismatch, older Squashfs %d.%d "
53 "filesystems are unsupported\n", major, minor); 56 "filesystems are unsupported\n", major, minor);
54 return -EINVAL; 57 return NULL;
55 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { 58 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
56 ERROR("Major/Minor mismatch, trying to mount newer " 59 ERROR("Major/Minor mismatch, trying to mount newer "
57 "%d.%d filesystem\n", major, minor); 60 "%d.%d filesystem\n", major, minor);
58 ERROR("Please update your kernel\n"); 61 ERROR("Please update your kernel\n");
59 return -EINVAL; 62 return NULL;
60 } 63 }
61 64
62 if (comp != ZLIB_COMPRESSION) 65 decompressor = squashfs_lookup_decompressor(id);
63 return -EINVAL; 66 if (!decompressor->supported) {
67 ERROR("Filesystem uses \"%s\" compression. This is not "
68 "supported\n", decompressor->name);
69 return NULL;
70 }
64 71
65 return 0; 72 return decompressor;
66} 73}
67 74
68 75
@@ -87,13 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
87 } 94 }
88 msblk = sb->s_fs_info; 95 msblk = sb->s_fs_info;
89 96
90 msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
91 GFP_KERNEL);
92 if (msblk->stream.workspace == NULL) {
93 ERROR("Failed to allocate zlib workspace\n");
94 goto failure;
95 }
96
97 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); 97 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
98 if (sblk == NULL) { 98 if (sblk == NULL) {
99 ERROR("Failed to allocate squashfs_super_block\n"); 99 ERROR("Failed to allocate squashfs_super_block\n");
@@ -120,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
120 goto failed_mount; 120 goto failed_mount;
121 } 121 }
122 122
123 err = -EINVAL;
124
123 /* Check it is a SQUASHFS superblock */ 125 /* Check it is a SQUASHFS superblock */
124 sb->s_magic = le32_to_cpu(sblk->s_magic); 126 sb->s_magic = le32_to_cpu(sblk->s_magic);
125 if (sb->s_magic != SQUASHFS_MAGIC) { 127 if (sb->s_magic != SQUASHFS_MAGIC) {
126 if (!silent) 128 if (!silent)
127 ERROR("Can't find a SQUASHFS superblock on %s\n", 129 ERROR("Can't find a SQUASHFS superblock on %s\n",
128 bdevname(sb->s_bdev, b)); 130 bdevname(sb->s_bdev, b));
129 err = -EINVAL;
130 goto failed_mount; 131 goto failed_mount;
131 } 132 }
132 133
133 /* Check the MAJOR & MINOR versions and compression type */ 134 /* Check the MAJOR & MINOR versions and lookup compression type */
134 err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), 135 msblk->decompressor = supported_squashfs_filesystem(
136 le16_to_cpu(sblk->s_major),
135 le16_to_cpu(sblk->s_minor), 137 le16_to_cpu(sblk->s_minor),
136 le16_to_cpu(sblk->compression)); 138 le16_to_cpu(sblk->compression));
137 if (err < 0) 139 if (msblk->decompressor == NULL)
138 goto failed_mount; 140 goto failed_mount;
139 141
140 err = -EINVAL;
141
142 /* 142 /*
143 * Check if there's xattrs in the filesystem. These are not 143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored. 144 * supported in this version, so warn that they will be ignored.
@@ -205,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
205 205
206 err = -ENOMEM; 206 err = -ENOMEM;
207 207
208 msblk->stream = squashfs_decompressor_init(msblk);
209 if (msblk->stream == NULL)
210 goto failed_mount;
211
208 msblk->block_cache = squashfs_cache_init("metadata", 212 msblk->block_cache = squashfs_cache_init("metadata",
209 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); 213 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
210 if (msblk->block_cache == NULL) 214 if (msblk->block_cache == NULL)
@@ -292,17 +296,16 @@ failed_mount:
292 squashfs_cache_delete(msblk->block_cache); 296 squashfs_cache_delete(msblk->block_cache);
293 squashfs_cache_delete(msblk->fragment_cache); 297 squashfs_cache_delete(msblk->fragment_cache);
294 squashfs_cache_delete(msblk->read_page); 298 squashfs_cache_delete(msblk->read_page);
299 squashfs_decompressor_free(msblk, msblk->stream);
295 kfree(msblk->inode_lookup_table); 300 kfree(msblk->inode_lookup_table);
296 kfree(msblk->fragment_index); 301 kfree(msblk->fragment_index);
297 kfree(msblk->id_table); 302 kfree(msblk->id_table);
298 kfree(msblk->stream.workspace);
299 kfree(sb->s_fs_info); 303 kfree(sb->s_fs_info);
300 sb->s_fs_info = NULL; 304 sb->s_fs_info = NULL;
301 kfree(sblk); 305 kfree(sblk);
302 return err; 306 return err;
303 307
304failure: 308failure:
305 kfree(msblk->stream.workspace);
306 kfree(sb->s_fs_info); 309 kfree(sb->s_fs_info);
307 sb->s_fs_info = NULL; 310 sb->s_fs_info = NULL;
308 return -ENOMEM; 311 return -ENOMEM;
@@ -346,10 +349,10 @@ static void squashfs_put_super(struct super_block *sb)
346 squashfs_cache_delete(sbi->block_cache); 349 squashfs_cache_delete(sbi->block_cache);
347 squashfs_cache_delete(sbi->fragment_cache); 350 squashfs_cache_delete(sbi->fragment_cache);
348 squashfs_cache_delete(sbi->read_page); 351 squashfs_cache_delete(sbi->read_page);
352 squashfs_decompressor_free(sbi, sbi->stream);
349 kfree(sbi->id_table); 353 kfree(sbi->id_table);
350 kfree(sbi->fragment_index); 354 kfree(sbi->fragment_index);
351 kfree(sbi->meta_index); 355 kfree(sbi->meta_index);
352 kfree(sbi->stream.workspace);
353 kfree(sb->s_fs_info); 356 kfree(sb->s_fs_info);
354 sb->s_fs_info = NULL; 357 sb->s_fs_info = NULL;
355 } 358 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 83d87880aac8..e80be2022a7f 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -36,7 +36,6 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/pagemap.h> 38#include <linux/pagemap.h>
39#include <linux/zlib.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 000000000000..4dd70e04333b
--- /dev/null
+++ b/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,150 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * zlib_wrapper.c
22 */
23
24
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27#include <linux/zlib.h>
28
29#include "squashfs_fs.h"
30#include "squashfs_fs_sb.h"
31#include "squashfs_fs_i.h"
32#include "squashfs.h"
33#include "decompressor.h"
34
35static void *zlib_init(struct squashfs_sb_info *dummy)
36{
37 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
38 if (stream == NULL)
39 goto failed;
40 stream->workspace = kmalloc(zlib_inflate_workspacesize(),
41 GFP_KERNEL);
42 if (stream->workspace == NULL)
43 goto failed;
44
45 return stream;
46
47failed:
48 ERROR("Failed to allocate zlib workspace\n");
49 kfree(stream);
50 return NULL;
51}
52
53
54static void zlib_free(void *strm)
55{
56 z_stream *stream = strm;
57
58 if (stream)
59 kfree(stream->workspace);
60 kfree(stream);
61}
62
63
64static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
65 struct buffer_head **bh, int b, int offset, int length, int srclength,
66 int pages)
67{
68 int zlib_err = 0, zlib_init = 0;
69 int avail, bytes, k = 0, page = 0;
70 z_stream *stream = msblk->stream;
71
72 mutex_lock(&msblk->read_data_mutex);
73
74 stream->avail_out = 0;
75 stream->avail_in = 0;
76
77 bytes = length;
78 do {
79 if (stream->avail_in == 0 && k < b) {
80 avail = min(bytes, msblk->devblksize - offset);
81 bytes -= avail;
82 wait_on_buffer(bh[k]);
83 if (!buffer_uptodate(bh[k]))
84 goto release_mutex;
85
86 if (avail == 0) {
87 offset = 0;
88 put_bh(bh[k++]);
89 continue;
90 }
91
92 stream->next_in = bh[k]->b_data + offset;
93 stream->avail_in = avail;
94 offset = 0;
95 }
96
97 if (stream->avail_out == 0 && page < pages) {
98 stream->next_out = buffer[page++];
99 stream->avail_out = PAGE_CACHE_SIZE;
100 }
101
102 if (!zlib_init) {
103 zlib_err = zlib_inflateInit(stream);
104 if (zlib_err != Z_OK) {
105 ERROR("zlib_inflateInit returned unexpected "
106 "result 0x%x, srclength %d\n",
107 zlib_err, srclength);
108 goto release_mutex;
109 }
110 zlib_init = 1;
111 }
112
113 zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
114
115 if (stream->avail_in == 0 && k < b)
116 put_bh(bh[k++]);
117 } while (zlib_err == Z_OK);
118
119 if (zlib_err != Z_STREAM_END) {
120 ERROR("zlib_inflate error, data probably corrupt\n");
121 goto release_mutex;
122 }
123
124 zlib_err = zlib_inflateEnd(stream);
125 if (zlib_err != Z_OK) {
126 ERROR("zlib_inflate error, data probably corrupt\n");
127 goto release_mutex;
128 }
129
130 mutex_unlock(&msblk->read_data_mutex);
131 return stream->total_out;
132
133release_mutex:
134 mutex_unlock(&msblk->read_data_mutex);
135
136 for (; k < b; k++)
137 put_bh(bh[k]);
138
139 return -EIO;
140}
141
142const struct squashfs_decompressor squashfs_zlib_comp_ops = {
143 .init = zlib_init,
144 .free = zlib_free,
145 .decompress = zlib_uncompress,
146 .id = ZLIB_COMPRESSION,
147 .name = "zlib",
148 .supported = 1
149};
150
diff --git a/fs/super.c b/fs/super.c
index aff046b0fe78..f35ac6022109 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -568,7 +568,7 @@ out:
568int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 568int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
569{ 569{
570 int retval; 570 int retval;
571 int remount_rw; 571 int remount_rw, remount_ro;
572 572
573 if (sb->s_frozen != SB_UNFROZEN) 573 if (sb->s_frozen != SB_UNFROZEN)
574 return -EBUSY; 574 return -EBUSY;
@@ -583,9 +583,12 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
583 shrink_dcache_sb(sb); 583 shrink_dcache_sb(sb);
584 sync_filesystem(sb); 584 sync_filesystem(sb);
585 585
586 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
587 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
588
586 /* If we are remounting RDONLY and current sb is read/write, 589 /* If we are remounting RDONLY and current sb is read/write,
587 make sure there are no rw files opened */ 590 make sure there are no rw files opened */
588 if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { 591 if (remount_ro) {
589 if (force) 592 if (force)
590 mark_files_ro(sb); 593 mark_files_ro(sb);
591 else if (!fs_may_remount_ro(sb)) 594 else if (!fs_may_remount_ro(sb))
@@ -594,7 +597,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
594 if (retval < 0 && retval != -ENOSYS) 597 if (retval < 0 && retval != -ENOSYS)
595 return -EBUSY; 598 return -EBUSY;
596 } 599 }
597 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
598 600
599 if (sb->s_op->remount_fs) { 601 if (sb->s_op->remount_fs) {
600 retval = sb->s_op->remount_fs(sb, &flags, data); 602 retval = sb->s_op->remount_fs(sb, &flags, data);
@@ -604,6 +606,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
604 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 606 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
605 if (remount_rw) 607 if (remount_rw)
606 vfs_dq_quota_on_remount(sb); 608 vfs_dq_quota_on_remount(sb);
609 /*
610 * Some filesystems modify their metadata via some other path than the
611 * bdev buffer cache (eg. use a private mapping, or directories in
612 * pagecache, etc). Also file data modifications go via their own
613 * mappings. So If we try to mount readonly then copy the filesystem
614 * from bdev, we could get stale data, so invalidate it to give a best
615 * effort at coherency.
616 */
617 if (remount_ro && sb->s_bdev)
618 invalidate_bdev(sb->s_bdev);
607 return 0; 619 return 0;
608} 620}
609 621
@@ -925,6 +937,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
925 if (!mnt) 937 if (!mnt)
926 goto out; 938 goto out;
927 939
940 if (flags & MS_KERNMOUNT)
941 mnt->mnt_flags = MNT_INTERNAL;
942
928 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { 943 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
929 secdata = alloc_secdata(); 944 secdata = alloc_secdata();
930 if (!secdata) 945 if (!secdata)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 220b758523ae..6a06a1d1ea7b 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -81,24 +81,23 @@ int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
81 if (!sd_attrs) 81 if (!sd_attrs)
82 return -ENOMEM; 82 return -ENOMEM;
83 sd->s_iattr = sd_attrs; 83 sd->s_iattr = sd_attrs;
84 } else { 84 }
85 /* attributes were changed at least once in past */ 85 /* attributes were changed at least once in past */
86 iattrs = &sd_attrs->ia_iattr; 86 iattrs = &sd_attrs->ia_iattr;
87 87
88 if (ia_valid & ATTR_UID) 88 if (ia_valid & ATTR_UID)
89 iattrs->ia_uid = iattr->ia_uid; 89 iattrs->ia_uid = iattr->ia_uid;
90 if (ia_valid & ATTR_GID) 90 if (ia_valid & ATTR_GID)
91 iattrs->ia_gid = iattr->ia_gid; 91 iattrs->ia_gid = iattr->ia_gid;
92 if (ia_valid & ATTR_ATIME) 92 if (ia_valid & ATTR_ATIME)
93 iattrs->ia_atime = iattr->ia_atime; 93 iattrs->ia_atime = iattr->ia_atime;
94 if (ia_valid & ATTR_MTIME) 94 if (ia_valid & ATTR_MTIME)
95 iattrs->ia_mtime = iattr->ia_mtime; 95 iattrs->ia_mtime = iattr->ia_mtime;
96 if (ia_valid & ATTR_CTIME) 96 if (ia_valid & ATTR_CTIME)
97 iattrs->ia_ctime = iattr->ia_ctime; 97 iattrs->ia_ctime = iattr->ia_ctime;
98 if (ia_valid & ATTR_MODE) { 98 if (ia_valid & ATTR_MODE) {
99 umode_t mode = iattr->ia_mode; 99 umode_t mode = iattr->ia_mode;
100 iattrs->ia_mode = sd->s_mode = mode; 100 iattrs->ia_mode = sd->s_mode = mode;
101 }
102 } 101 }
103 return 0; 102 return 0;
104} 103}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9824743832a7..4573734d723d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/vfs.h> 28#include <linux/vfs.h>
29#include <linux/writeback.h>
29#include <linux/namei.h> 30#include <linux/namei.h>
30#include <asm/byteorder.h> 31#include <asm/byteorder.h>
31#include "sysv.h" 32#include "sysv.h"
@@ -246,7 +247,7 @@ bad_inode:
246 return ERR_PTR(-EIO); 247 return ERR_PTR(-EIO);
247} 248}
248 249
249int sysv_write_inode(struct inode *inode, int wait) 250static int __sysv_write_inode(struct inode *inode, int wait)
250{ 251{
251 struct super_block * sb = inode->i_sb; 252 struct super_block * sb = inode->i_sb;
252 struct sysv_sb_info * sbi = SYSV_SB(sb); 253 struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait)
296 return 0; 297 return 0;
297} 298}
298 299
300int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
301{
302 return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
303}
304
299int sysv_sync_inode(struct inode *inode) 305int sysv_sync_inode(struct inode *inode)
300{ 306{
301 return sysv_write_inode(inode, 1); 307 return __sysv_write_inode(inode, 1);
302} 308}
303 309
304static void sysv_delete_inode(struct inode *inode) 310static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 53786eb5cf60..94cb9b4d76c2 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
142 142
143/* inode.c */ 143/* inode.c */
144extern struct inode *sysv_iget(struct super_block *, unsigned int); 144extern struct inode *sysv_iget(struct super_block *, unsigned int);
145extern int sysv_write_inode(struct inode *, int); 145extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
146extern int sysv_sync_inode(struct inode *); 146extern int sysv_sync_inode(struct inode *);
147extern void sysv_set_inode(struct inode *, dev_t); 147extern void sysv_set_inode(struct inode *, dev_t);
148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); 148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 552fb0111fff..401e503d44a1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1120 if (release) 1120 if (release)
1121 ubifs_release_budget(c, &ino_req); 1121 ubifs_release_budget(c, &ino_req);
1122 if (IS_SYNC(old_inode)) 1122 if (IS_SYNC(old_inode))
1123 err = old_inode->i_sb->s_op->write_inode(old_inode, 1); 1123 err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
1124 return err; 1124 return err;
1125 1125
1126out_cancel: 1126out_cancel:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 16a6444330ec..e26c02ab6cd5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1011,7 +1011,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1011 /* Is the page fully inside @i_size? */ 1011 /* Is the page fully inside @i_size? */
1012 if (page->index < end_index) { 1012 if (page->index < end_index) {
1013 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { 1013 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
1014 err = inode->i_sb->s_op->write_inode(inode, 1); 1014 err = inode->i_sb->s_op->write_inode(inode, NULL);
1015 if (err) 1015 if (err)
1016 goto out_unlock; 1016 goto out_unlock;
1017 /* 1017 /*
@@ -1039,7 +1039,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1039 kunmap_atomic(kaddr, KM_USER0); 1039 kunmap_atomic(kaddr, KM_USER0);
1040 1040
1041 if (i_size > synced_i_size) { 1041 if (i_size > synced_i_size) {
1042 err = inode->i_sb->s_op->write_inode(inode, 1); 1042 err = inode->i_sb->s_op->write_inode(inode, NULL);
1043 if (err) 1043 if (err)
1044 goto out_unlock; 1044 goto out_unlock;
1045 } 1045 }
@@ -1242,7 +1242,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1242 if (release) 1242 if (release)
1243 ubifs_release_budget(c, &req); 1243 ubifs_release_budget(c, &req);
1244 if (IS_SYNC(inode)) 1244 if (IS_SYNC(inode))
1245 err = inode->i_sb->s_op->write_inode(inode, 1); 1245 err = inode->i_sb->s_op->write_inode(inode, NULL);
1246 return err; 1246 return err;
1247 1247
1248out: 1248out:
@@ -1316,7 +1316,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1316 * the inode unless this is a 'datasync()' call. 1316 * the inode unless this is a 'datasync()' call.
1317 */ 1317 */
1318 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { 1318 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
1319 err = inode->i_sb->s_op->write_inode(inode, 1); 1319 err = inode->i_sb->s_op->write_inode(inode, NULL);
1320 if (err) 1320 if (err)
1321 return err; 1321 return err;
1322 } 1322 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 43f9d19a6f33..4d2f2157dd3f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode)
283/* 283/*
284 * Note, Linux write-back code calls this without 'i_mutex'. 284 * Note, Linux write-back code calls this without 'i_mutex'.
285 */ 285 */
286static int ubifs_write_inode(struct inode *inode, int wait) 286static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
287{ 287{
288 int err = 0; 288 int err = 0;
289 struct ubifs_info *c = inode->i_sb->s_fs_info; 289 struct ubifs_info *c = inode->i_sb->s_fs_info;
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 82372e332f08..b2d96f45c12b 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -547,7 +547,7 @@ static void udf_table_free_blocks(struct super_block *sb,
547 } 547 }
548 548
549 if (epos.offset + (2 * adsize) > sb->s_blocksize) { 549 if (epos.offset + (2 * adsize) > sb->s_blocksize) {
550 char *sptr, *dptr; 550 unsigned char *sptr, *dptr;
551 int loffset; 551 int loffset;
552 552
553 brelse(oepos.bh); 553 brelse(oepos.bh);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 61d9a76a3a69..f0f2a436251e 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,8 +45,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
45 int block, iblock; 45 int block, iblock;
46 loff_t nf_pos = (filp->f_pos - 1) << 2; 46 loff_t nf_pos = (filp->f_pos - 1) << 2;
47 int flen; 47 int flen;
48 char *fname = NULL; 48 unsigned char *fname = NULL;
49 char *nameptr; 49 unsigned char *nameptr;
50 uint16_t liu; 50 uint16_t liu;
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f90231eb2916..b02089247296 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1373,12 +1373,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 return mode; 1373 return mode;
1374} 1374}
1375 1375
1376int udf_write_inode(struct inode *inode, int sync) 1376int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1377{ 1377{
1378 int ret; 1378 int ret;
1379 1379
1380 lock_kernel(); 1380 lock_kernel();
1381 ret = udf_update_inode(inode, sync); 1381 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1382 unlock_kernel(); 1382 unlock_kernel();
1383 1383
1384 return ret; 1384 return ret;
@@ -1672,7 +1672,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 return -1; 1672 return -1;
1673 1673
1674 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1674 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1675 char *sptr, *dptr; 1675 unsigned char *sptr, *dptr;
1676 struct buffer_head *nbh; 1676 struct buffer_head *nbh;
1677 int err, loffset; 1677 int err, loffset;
1678 struct kernel_lb_addr obloc = epos->block; 1678 struct kernel_lb_addr obloc = epos->block;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index cd2115060fdc..7c56ff00cd53 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -34,8 +34,8 @@
34#include <linux/crc-itu-t.h> 34#include <linux/crc-itu-t.h>
35#include <linux/exportfs.h> 35#include <linux/exportfs.h>
36 36
37static inline int udf_match(int len1, const char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
38 const char *name2) 38 const unsigned char *name2)
39{ 39{
40 if (len1 != len2) 40 if (len1 != len2)
41 return 0; 41 return 0;
@@ -142,15 +142,15 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
142} 142}
143 143
144static struct fileIdentDesc *udf_find_entry(struct inode *dir, 144static struct fileIdentDesc *udf_find_entry(struct inode *dir,
145 struct qstr *child, 145 const struct qstr *child,
146 struct udf_fileident_bh *fibh, 146 struct udf_fileident_bh *fibh,
147 struct fileIdentDesc *cfi) 147 struct fileIdentDesc *cfi)
148{ 148{
149 struct fileIdentDesc *fi = NULL; 149 struct fileIdentDesc *fi = NULL;
150 loff_t f_pos; 150 loff_t f_pos;
151 int block, flen; 151 int block, flen;
152 char *fname = NULL; 152 unsigned char *fname = NULL;
153 char *nameptr; 153 unsigned char *nameptr;
154 uint8_t lfi; 154 uint8_t lfi;
155 uint16_t liu; 155 uint16_t liu;
156 loff_t size; 156 loff_t size;
@@ -308,7 +308,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
308{ 308{
309 struct super_block *sb = dir->i_sb; 309 struct super_block *sb = dir->i_sb;
310 struct fileIdentDesc *fi = NULL; 310 struct fileIdentDesc *fi = NULL;
311 char *name = NULL; 311 unsigned char *name = NULL;
312 int namelen; 312 int namelen;
313 loff_t f_pos; 313 loff_t f_pos;
314 loff_t size = udf_ext0_offset(dir) + dir->i_size; 314 loff_t size = udf_ext0_offset(dir) + dir->i_size;
@@ -885,16 +885,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
885{ 885{
886 struct inode *inode; 886 struct inode *inode;
887 struct pathComponent *pc; 887 struct pathComponent *pc;
888 char *compstart; 888 const char *compstart;
889 struct udf_fileident_bh fibh; 889 struct udf_fileident_bh fibh;
890 struct extent_position epos = {}; 890 struct extent_position epos = {};
891 int eoffset, elen = 0; 891 int eoffset, elen = 0;
892 struct fileIdentDesc *fi; 892 struct fileIdentDesc *fi;
893 struct fileIdentDesc cfi; 893 struct fileIdentDesc cfi;
894 char *ea; 894 uint8_t *ea;
895 int err; 895 int err;
896 int block; 896 int block;
897 char *name = NULL; 897 unsigned char *name = NULL;
898 int namelen; 898 int namelen;
899 struct buffer_head *bh; 899 struct buffer_head *bh;
900 struct udf_inode_info *iinfo; 900 struct udf_inode_info *iinfo;
@@ -970,7 +970,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
970 970
971 pc = (struct pathComponent *)(ea + elen); 971 pc = (struct pathComponent *)(ea + elen);
972 972
973 compstart = (char *)symname; 973 compstart = symname;
974 974
975 do { 975 do {
976 symname++; 976 symname++;
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index c3265e1385d4..852e91845688 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -32,12 +32,12 @@
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include "udf_i.h" 33#include "udf_i.h"
34 34
35static void udf_pc_to_char(struct super_block *sb, char *from, int fromlen, 35static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
36 char *to) 36 int fromlen, unsigned char *to)
37{ 37{
38 struct pathComponent *pc; 38 struct pathComponent *pc;
39 int elen = 0; 39 int elen = 0;
40 char *p = to; 40 unsigned char *p = to;
41 41
42 while (elen < fromlen) { 42 while (elen < fromlen) {
43 pc = (struct pathComponent *)(from + elen); 43 pc = (struct pathComponent *)(from + elen);
@@ -75,9 +75,9 @@ static int udf_symlink_filler(struct file *file, struct page *page)
75{ 75{
76 struct inode *inode = page->mapping->host; 76 struct inode *inode = page->mapping->host;
77 struct buffer_head *bh = NULL; 77 struct buffer_head *bh = NULL;
78 char *symlink; 78 unsigned char *symlink;
79 int err = -EIO; 79 int err = -EIO;
80 char *p = kmap(page); 80 unsigned char *p = kmap(page);
81 struct udf_inode_info *iinfo; 81 struct udf_inode_info *iinfo;
82 82
83 lock_kernel(); 83 lock_kernel();
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8d46f4294ee7..4223ac855da9 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -142,7 +142,7 @@ extern void udf_truncate(struct inode *);
142extern void udf_read_inode(struct inode *); 142extern void udf_read_inode(struct inode *);
143extern void udf_delete_inode(struct inode *); 143extern void udf_delete_inode(struct inode *);
144extern void udf_clear_inode(struct inode *); 144extern void udf_clear_inode(struct inode *);
145extern int udf_write_inode(struct inode *, int); 145extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *, 147extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t); 148 struct kernel_long_ad *, sector_t);
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 22af68f8b682..317a0d444f6b 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -31,7 +31,7 @@
31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller. 31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller.
32 */ 32 */
33static inline int ufs_match(struct super_block *sb, int len, 33static inline int ufs_match(struct super_block *sb, int len,
34 const char * const name, struct ufs_dir_entry * de) 34 const unsigned char *name, struct ufs_dir_entry *de)
35{ 35{
36 if (len != ufs_get_de_namlen(sb, de)) 36 if (len != ufs_get_de_namlen(sb, de))
37 return 0; 37 return 0;
@@ -70,7 +70,7 @@ static inline unsigned long ufs_dir_pages(struct inode *inode)
70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; 70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
71} 71}
72 72
73ino_t ufs_inode_by_name(struct inode *dir, struct qstr *qstr) 73ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
74{ 74{
75 ino_t res = 0; 75 ino_t res = 0;
76 struct ufs_dir_entry *de; 76 struct ufs_dir_entry *de;
@@ -249,11 +249,11 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
249 * (as a parameter - res_dir). Page is returned mapped and unlocked. 249 * (as a parameter - res_dir). Page is returned mapped and unlocked.
250 * Entry is guaranteed to be valid. 250 * Entry is guaranteed to be valid.
251 */ 251 */
252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct qstr *qstr, 252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
253 struct page **res_page) 253 struct page **res_page)
254{ 254{
255 struct super_block *sb = dir->i_sb; 255 struct super_block *sb = dir->i_sb;
256 const char *name = qstr->name; 256 const unsigned char *name = qstr->name;
257 int namelen = qstr->len; 257 int namelen = qstr->len;
258 unsigned reclen = UFS_DIR_REC_LEN(namelen); 258 unsigned reclen = UFS_DIR_REC_LEN(namelen);
259 unsigned long start, n; 259 unsigned long start, n;
@@ -313,7 +313,7 @@ found:
313int ufs_add_link(struct dentry *dentry, struct inode *inode) 313int ufs_add_link(struct dentry *dentry, struct inode *inode)
314{ 314{
315 struct inode *dir = dentry->d_parent->d_inode; 315 struct inode *dir = dentry->d_parent->d_inode;
316 const char *name = dentry->d_name.name; 316 const unsigned char *name = dentry->d_name.name;
317 int namelen = dentry->d_name.len; 317 int namelen = dentry->d_name.len;
318 struct super_block *sb = dir->i_sb; 318 struct super_block *sb = dir->i_sb;
319 unsigned reclen = UFS_DIR_REC_LEN(namelen); 319 unsigned reclen = UFS_DIR_REC_LEN(namelen);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7cf33379fd46..0a627e08610b 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h>
39 40
40#include "ufs_fs.h" 41#include "ufs_fs.h"
41#include "ufs.h" 42#include "ufs.h"
@@ -890,11 +891,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
890 return 0; 891 return 0;
891} 892}
892 893
893int ufs_write_inode (struct inode * inode, int wait) 894int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
894{ 895{
895 int ret; 896 int ret;
896 lock_kernel(); 897 lock_kernel();
897 ret = ufs_update_inode (inode, wait); 898 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
898 unlock_kernel(); 899 unlock_kernel();
899 return ret; 900 return ret;
900} 901}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 0b4c39bc0d9e..43f9f5d5670e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
86/* dir.c */ 86/* dir.c */
87extern const struct inode_operations ufs_dir_inode_operations; 87extern const struct inode_operations ufs_dir_inode_operations;
88extern int ufs_add_link (struct dentry *, struct inode *); 88extern int ufs_add_link (struct dentry *, struct inode *);
89extern ino_t ufs_inode_by_name(struct inode *, struct qstr *); 89extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
90extern int ufs_make_empty(struct inode *, struct inode *); 90extern int ufs_make_empty(struct inode *, struct inode *);
91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct qstr *, struct page **); 91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **);
92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); 92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
93extern int ufs_empty_dir (struct inode *); 93extern int ufs_empty_dir (struct inode *);
94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); 94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
@@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
106 106
107/* inode.c */ 107/* inode.c */
108extern struct inode *ufs_iget(struct super_block *, unsigned long); 108extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, int); 109extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 110extern int ufs_sync_inode (struct inode *);
111extern void ufs_delete_inode (struct inode *); 111extern void ufs_delete_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); 112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 56641fe52a23..5c5a366aa332 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,7 +16,7 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -funsigned-char 19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6
20 20
21XFS_LINUX := linux-2.6 21XFS_LINUX := linux-2.6
22 22
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 2d3f90afe5f1..bc7405585def 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -16,7 +16,6 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/vmalloc.h>
20#include <linux/highmem.h> 19#include <linux/highmem.h>
21#include <linux/swap.h> 20#include <linux/swap.h>
22#include <linux/blkdev.h> 21#include <linux/blkdev.h>
@@ -24,8 +23,25 @@
24#include "time.h" 23#include "time.h"
25#include "kmem.h" 24#include "kmem.h"
26 25
27#define MAX_VMALLOCS 6 26/*
28#define MAX_SLAB_SIZE 0x20000 27 * Greedy allocation. May fail and may return vmalloced memory.
28 *
29 * Must be freed using kmem_free_large.
30 */
31void *
32kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
33{
34 void *ptr;
35 size_t kmsize = maxsize;
36
37 while (!(ptr = kmem_zalloc_large(kmsize))) {
38 if ((kmsize >>= 1) <= minsize)
39 kmsize = minsize;
40 }
41 if (ptr)
42 *size = kmsize;
43 return ptr;
44}
29 45
30void * 46void *
31kmem_alloc(size_t size, unsigned int __nocast flags) 47kmem_alloc(size_t size, unsigned int __nocast flags)
@@ -34,19 +50,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
34 gfp_t lflags = kmem_flags_convert(flags); 50 gfp_t lflags = kmem_flags_convert(flags);
35 void *ptr; 51 void *ptr;
36 52
37#ifdef DEBUG
38 if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
39 printk(KERN_WARNING "Large %s attempt, size=%ld\n",
40 __func__, (long)size);
41 dump_stack();
42 }
43#endif
44
45 do { 53 do {
46 if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) 54 ptr = kmalloc(size, lflags);
47 ptr = kmalloc(size, lflags);
48 else
49 ptr = __vmalloc(size, lflags, PAGE_KERNEL);
50 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 55 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
51 return ptr; 56 return ptr;
52 if (!(++retries % 100)) 57 if (!(++retries % 100))
@@ -68,27 +73,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
68 return ptr; 73 return ptr;
69} 74}
70 75
71void *
72kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
73 unsigned int __nocast flags)
74{
75 void *ptr;
76 size_t kmsize = maxsize;
77 unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
78
79 while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
80 if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
81 break;
82 if ((kmsize >>= 1) <= minsize) {
83 kmsize = minsize;
84 kmflags = flags;
85 }
86 }
87 if (ptr)
88 *size = kmsize;
89 return ptr;
90}
91
92void 76void
93kmem_free(const void *ptr) 77kmem_free(const void *ptr)
94{ 78{
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 179cbd630f69..f7c8f7a9ea6d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/vmalloc.h>
24 25
25/* 26/*
26 * General memory allocation interfaces 27 * General memory allocation interfaces
@@ -30,7 +31,6 @@
30#define KM_NOSLEEP 0x0002u 31#define KM_NOSLEEP 0x0002u
31#define KM_NOFS 0x0004u 32#define KM_NOFS 0x0004u
32#define KM_MAYFAIL 0x0008u 33#define KM_MAYFAIL 0x0008u
33#define KM_LARGE 0x0010u
34 34
35/* 35/*
36 * We use a special process flag to avoid recursive callbacks into 36 * We use a special process flag to avoid recursive callbacks into
@@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
42{ 42{
43 gfp_t lflags; 43 gfp_t lflags;
44 44
45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); 45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
46 46
47 if (flags & KM_NOSLEEP) { 47 if (flags & KM_NOSLEEP) {
48 lflags = GFP_ATOMIC | __GFP_NOWARN; 48 lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags)
56 56
57extern void *kmem_alloc(size_t, unsigned int __nocast); 57extern void *kmem_alloc(size_t, unsigned int __nocast);
58extern void *kmem_zalloc(size_t, unsigned int __nocast); 58extern void *kmem_zalloc(size_t, unsigned int __nocast);
59extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
60extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); 59extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
61extern void kmem_free(const void *); 60extern void kmem_free(const void *);
62 61
62static inline void *kmem_zalloc_large(size_t size)
63{
64 void *ptr;
65
66 ptr = vmalloc(size);
67 if (ptr)
68 memset(ptr, 0, size);
69 return ptr;
70}
71static inline void kmem_free_large(void *ptr)
72{
73 vfree(ptr);
74}
75
76extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
77
63/* 78/*
64 * Zone interfaces 79 * Zone interfaces
65 */ 80 */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 883ca5ab8af5..bf85bbe4a9ae 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -106,7 +106,7 @@ xfs_get_acl(struct inode *inode, int type)
106 struct posix_acl *acl; 106 struct posix_acl *acl;
107 struct xfs_acl *xfs_acl; 107 struct xfs_acl *xfs_acl;
108 int len = sizeof(struct xfs_acl); 108 int len = sizeof(struct xfs_acl);
109 char *ea_name; 109 unsigned char *ea_name;
110 int error; 110 int error;
111 111
112 acl = get_cached_acl(inode, type); 112 acl = get_cached_acl(inode, type);
@@ -133,7 +133,8 @@ xfs_get_acl(struct inode *inode, int type)
133 if (!xfs_acl) 133 if (!xfs_acl)
134 return ERR_PTR(-ENOMEM); 134 return ERR_PTR(-ENOMEM);
135 135
136 error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT); 136 error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
137 &len, ATTR_ROOT);
137 if (error) { 138 if (error) {
138 /* 139 /*
139 * If the attribute doesn't exist make sure we have a negative 140 * If the attribute doesn't exist make sure we have a negative
@@ -162,7 +163,7 @@ STATIC int
162xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) 163xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
163{ 164{
164 struct xfs_inode *ip = XFS_I(inode); 165 struct xfs_inode *ip = XFS_I(inode);
165 char *ea_name; 166 unsigned char *ea_name;
166 int error; 167 int error;
167 168
168 if (S_ISLNK(inode->i_mode)) 169 if (S_ISLNK(inode->i_mode))
@@ -194,7 +195,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
194 (sizeof(struct xfs_acl_entry) * 195 (sizeof(struct xfs_acl_entry) *
195 (XFS_ACL_MAX_ENTRIES - acl->a_count)); 196 (XFS_ACL_MAX_ENTRIES - acl->a_count));
196 197
197 error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl, 198 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
198 len, ATTR_ROOT); 199 len, ATTR_ROOT);
199 200
200 kfree(xfs_acl); 201 kfree(xfs_acl);
@@ -262,7 +263,7 @@ xfs_set_mode(struct inode *inode, mode_t mode)
262} 263}
263 264
264static int 265static int
265xfs_acl_exists(struct inode *inode, char *name) 266xfs_acl_exists(struct inode *inode, unsigned char *name)
266{ 267{
267 int len = sizeof(struct xfs_acl); 268 int len = sizeof(struct xfs_acl);
268 269
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..6f76ba85f193 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,6 +33,7 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
36 37
37#include "xfs_sb.h" 38#include "xfs_sb.h"
38#include "xfs_inum.h" 39#include "xfs_inum.h"
@@ -76,6 +77,27 @@ struct workqueue_struct *xfsconvertd_workqueue;
76#define xfs_buf_deallocate(bp) \ 77#define xfs_buf_deallocate(bp) \
77 kmem_zone_free(xfs_buf_zone, (bp)); 78 kmem_zone_free(xfs_buf_zone, (bp));
78 79
80static inline int
81xfs_buf_is_vmapped(
82 struct xfs_buf *bp)
83{
84 /*
85 * Return true if the buffer is vmapped.
86 *
87 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
88 * code is clever enough to know it doesn't have to map a single page,
89 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
90 */
91 return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
92}
93
94static inline int
95xfs_buf_vmap_len(
96 struct xfs_buf *bp)
97{
98 return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
99}
100
79/* 101/*
80 * Page Region interfaces. 102 * Page Region interfaces.
81 * 103 *
@@ -314,7 +336,7 @@ xfs_buf_free(
314 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 336 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
315 uint i; 337 uint i;
316 338
317 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 339 if (xfs_buf_is_vmapped(bp))
318 free_address(bp->b_addr - bp->b_offset); 340 free_address(bp->b_addr - bp->b_offset);
319 341
320 for (i = 0; i < bp->b_page_count; i++) { 342 for (i = 0; i < bp->b_page_count; i++) {
@@ -1051,22 +1073,30 @@ xfs_buf_ioerror(
1051} 1073}
1052 1074
1053int 1075int
1054xfs_bawrite( 1076xfs_bwrite(
1055 void *mp, 1077 struct xfs_mount *mp,
1056 struct xfs_buf *bp) 1078 struct xfs_buf *bp)
1057{ 1079{
1058 trace_xfs_buf_bawrite(bp, _RET_IP_); 1080 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
1081 int error = 0;
1059 1082
1060 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 1083 bp->b_strat = xfs_bdstrat_cb;
1084 bp->b_mount = mp;
1085 bp->b_flags |= XBF_WRITE;
1086 if (!iowait)
1087 bp->b_flags |= _XBF_RUN_QUEUES;
1061 1088
1062 xfs_buf_delwri_dequeue(bp); 1089 xfs_buf_delwri_dequeue(bp);
1090 xfs_buf_iostrategy(bp);
1063 1091
1064 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); 1092 if (iowait) {
1065 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); 1093 error = xfs_buf_iowait(bp);
1094 if (error)
1095 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1096 xfs_buf_relse(bp);
1097 }
1066 1098
1067 bp->b_mount = mp; 1099 return error;
1068 bp->b_strat = xfs_bdstrat_cb;
1069 return xfs_bdstrat_cb(bp);
1070} 1100}
1071 1101
1072void 1102void
@@ -1085,6 +1115,126 @@ xfs_bdwrite(
1085 xfs_buf_delwri_queue(bp, 1); 1115 xfs_buf_delwri_queue(bp, 1);
1086} 1116}
1087 1117
1118/*
1119 * Called when we want to stop a buffer from getting written or read.
1120 * We attach the EIO error, muck with its flags, and call biodone
1121 * so that the proper iodone callbacks get called.
1122 */
1123STATIC int
1124xfs_bioerror(
1125 xfs_buf_t *bp)
1126{
1127#ifdef XFSERRORDEBUG
1128 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1129#endif
1130
1131 /*
1132 * No need to wait until the buffer is unpinned, we aren't flushing it.
1133 */
1134 XFS_BUF_ERROR(bp, EIO);
1135
1136 /*
1137 * We're calling biodone, so delete XBF_DONE flag.
1138 */
1139 XFS_BUF_UNREAD(bp);
1140 XFS_BUF_UNDELAYWRITE(bp);
1141 XFS_BUF_UNDONE(bp);
1142 XFS_BUF_STALE(bp);
1143
1144 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1145 xfs_biodone(bp);
1146
1147 return EIO;
1148}
1149
1150/*
1151 * Same as xfs_bioerror, except that we are releasing the buffer
1152 * here ourselves, and avoiding the biodone call.
1153 * This is meant for userdata errors; metadata bufs come with
1154 * iodone functions attached, so that we can track down errors.
1155 */
1156STATIC int
1157xfs_bioerror_relse(
1158 struct xfs_buf *bp)
1159{
1160 int64_t fl = XFS_BUF_BFLAGS(bp);
1161 /*
1162 * No need to wait until the buffer is unpinned.
1163 * We aren't flushing it.
1164 *
1165 * chunkhold expects B_DONE to be set, whether
1166 * we actually finish the I/O or not. We don't want to
1167 * change that interface.
1168 */
1169 XFS_BUF_UNREAD(bp);
1170 XFS_BUF_UNDELAYWRITE(bp);
1171 XFS_BUF_DONE(bp);
1172 XFS_BUF_STALE(bp);
1173 XFS_BUF_CLR_IODONE_FUNC(bp);
1174 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1175 if (!(fl & XBF_ASYNC)) {
1176 /*
1177 * Mark b_error and B_ERROR _both_.
1178 * Lot's of chunkcache code assumes that.
1179 * There's no reason to mark error for
1180 * ASYNC buffers.
1181 */
1182 XFS_BUF_ERROR(bp, EIO);
1183 XFS_BUF_FINISH_IOWAIT(bp);
1184 } else {
1185 xfs_buf_relse(bp);
1186 }
1187
1188 return EIO;
1189}
1190
1191
1192/*
1193 * All xfs metadata buffers except log state machine buffers
1194 * get this attached as their b_bdstrat callback function.
1195 * This is so that we can catch a buffer
1196 * after prematurely unpinning it to forcibly shutdown the filesystem.
1197 */
1198int
1199xfs_bdstrat_cb(
1200 struct xfs_buf *bp)
1201{
1202 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
1203 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1204 /*
1205 * Metadata write that didn't get logged but
1206 * written delayed anyway. These aren't associated
1207 * with a transaction, and can be ignored.
1208 */
1209 if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1210 return xfs_bioerror_relse(bp);
1211 else
1212 return xfs_bioerror(bp);
1213 }
1214
1215 xfs_buf_iorequest(bp);
1216 return 0;
1217}
1218
1219/*
1220 * Wrapper around bdstrat so that we can stop data from going to disk in case
1221 * we are shutting down the filesystem. Typically user data goes thru this
1222 * path; one of the exceptions is the superblock.
1223 */
1224void
1225xfsbdstrat(
1226 struct xfs_mount *mp,
1227 struct xfs_buf *bp)
1228{
1229 if (XFS_FORCED_SHUTDOWN(mp)) {
1230 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1231 xfs_bioerror_relse(bp);
1232 return;
1233 }
1234
1235 xfs_buf_iorequest(bp);
1236}
1237
1088STATIC void 1238STATIC void
1089_xfs_buf_ioend( 1239_xfs_buf_ioend(
1090 xfs_buf_t *bp, 1240 xfs_buf_t *bp,
@@ -1107,6 +1257,9 @@ xfs_buf_bio_end_io(
1107 1257
1108 xfs_buf_ioerror(bp, -error); 1258 xfs_buf_ioerror(bp, -error);
1109 1259
1260 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1261 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1262
1110 do { 1263 do {
1111 struct page *page = bvec->bv_page; 1264 struct page *page = bvec->bv_page;
1112 1265
@@ -1216,6 +1369,10 @@ next_chunk:
1216 1369
1217submit_io: 1370submit_io:
1218 if (likely(bio->bi_size)) { 1371 if (likely(bio->bi_size)) {
1372 if (xfs_buf_is_vmapped(bp)) {
1373 flush_kernel_vmap_range(bp->b_addr,
1374 xfs_buf_vmap_len(bp));
1375 }
1219 submit_bio(rw, bio); 1376 submit_bio(rw, bio);
1220 if (size) 1377 if (size)
1221 goto next_chunk; 1378 goto next_chunk;
@@ -1296,7 +1453,7 @@ xfs_buf_iomove(
1296 xfs_buf_t *bp, /* buffer to process */ 1453 xfs_buf_t *bp, /* buffer to process */
1297 size_t boff, /* starting buffer offset */ 1454 size_t boff, /* starting buffer offset */
1298 size_t bsize, /* length to copy */ 1455 size_t bsize, /* length to copy */
1299 caddr_t data, /* data address */ 1456 void *data, /* data address */
1300 xfs_buf_rw_t mode) /* read/write/zero flag */ 1457 xfs_buf_rw_t mode) /* read/write/zero flag */
1301{ 1458{
1302 size_t bend, cpoff, csize; 1459 size_t bend, cpoff, csize;
@@ -1378,8 +1535,8 @@ xfs_alloc_bufhash(
1378 1535
1379 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1536 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1380 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1537 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1381 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1538 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1382 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1539 sizeof(xfs_bufhash_t));
1383 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1540 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1384 spin_lock_init(&btp->bt_hash[i].bh_lock); 1541 spin_lock_init(&btp->bt_hash[i].bh_lock);
1385 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1542 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1390,7 +1547,7 @@ STATIC void
1390xfs_free_bufhash( 1547xfs_free_bufhash(
1391 xfs_buftarg_t *btp) 1548 xfs_buftarg_t *btp)
1392{ 1549{
1393 kmem_free(btp->bt_hash); 1550 kmem_free_large(btp->bt_hash);
1394 btp->bt_hash = NULL; 1551 btp->bt_hash = NULL;
1395} 1552}
1396 1553
@@ -1595,6 +1752,11 @@ xfs_buf_delwri_queue(
1595 list_del(&bp->b_list); 1752 list_del(&bp->b_list);
1596 } 1753 }
1597 1754
1755 if (list_empty(dwq)) {
1756 /* start xfsbufd as it is about to have something to do */
1757 wake_up_process(bp->b_target->bt_task);
1758 }
1759
1598 bp->b_flags |= _XBF_DELWRI_Q; 1760 bp->b_flags |= _XBF_DELWRI_Q;
1599 list_add_tail(&bp->b_list, dwq); 1761 list_add_tail(&bp->b_list, dwq);
1600 bp->b_queuetime = jiffies; 1762 bp->b_queuetime = jiffies;
@@ -1626,6 +1788,35 @@ xfs_buf_delwri_dequeue(
1626 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); 1788 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1627} 1789}
1628 1790
1791/*
1792 * If a delwri buffer needs to be pushed before it has aged out, then promote
1793 * it to the head of the delwri queue so that it will be flushed on the next
1794 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1795 * than the age currently needed to flush the buffer. Hence the next time the
1796 * xfsbufd sees it is guaranteed to be considered old enough to flush.
1797 */
1798void
1799xfs_buf_delwri_promote(
1800 struct xfs_buf *bp)
1801{
1802 struct xfs_buftarg *btp = bp->b_target;
1803 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1804
1805 ASSERT(bp->b_flags & XBF_DELWRI);
1806 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1807
1808 /*
1809 * Check the buffer age before locking the delayed write queue as we
1810 * don't need to promote buffers that are already past the flush age.
1811 */
1812 if (bp->b_queuetime < jiffies - age)
1813 return;
1814 bp->b_queuetime = jiffies - age;
1815 spin_lock(&btp->bt_delwrite_lock);
1816 list_move(&bp->b_list, &btp->bt_delwrite_queue);
1817 spin_unlock(&btp->bt_delwrite_lock);
1818}
1819
1629STATIC void 1820STATIC void
1630xfs_buf_runall_queues( 1821xfs_buf_runall_queues(
1631 struct workqueue_struct *queue) 1822 struct workqueue_struct *queue)
@@ -1644,6 +1835,8 @@ xfsbufd_wakeup(
1644 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1835 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1645 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1836 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1646 continue; 1837 continue;
1838 if (list_empty(&btp->bt_delwrite_queue))
1839 continue;
1647 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1840 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1648 wake_up_process(btp->bt_task); 1841 wake_up_process(btp->bt_task);
1649 } 1842 }
@@ -1694,20 +1887,53 @@ xfs_buf_delwri_split(
1694 1887
1695} 1888}
1696 1889
1890/*
1891 * Compare function is more complex than it needs to be because
1892 * the return value is only 32 bits and we are doing comparisons
1893 * on 64 bit values
1894 */
1895static int
1896xfs_buf_cmp(
1897 void *priv,
1898 struct list_head *a,
1899 struct list_head *b)
1900{
1901 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
1902 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1903 xfs_daddr_t diff;
1904
1905 diff = ap->b_bn - bp->b_bn;
1906 if (diff < 0)
1907 return -1;
1908 if (diff > 0)
1909 return 1;
1910 return 0;
1911}
1912
1913void
1914xfs_buf_delwri_sort(
1915 xfs_buftarg_t *target,
1916 struct list_head *list)
1917{
1918 list_sort(NULL, list, xfs_buf_cmp);
1919}
1920
1697STATIC int 1921STATIC int
1698xfsbufd( 1922xfsbufd(
1699 void *data) 1923 void *data)
1700{ 1924{
1701 struct list_head tmp; 1925 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1702 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1703 int count;
1704 xfs_buf_t *bp;
1705 1926
1706 current->flags |= PF_MEMALLOC; 1927 current->flags |= PF_MEMALLOC;
1707 1928
1708 set_freezable(); 1929 set_freezable();
1709 1930
1710 do { 1931 do {
1932 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1933 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1934 int count = 0;
1935 struct list_head tmp;
1936
1711 if (unlikely(freezing(current))) { 1937 if (unlikely(freezing(current))) {
1712 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1938 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1713 refrigerator(); 1939 refrigerator();
@@ -1715,17 +1941,16 @@ xfsbufd(
1715 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1941 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1716 } 1942 }
1717 1943
1718 schedule_timeout_interruptible( 1944 /* sleep for a long time if there is nothing to do. */
1719 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1945 if (list_empty(&target->bt_delwrite_queue))
1946 tout = MAX_SCHEDULE_TIMEOUT;
1947 schedule_timeout_interruptible(tout);
1720 1948
1721 xfs_buf_delwri_split(target, &tmp, 1949 xfs_buf_delwri_split(target, &tmp, age);
1722 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1950 list_sort(NULL, &tmp, xfs_buf_cmp);
1723
1724 count = 0;
1725 while (!list_empty(&tmp)) { 1951 while (!list_empty(&tmp)) {
1726 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1952 struct xfs_buf *bp;
1727 ASSERT(target == bp->b_target); 1953 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1728
1729 list_del_init(&bp->b_list); 1954 list_del_init(&bp->b_list);
1730 xfs_buf_iostrategy(bp); 1955 xfs_buf_iostrategy(bp);
1731 count++; 1956 count++;
@@ -1751,42 +1976,45 @@ xfs_flush_buftarg(
1751 xfs_buftarg_t *target, 1976 xfs_buftarg_t *target,
1752 int wait) 1977 int wait)
1753{ 1978{
1754 struct list_head tmp; 1979 xfs_buf_t *bp;
1755 xfs_buf_t *bp, *n;
1756 int pincount = 0; 1980 int pincount = 0;
1981 LIST_HEAD(tmp_list);
1982 LIST_HEAD(wait_list);
1757 1983
1758 xfs_buf_runall_queues(xfsconvertd_workqueue); 1984 xfs_buf_runall_queues(xfsconvertd_workqueue);
1759 xfs_buf_runall_queues(xfsdatad_workqueue); 1985 xfs_buf_runall_queues(xfsdatad_workqueue);
1760 xfs_buf_runall_queues(xfslogd_workqueue); 1986 xfs_buf_runall_queues(xfslogd_workqueue);
1761 1987
1762 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1988 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1763 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1989 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1764 1990
1765 /* 1991 /*
1766 * Dropped the delayed write list lock, now walk the temporary list 1992 * Dropped the delayed write list lock, now walk the temporary list.
1993 * All I/O is issued async and then if we need to wait for completion
1994 * we do that after issuing all the IO.
1767 */ 1995 */
1768 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1996 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1997 while (!list_empty(&tmp_list)) {
1998 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1769 ASSERT(target == bp->b_target); 1999 ASSERT(target == bp->b_target);
1770 if (wait) 2000 list_del_init(&bp->b_list);
2001 if (wait) {
1771 bp->b_flags &= ~XBF_ASYNC; 2002 bp->b_flags &= ~XBF_ASYNC;
1772 else 2003 list_add(&bp->b_list, &wait_list);
1773 list_del_init(&bp->b_list); 2004 }
1774
1775 xfs_buf_iostrategy(bp); 2005 xfs_buf_iostrategy(bp);
1776 } 2006 }
1777 2007
1778 if (wait) 2008 if (wait) {
2009 /* Expedite and wait for IO to complete. */
1779 blk_run_address_space(target->bt_mapping); 2010 blk_run_address_space(target->bt_mapping);
2011 while (!list_empty(&wait_list)) {
2012 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1780 2013
1781 /* 2014 list_del_init(&bp->b_list);
1782 * Remaining list items must be flushed before returning 2015 xfs_iowait(bp);
1783 */ 2016 xfs_buf_relse(bp);
1784 while (!list_empty(&tmp)) { 2017 }
1785 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1786
1787 list_del_init(&bp->b_list);
1788 xfs_iowait(bp);
1789 xfs_buf_relse(bp);
1790 } 2018 }
1791 2019
1792 return pincount; 2020 return pincount;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a34c7b54822d..386e7361e50e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -232,13 +232,17 @@ extern void xfs_buf_lock(xfs_buf_t *);
232extern void xfs_buf_unlock(xfs_buf_t *); 232extern void xfs_buf_unlock(xfs_buf_t *);
233 233
234/* Buffer Read and Write Routines */ 234/* Buffer Read and Write Routines */
235extern int xfs_bawrite(void *mp, xfs_buf_t *bp); 235extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); 236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
237
238extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
239extern int xfs_bdstrat_cb(struct xfs_buf *);
240
237extern void xfs_buf_ioend(xfs_buf_t *, int); 241extern void xfs_buf_ioend(xfs_buf_t *, int);
238extern void xfs_buf_ioerror(xfs_buf_t *, int); 242extern void xfs_buf_ioerror(xfs_buf_t *, int);
239extern int xfs_buf_iorequest(xfs_buf_t *); 243extern int xfs_buf_iorequest(xfs_buf_t *);
240extern int xfs_buf_iowait(xfs_buf_t *); 244extern int xfs_buf_iowait(xfs_buf_t *);
241extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 245extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
242 xfs_buf_rw_t); 246 xfs_buf_rw_t);
243 247
244static inline int xfs_buf_iostrategy(xfs_buf_t *bp) 248static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
@@ -261,6 +265,7 @@ extern int xfs_buf_ispin(xfs_buf_t *);
261 265
262/* Delayed Write Buffer Routines */ 266/* Delayed Write Buffer Routines */
263extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 267extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
268extern void xfs_buf_delwri_promote(xfs_buf_t *);
264 269
265/* Buffer Daemon Setup Routines */ 270/* Buffer Daemon Setup Routines */
266extern int xfs_buf_init(void); 271extern int xfs_buf_init(void);
@@ -270,33 +275,19 @@ extern void xfs_buf_terminate(void);
270 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 275 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
271 276
272 277
273#define XFS_B_ASYNC XBF_ASYNC
274#define XFS_B_DELWRI XBF_DELWRI
275#define XFS_B_READ XBF_READ
276#define XFS_B_WRITE XBF_WRITE
277#define XFS_B_STALE XBF_STALE
278
279#define XFS_BUF_TRYLOCK XBF_TRYLOCK
280#define XFS_INCORE_TRYLOCK XBF_TRYLOCK
281#define XFS_BUF_LOCK XBF_LOCK
282#define XFS_BUF_MAPPED XBF_MAPPED
283
284#define BUF_BUSY XBF_DONT_BLOCK
285
286#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) 278#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
287#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 279#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
288 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 280 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
289 281
290#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) 282#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE)
291#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) 283#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
292#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) 284#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
293#define XFS_BUF_SUPER_STALE(bp) do { \ 285#define XFS_BUF_SUPER_STALE(bp) do { \
294 XFS_BUF_STALE(bp); \ 286 XFS_BUF_STALE(bp); \
295 xfs_buf_delwri_dequeue(bp); \ 287 xfs_buf_delwri_dequeue(bp); \
296 XFS_BUF_DONE(bp); \ 288 XFS_BUF_DONE(bp); \
297 } while (0) 289 } while (0)
298 290
299#define XFS_BUF_MANAGE XBF_FS_MANAGED
300#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) 291#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
301 292
302#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 293#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
@@ -385,31 +376,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
385 376
386#define xfs_biomove(bp, off, len, data, rw) \ 377#define xfs_biomove(bp, off, len, data, rw) \
387 xfs_buf_iomove((bp), (off), (len), (data), \ 378 xfs_buf_iomove((bp), (off), (len), (data), \
388 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) 379 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
389 380
390#define xfs_biozero(bp, off, len) \ 381#define xfs_biozero(bp, off, len) \
391 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 382 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
392 383
393
394static inline int XFS_bwrite(xfs_buf_t *bp)
395{
396 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
397 int error = 0;
398
399 if (!iowait)
400 bp->b_flags |= _XBF_RUN_QUEUES;
401
402 xfs_buf_delwri_dequeue(bp);
403 xfs_buf_iostrategy(bp);
404 if (iowait) {
405 error = xfs_buf_iowait(bp);
406 xfs_buf_relse(bp);
407 }
408 return error;
409}
410
411#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
412
413#define xfs_iowait(bp) xfs_buf_iowait(bp) 384#define xfs_iowait(bp) xfs_buf_iowait(bp)
414 385
415#define xfs_baread(target, rablkno, ralen) \ 386#define xfs_baread(target, rablkno, ralen) \
@@ -424,6 +395,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
424extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
425extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
426extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 397extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
398
427#ifdef CONFIG_KDB_MODULES 399#ifdef CONFIG_KDB_MODULES
428extern struct list_head *xfs_get_buftarg_list(void); 400extern struct list_head *xfs_get_buftarg_list(void);
429#endif 401#endif
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 7501b85fd860..b6918d76bc7b 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -79,7 +79,7 @@ xfs_flush_pages(
79 xfs_iflags_clear(ip, XFS_ITRUNCATED); 79 xfs_iflags_clear(ip, XFS_ITRUNCATED);
80 ret = -filemap_fdatawrite(mapping); 80 ret = -filemap_fdatawrite(mapping);
81 } 81 }
82 if (flags & XFS_B_ASYNC) 82 if (flags & XBF_ASYNC)
83 return ret; 83 return ret;
84 ret2 = xfs_wait_on_pages(ip, first, last); 84 ret2 = xfs_wait_on_pages(ip, first, last);
85 if (!ret) 85 if (!ret)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a034cf624437..4ea1ee18aded 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -447,12 +447,12 @@ xfs_attrlist_by_handle(
447int 447int
448xfs_attrmulti_attr_get( 448xfs_attrmulti_attr_get(
449 struct inode *inode, 449 struct inode *inode,
450 char *name, 450 unsigned char *name,
451 char __user *ubuf, 451 unsigned char __user *ubuf,
452 __uint32_t *len, 452 __uint32_t *len,
453 __uint32_t flags) 453 __uint32_t flags)
454{ 454{
455 char *kbuf; 455 unsigned char *kbuf;
456 int error = EFAULT; 456 int error = EFAULT;
457 457
458 if (*len > XATTR_SIZE_MAX) 458 if (*len > XATTR_SIZE_MAX)
@@ -476,12 +476,12 @@ xfs_attrmulti_attr_get(
476int 476int
477xfs_attrmulti_attr_set( 477xfs_attrmulti_attr_set(
478 struct inode *inode, 478 struct inode *inode,
479 char *name, 479 unsigned char *name,
480 const char __user *ubuf, 480 const unsigned char __user *ubuf,
481 __uint32_t len, 481 __uint32_t len,
482 __uint32_t flags) 482 __uint32_t flags)
483{ 483{
484 char *kbuf; 484 unsigned char *kbuf;
485 int error = EFAULT; 485 int error = EFAULT;
486 486
487 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 487 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -501,7 +501,7 @@ xfs_attrmulti_attr_set(
501int 501int
502xfs_attrmulti_attr_remove( 502xfs_attrmulti_attr_remove(
503 struct inode *inode, 503 struct inode *inode,
504 char *name, 504 unsigned char *name,
505 __uint32_t flags) 505 __uint32_t flags)
506{ 506{
507 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 507 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -519,7 +519,7 @@ xfs_attrmulti_by_handle(
519 xfs_fsop_attrmulti_handlereq_t am_hreq; 519 xfs_fsop_attrmulti_handlereq_t am_hreq;
520 struct dentry *dentry; 520 struct dentry *dentry;
521 unsigned int i, size; 521 unsigned int i, size;
522 char *attr_name; 522 unsigned char *attr_name;
523 523
524 if (!capable(CAP_SYS_ADMIN)) 524 if (!capable(CAP_SYS_ADMIN))
525 return -XFS_ERROR(EPERM); 525 return -XFS_ERROR(EPERM);
@@ -547,7 +547,7 @@ xfs_attrmulti_by_handle(
547 547
548 error = 0; 548 error = 0;
549 for (i = 0; i < am_hreq.opcount; i++) { 549 for (i = 0; i < am_hreq.opcount; i++) {
550 ops[i].am_error = strncpy_from_user(attr_name, 550 ops[i].am_error = strncpy_from_user((char *)attr_name,
551 ops[i].am_attrname, MAXNAMELEN); 551 ops[i].am_attrname, MAXNAMELEN);
552 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 552 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
553 error = -ERANGE; 553 error = -ERANGE;
@@ -1431,6 +1431,9 @@ xfs_file_ioctl(
1431 if (!capable(CAP_SYS_ADMIN)) 1431 if (!capable(CAP_SYS_ADMIN))
1432 return -EPERM; 1432 return -EPERM;
1433 1433
1434 if (mp->m_flags & XFS_MOUNT_RDONLY)
1435 return -XFS_ERROR(EROFS);
1436
1434 if (copy_from_user(&inout, arg, sizeof(inout))) 1437 if (copy_from_user(&inout, arg, sizeof(inout)))
1435 return -XFS_ERROR(EFAULT); 1438 return -XFS_ERROR(EFAULT);
1436 1439
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 7bd7c6afc1eb..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -45,23 +45,23 @@ xfs_readlink_by_handle(
45extern int 45extern int
46xfs_attrmulti_attr_get( 46xfs_attrmulti_attr_get(
47 struct inode *inode, 47 struct inode *inode,
48 char *name, 48 unsigned char *name,
49 char __user *ubuf, 49 unsigned char __user *ubuf,
50 __uint32_t *len, 50 __uint32_t *len,
51 __uint32_t flags); 51 __uint32_t flags);
52 52
53extern int 53extern int
54 xfs_attrmulti_attr_set( 54xfs_attrmulti_attr_set(
55 struct inode *inode, 55 struct inode *inode,
56 char *name, 56 unsigned char *name,
57 const char __user *ubuf, 57 const unsigned char __user *ubuf,
58 __uint32_t len, 58 __uint32_t len,
59 __uint32_t flags); 59 __uint32_t flags);
60 60
61extern int 61extern int
62xfs_attrmulti_attr_remove( 62xfs_attrmulti_attr_remove(
63 struct inode *inode, 63 struct inode *inode,
64 char *name, 64 unsigned char *name,
65 __uint32_t flags); 65 __uint32_t flags);
66 66
67extern struct dentry * 67extern struct dentry *
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index be1527b1670c..0bf6d61f0528 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -411,7 +411,7 @@ xfs_compat_attrmulti_by_handle(
411 compat_xfs_fsop_attrmulti_handlereq_t am_hreq; 411 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
412 struct dentry *dentry; 412 struct dentry *dentry;
413 unsigned int i, size; 413 unsigned int i, size;
414 char *attr_name; 414 unsigned char *attr_name;
415 415
416 if (!capable(CAP_SYS_ADMIN)) 416 if (!capable(CAP_SYS_ADMIN))
417 return -XFS_ERROR(EPERM); 417 return -XFS_ERROR(EPERM);
@@ -440,7 +440,7 @@ xfs_compat_attrmulti_by_handle(
440 440
441 error = 0; 441 error = 0;
442 for (i = 0; i < am_hreq.opcount; i++) { 442 for (i = 0; i < am_hreq.opcount; i++) {
443 ops[i].am_error = strncpy_from_user(attr_name, 443 ops[i].am_error = strncpy_from_user((char *)attr_name,
444 compat_ptr(ops[i].am_attrname), 444 compat_ptr(ops[i].am_attrname),
445 MAXNAMELEN); 445 MAXNAMELEN);
446 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 446 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 225946012d0b..e8566bbf0f00 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -140,10 +140,10 @@ xfs_init_security(
140 struct xfs_inode *ip = XFS_I(inode); 140 struct xfs_inode *ip = XFS_I(inode);
141 size_t length; 141 size_t length;
142 void *value; 142 void *value;
143 char *name; 143 unsigned char *name;
144 int error; 144 int error;
145 145
146 error = security_inode_init_security(inode, dir, &name, 146 error = security_inode_init_security(inode, dir, (char **)&name,
147 &value, &length); 147 &value, &length);
148 if (error) { 148 if (error) {
149 if (error == -EOPNOTSUPP) 149 if (error == -EOPNOTSUPP)
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 0d32457abef1..eac6f80d786d 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -630,18 +630,9 @@ start:
630 * by root. This keeps people from modifying setuid and 630 * by root. This keeps people from modifying setuid and
631 * setgid binaries. 631 * setgid binaries.
632 */ 632 */
633 633 error = -file_remove_suid(file);
634 if (((xip->i_d.di_mode & S_ISUID) || 634 if (unlikely(error))
635 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == 635 goto out_unlock_internal;
636 (S_ISGID | S_IXGRP))) &&
637 !capable(CAP_FSETID)) {
638 error = xfs_write_clear_setuid(xip);
639 if (likely(!error))
640 error = -file_remove_suid(file);
641 if (unlikely(error)) {
642 goto out_unlock_internal;
643 }
644 }
645 636
646 /* We can write back this queue in page reclaim */ 637 /* We can write back this queue in page reclaim */
647 current->backing_dev_info = mapping->backing_dev_info; 638 current->backing_dev_info = mapping->backing_dev_info;
@@ -784,53 +775,6 @@ write_retry:
784} 775}
785 776
786/* 777/*
787 * All xfs metadata buffers except log state machine buffers
788 * get this attached as their b_bdstrat callback function.
789 * This is so that we can catch a buffer
790 * after prematurely unpinning it to forcibly shutdown the filesystem.
791 */
792int
793xfs_bdstrat_cb(struct xfs_buf *bp)
794{
795 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
796 trace_xfs_bdstrat_shut(bp, _RET_IP_);
797 /*
798 * Metadata write that didn't get logged but
799 * written delayed anyway. These aren't associated
800 * with a transaction, and can be ignored.
801 */
802 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
803 (XFS_BUF_ISREAD(bp)) == 0)
804 return (xfs_bioerror_relse(bp));
805 else
806 return (xfs_bioerror(bp));
807 }
808
809 xfs_buf_iorequest(bp);
810 return 0;
811}
812
813/*
814 * Wrapper around bdstrat so that we can stop data from going to disk in case
815 * we are shutting down the filesystem. Typically user data goes thru this
816 * path; one of the exceptions is the superblock.
817 */
818void
819xfsbdstrat(
820 struct xfs_mount *mp,
821 struct xfs_buf *bp)
822{
823 ASSERT(mp);
824 if (!XFS_FORCED_SHUTDOWN(mp)) {
825 xfs_buf_iorequest(bp);
826 return;
827 }
828
829 trace_xfs_bdstrat_shut(bp, _RET_IP_);
830 xfs_bioerror_relse(bp);
831}
832
833/*
834 * If the underlying (data/log/rt) device is readonly, there are some 778 * If the underlying (data/log/rt) device is readonly, there are some
835 * operations that cannot proceed. 779 * operations that cannot proceed.
836 */ 780 */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index d1f7789c7ffb..342ae8c0d011 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -22,9 +22,6 @@ struct xfs_mount;
22struct xfs_inode; 22struct xfs_inode;
23struct xfs_buf; 23struct xfs_buf;
24 24
25/* errors from xfsbdstrat() must be extracted from the buffer */
26extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
27extern int xfs_bdstrat_cb(struct xfs_buf *);
28extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 25extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
29 26
30extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); 27extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 77414db10dc2..71345a370d9f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -877,12 +877,11 @@ xfsaild(
877{ 877{
878 struct xfs_ail *ailp = data; 878 struct xfs_ail *ailp = data;
879 xfs_lsn_t last_pushed_lsn = 0; 879 xfs_lsn_t last_pushed_lsn = 0;
880 long tout = 0; 880 long tout = 0; /* milliseconds */
881 881
882 while (!kthread_should_stop()) { 882 while (!kthread_should_stop()) {
883 if (tout) 883 schedule_timeout_interruptible(tout ?
884 schedule_timeout_interruptible(msecs_to_jiffies(tout)); 884 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
885 tout = 1000;
886 885
887 /* swsusp */ 886 /* swsusp */
888 try_to_freeze(); 887 try_to_freeze();
@@ -1022,59 +1021,108 @@ xfs_fs_dirty_inode(
1022 XFS_I(inode)->i_update_core = 1; 1021 XFS_I(inode)->i_update_core = 1;
1023} 1022}
1024 1023
1025/* 1024STATIC int
1026 * Attempt to flush the inode, this will actually fail 1025xfs_log_inode(
1027 * if the inode is pinned, but we dirty the inode again 1026 struct xfs_inode *ip)
1028 * at the point when it is unpinned after a log write, 1027{
1029 * since this is when the inode itself becomes flushable. 1028 struct xfs_mount *mp = ip->i_mount;
1030 */ 1029 struct xfs_trans *tp;
1030 int error;
1031
1032 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1033 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1034 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
1035
1036 if (error) {
1037 xfs_trans_cancel(tp, 0);
1038 /* we need to return with the lock hold shared */
1039 xfs_ilock(ip, XFS_ILOCK_SHARED);
1040 return error;
1041 }
1042
1043 xfs_ilock(ip, XFS_ILOCK_EXCL);
1044
1045 /*
1046 * Note - it's possible that we might have pushed ourselves out of the
1047 * way during trans_reserve which would flush the inode. But there's
1048 * no guarantee that the inode buffer has actually gone out yet (it's
1049 * delwri). Plus the buffer could be pinned anyway if it's part of
1050 * an inode in another recent transaction. So we play it safe and
1051 * fire off the transaction anyway.
1052 */
1053 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1054 xfs_trans_ihold(tp, ip);
1055 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1056 xfs_trans_set_sync(tp);
1057 error = xfs_trans_commit(tp, 0);
1058 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1059
1060 return error;
1061}
1062
1031STATIC int 1063STATIC int
1032xfs_fs_write_inode( 1064xfs_fs_write_inode(
1033 struct inode *inode, 1065 struct inode *inode,
1034 int sync) 1066 struct writeback_control *wbc)
1035{ 1067{
1036 struct xfs_inode *ip = XFS_I(inode); 1068 struct xfs_inode *ip = XFS_I(inode);
1037 struct xfs_mount *mp = ip->i_mount; 1069 struct xfs_mount *mp = ip->i_mount;
1038 int error = 0; 1070 int error = EAGAIN;
1039 1071
1040 xfs_itrace_entry(ip); 1072 xfs_itrace_entry(ip);
1041 1073
1042 if (XFS_FORCED_SHUTDOWN(mp)) 1074 if (XFS_FORCED_SHUTDOWN(mp))
1043 return XFS_ERROR(EIO); 1075 return XFS_ERROR(EIO);
1044 1076
1045 if (sync) { 1077 if (wbc->sync_mode == WB_SYNC_ALL) {
1046 error = xfs_wait_on_pages(ip, 0, -1); 1078 /*
1047 if (error) 1079 * Make sure the inode has hit stable storage. By using the
1080 * log and the fsync transactions we reduce the IOs we have
1081 * to do here from two (log and inode) to just the log.
1082 *
1083 * Note: We still need to do a delwri write of the inode after
1084 * this to flush it to the backing buffer so that bulkstat
1085 * works properly if this is the first time the inode has been
1086 * written. Because we hold the ilock atomically over the
1087 * transaction commit and the inode flush we are guaranteed
1088 * that the inode is not pinned when it returns. If the flush
1089 * lock is already held, then the inode has already been
1090 * flushed once and we don't need to flush it again. Hence
1091 * the code will only flush the inode if it isn't already
1092 * being flushed.
1093 */
1094 xfs_ilock(ip, XFS_ILOCK_SHARED);
1095 if (ip->i_update_core) {
1096 error = xfs_log_inode(ip);
1097 if (error)
1098 goto out_unlock;
1099 }
1100 } else {
1101 /*
1102 * We make this non-blocking if the inode is contended, return
1103 * EAGAIN to indicate to the caller that they did not succeed.
1104 * This prevents the flush path from blocking on inodes inside
1105 * another operation right now, they get caught later by xfs_sync.
1106 */
1107 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1048 goto out; 1108 goto out;
1049 } 1109 }
1050 1110
1051 /* 1111 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1052 * Bypass inodes which have already been cleaned by 1112 goto out_unlock;
1053 * the inode flush clustering code inside xfs_iflush
1054 */
1055 if (xfs_inode_clean(ip))
1056 goto out;
1057 1113
1058 /* 1114 /*
1059 * We make this non-blocking if the inode is contended, return 1115 * Now we have the flush lock and the inode is not pinned, we can check
1060 * EAGAIN to indicate to the caller that they did not succeed. 1116 * if the inode is really clean as we know that there are no pending
1061 * This prevents the flush path from blocking on inodes inside 1117 * transaction completions, it is not waiting on the delayed write
1062 * another operation right now, they get caught later by xfs_sync. 1118 * queue and there is no IO in progress.
1063 */ 1119 */
1064 if (sync) { 1120 if (xfs_inode_clean(ip)) {
1065 xfs_ilock(ip, XFS_ILOCK_SHARED); 1121 xfs_ifunlock(ip);
1066 xfs_iflock(ip); 1122 error = 0;
1067 1123 goto out_unlock;
1068 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
1069 } else {
1070 error = EAGAIN;
1071 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1072 goto out;
1073 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1074 goto out_unlock;
1075
1076 error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
1077 } 1124 }
1125 error = xfs_iflush(ip, 0);
1078 1126
1079 out_unlock: 1127 out_unlock:
1080 xfs_iunlock(ip, XFS_ILOCK_SHARED); 1128 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1257,6 +1305,29 @@ xfs_fs_statfs(
1257 return 0; 1305 return 0;
1258} 1306}
1259 1307
1308STATIC void
1309xfs_save_resvblks(struct xfs_mount *mp)
1310{
1311 __uint64_t resblks = 0;
1312
1313 mp->m_resblks_save = mp->m_resblks;
1314 xfs_reserve_blocks(mp, &resblks, NULL);
1315}
1316
1317STATIC void
1318xfs_restore_resvblks(struct xfs_mount *mp)
1319{
1320 __uint64_t resblks;
1321
1322 if (mp->m_resblks_save) {
1323 resblks = mp->m_resblks_save;
1324 mp->m_resblks_save = 0;
1325 } else
1326 resblks = xfs_default_resblks(mp);
1327
1328 xfs_reserve_blocks(mp, &resblks, NULL);
1329}
1330
1260STATIC int 1331STATIC int
1261xfs_fs_remount( 1332xfs_fs_remount(
1262 struct super_block *sb, 1333 struct super_block *sb,
@@ -1336,11 +1407,27 @@ xfs_fs_remount(
1336 } 1407 }
1337 mp->m_update_flags = 0; 1408 mp->m_update_flags = 0;
1338 } 1409 }
1410
1411 /*
1412 * Fill out the reserve pool if it is empty. Use the stashed
1413 * value if it is non-zero, otherwise go with the default.
1414 */
1415 xfs_restore_resvblks(mp);
1339 } 1416 }
1340 1417
1341 /* rw -> ro */ 1418 /* rw -> ro */
1342 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1419 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1420 /*
1421 * After we have synced the data but before we sync the
1422 * metadata, we need to free up the reserve block pool so that
1423 * the used block count in the superblock on disk is correct at
1424 * the end of the remount. Stash the current reserve pool size
1425 * so that if we get remounted rw, we can return it to the same
1426 * size.
1427 */
1428
1343 xfs_quiesce_data(mp); 1429 xfs_quiesce_data(mp);
1430 xfs_save_resvblks(mp);
1344 xfs_quiesce_attr(mp); 1431 xfs_quiesce_attr(mp);
1345 mp->m_flags |= XFS_MOUNT_RDONLY; 1432 mp->m_flags |= XFS_MOUNT_RDONLY;
1346 } 1433 }
@@ -1359,11 +1446,22 @@ xfs_fs_freeze(
1359{ 1446{
1360 struct xfs_mount *mp = XFS_M(sb); 1447 struct xfs_mount *mp = XFS_M(sb);
1361 1448
1449 xfs_save_resvblks(mp);
1362 xfs_quiesce_attr(mp); 1450 xfs_quiesce_attr(mp);
1363 return -xfs_fs_log_dummy(mp); 1451 return -xfs_fs_log_dummy(mp);
1364} 1452}
1365 1453
1366STATIC int 1454STATIC int
1455xfs_fs_unfreeze(
1456 struct super_block *sb)
1457{
1458 struct xfs_mount *mp = XFS_M(sb);
1459
1460 xfs_restore_resvblks(mp);
1461 return 0;
1462}
1463
1464STATIC int
1367xfs_fs_show_options( 1465xfs_fs_show_options(
1368 struct seq_file *m, 1466 struct seq_file *m,
1369 struct vfsmount *mnt) 1467 struct vfsmount *mnt)
@@ -1585,6 +1683,7 @@ static const struct super_operations xfs_super_operations = {
1585 .put_super = xfs_fs_put_super, 1683 .put_super = xfs_fs_put_super,
1586 .sync_fs = xfs_fs_sync_fs, 1684 .sync_fs = xfs_fs_sync_fs,
1587 .freeze_fs = xfs_fs_freeze, 1685 .freeze_fs = xfs_fs_freeze,
1686 .unfreeze_fs = xfs_fs_unfreeze,
1588 .statfs = xfs_fs_statfs, 1687 .statfs = xfs_fs_statfs,
1589 .remount_fs = xfs_fs_remount, 1688 .remount_fs = xfs_fs_remount,
1590 .show_options = xfs_fs_show_options, 1689 .show_options = xfs_fs_show_options,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 1f5e4bb5e970..a9f6d20aff41 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -90,14 +90,13 @@ xfs_inode_ag_lookup(
90STATIC int 90STATIC int
91xfs_inode_ag_walk( 91xfs_inode_ag_walk(
92 struct xfs_mount *mp, 92 struct xfs_mount *mp,
93 xfs_agnumber_t ag, 93 struct xfs_perag *pag,
94 int (*execute)(struct xfs_inode *ip, 94 int (*execute)(struct xfs_inode *ip,
95 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
96 int flags, 96 int flags,
97 int tag, 97 int tag,
98 int exclusive) 98 int exclusive)
99{ 99{
100 struct xfs_perag *pag = &mp->m_perag[ag];
101 uint32_t first_index; 100 uint32_t first_index;
102 int last_error = 0; 101 int last_error = 0;
103 int skipped; 102 int skipped;
@@ -141,8 +140,6 @@ restart:
141 delay(1); 140 delay(1);
142 goto restart; 141 goto restart;
143 } 142 }
144
145 xfs_put_perag(mp, pag);
146 return last_error; 143 return last_error;
147} 144}
148 145
@@ -160,10 +157,16 @@ xfs_inode_ag_iterator(
160 xfs_agnumber_t ag; 157 xfs_agnumber_t ag;
161 158
162 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
163 if (!mp->m_perag[ag].pag_ici_init) 160 struct xfs_perag *pag;
161
162 pag = xfs_perag_get(mp, ag);
163 if (!pag->pag_ici_init) {
164 xfs_perag_put(pag);
164 continue; 165 continue;
165 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, 166 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
166 exclusive); 168 exclusive);
169 xfs_perag_put(pag);
167 if (error) { 170 if (error) {
168 last_error = error; 171 last_error = error;
169 if (error == EFSCORRUPTED) 172 if (error == EFSCORRUPTED)
@@ -231,7 +234,7 @@ xfs_sync_inode_data(
231 } 234 }
232 235
233 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 236 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
234 0 : XFS_B_ASYNC, FI_NONE); 237 0 : XBF_ASYNC, FI_NONE);
235 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 238 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
236 239
237 out_wait: 240 out_wait:
@@ -267,8 +270,7 @@ xfs_sync_inode_attr(
267 goto out_unlock; 270 goto out_unlock;
268 } 271 }
269 272
270 error = xfs_iflush(ip, (flags & SYNC_WAIT) ? 273 error = xfs_iflush(ip, flags);
271 XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
272 274
273 out_unlock: 275 out_unlock:
274 xfs_iunlock(ip, XFS_ILOCK_SHARED); 276 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -293,10 +295,7 @@ xfs_sync_data(
293 if (error) 295 if (error)
294 return XFS_ERROR(error); 296 return XFS_ERROR(error);
295 297
296 xfs_log_force(mp, 0, 298 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
297 (flags & SYNC_WAIT) ?
298 XFS_LOG_FORCE | XFS_LOG_SYNC :
299 XFS_LOG_FORCE);
300 return 0; 299 return 0;
301} 300}
302 301
@@ -322,10 +321,6 @@ xfs_commit_dummy_trans(
322 struct xfs_inode *ip = mp->m_rootip; 321 struct xfs_inode *ip = mp->m_rootip;
323 struct xfs_trans *tp; 322 struct xfs_trans *tp;
324 int error; 323 int error;
325 int log_flags = XFS_LOG_FORCE;
326
327 if (flags & SYNC_WAIT)
328 log_flags |= XFS_LOG_SYNC;
329 324
330 /* 325 /*
331 * Put a dummy transaction in the log to tell recovery 326 * Put a dummy transaction in the log to tell recovery
@@ -347,11 +342,11 @@ xfs_commit_dummy_trans(
347 xfs_iunlock(ip, XFS_ILOCK_EXCL); 342 xfs_iunlock(ip, XFS_ILOCK_EXCL);
348 343
349 /* the log force ensures this transaction is pushed to disk */ 344 /* the log force ensures this transaction is pushed to disk */
350 xfs_log_force(mp, 0, log_flags); 345 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
351 return error; 346 return error;
352} 347}
353 348
354int 349STATIC int
355xfs_sync_fsdata( 350xfs_sync_fsdata(
356 struct xfs_mount *mp, 351 struct xfs_mount *mp,
357 int flags) 352 int flags)
@@ -367,7 +362,7 @@ xfs_sync_fsdata(
367 if (flags & SYNC_TRYLOCK) { 362 if (flags & SYNC_TRYLOCK) {
368 ASSERT(!(flags & SYNC_WAIT)); 363 ASSERT(!(flags & SYNC_WAIT));
369 364
370 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); 365 bp = xfs_getsb(mp, XBF_TRYLOCK);
371 if (!bp) 366 if (!bp)
372 goto out; 367 goto out;
373 368
@@ -387,7 +382,7 @@ xfs_sync_fsdata(
387 * become pinned in between there and here. 382 * become pinned in between there and here.
388 */ 383 */
389 if (XFS_BUF_ISPINNED(bp)) 384 if (XFS_BUF_ISPINNED(bp))
390 xfs_log_force(mp, 0, XFS_LOG_FORCE); 385 xfs_log_force(mp, 0);
391 } 386 }
392 387
393 388
@@ -448,9 +443,6 @@ xfs_quiesce_data(
448 xfs_sync_data(mp, SYNC_WAIT); 443 xfs_sync_data(mp, SYNC_WAIT);
449 xfs_qm_sync(mp, SYNC_WAIT); 444 xfs_qm_sync(mp, SYNC_WAIT);
450 445
451 /* drop inode references pinned by filestreams */
452 xfs_filestream_flush(mp);
453
454 /* write superblock and hoover up shutdown errors */ 446 /* write superblock and hoover up shutdown errors */
455 error = xfs_sync_fsdata(mp, SYNC_WAIT); 447 error = xfs_sync_fsdata(mp, SYNC_WAIT);
456 448
@@ -467,16 +459,18 @@ xfs_quiesce_fs(
467{ 459{
468 int count = 0, pincount; 460 int count = 0, pincount;
469 461
462 xfs_reclaim_inodes(mp, 0);
470 xfs_flush_buftarg(mp->m_ddev_targp, 0); 463 xfs_flush_buftarg(mp->m_ddev_targp, 0);
471 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
472 464
473 /* 465 /*
474 * This loop must run at least twice. The first instance of the loop 466 * This loop must run at least twice. The first instance of the loop
475 * will flush most meta data but that will generate more meta data 467 * will flush most meta data but that will generate more meta data
476 * (typically directory updates). Which then must be flushed and 468 * (typically directory updates). Which then must be flushed and
477 * logged before we can write the unmount record. 469 * logged before we can write the unmount record. We also so sync
470 * reclaim of inodes to catch any that the above delwri flush skipped.
478 */ 471 */
479 do { 472 do {
473 xfs_reclaim_inodes(mp, SYNC_WAIT);
480 xfs_sync_attr(mp, SYNC_WAIT); 474 xfs_sync_attr(mp, SYNC_WAIT);
481 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 475 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
482 if (!pincount) { 476 if (!pincount) {
@@ -575,7 +569,7 @@ xfs_flush_inodes(
575 igrab(inode); 569 igrab(inode);
576 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); 570 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
577 wait_for_completion(&completion); 571 wait_for_completion(&completion);
578 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); 572 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
579} 573}
580 574
581/* 575/*
@@ -591,8 +585,8 @@ xfs_sync_worker(
591 int error; 585 int error;
592 586
593 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 587 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
594 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 588 xfs_log_force(mp, 0);
595 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 589 xfs_reclaim_inodes(mp, 0);
596 /* dgc: errors ignored here */ 590 /* dgc: errors ignored here */
597 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 591 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
598 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -690,16 +684,17 @@ void
690xfs_inode_set_reclaim_tag( 684xfs_inode_set_reclaim_tag(
691 xfs_inode_t *ip) 685 xfs_inode_t *ip)
692{ 686{
693 xfs_mount_t *mp = ip->i_mount; 687 struct xfs_mount *mp = ip->i_mount;
694 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 688 struct xfs_perag *pag;
695 689
690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
696 read_lock(&pag->pag_ici_lock); 691 read_lock(&pag->pag_ici_lock);
697 spin_lock(&ip->i_flags_lock); 692 spin_lock(&ip->i_flags_lock);
698 __xfs_inode_set_reclaim_tag(pag, ip); 693 __xfs_inode_set_reclaim_tag(pag, ip);
699 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
700 spin_unlock(&ip->i_flags_lock); 695 spin_unlock(&ip->i_flags_lock);
701 read_unlock(&pag->pag_ici_lock); 696 read_unlock(&pag->pag_ici_lock);
702 xfs_put_perag(mp, pag); 697 xfs_perag_put(pag);
703} 698}
704 699
705void 700void
@@ -712,12 +707,64 @@ __xfs_inode_clear_reclaim_tag(
712 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
713} 708}
714 709
710/*
711 * Inodes in different states need to be treated differently, and the return
712 * value of xfs_iflush is not sufficient to get this right. The following table
713 * lists the inode states and the reclaim actions necessary for non-blocking
714 * reclaim:
715 *
716 *
717 * inode state iflush ret required action
718 * --------------- ---------- ---------------
719 * bad - reclaim
720 * shutdown EIO unpin and reclaim
721 * clean, unpinned 0 reclaim
722 * stale, unpinned 0 reclaim
723 * clean, pinned(*) 0 requeue
724 * stale, pinned EAGAIN requeue
725 * dirty, delwri ok 0 requeue
726 * dirty, delwri blocked EAGAIN requeue
727 * dirty, sync flush 0 reclaim
728 *
729 * (*) dgc: I don't think the clean, pinned state is possible but it gets
730 * handled anyway given the order of checks implemented.
731 *
732 * As can be seen from the table, the return value of xfs_iflush() is not
733 * sufficient to correctly decide the reclaim action here. The checks in
734 * xfs_iflush() might look like duplicates, but they are not.
735 *
736 * Also, because we get the flush lock first, we know that any inode that has
737 * been flushed delwri has had the flush completed by the time we check that
738 * the inode is clean. The clean inode check needs to be done before flushing
739 * the inode delwri otherwise we would loop forever requeuing clean inodes as
740 * we cannot tell apart a successful delwri flush and a clean inode from the
741 * return value of xfs_iflush().
742 *
743 * Note that because the inode is flushed delayed write by background
744 * writeback, the flush lock may already be held here and waiting on it can
745 * result in very long latencies. Hence for sync reclaims, where we wait on the
746 * flush lock, the caller should push out delayed write inodes first before
747 * trying to reclaim them to minimise the amount of time spent waiting. For
748 * background relaim, we just requeue the inode for the next pass.
749 *
750 * Hence the order of actions after gaining the locks should be:
751 * bad => reclaim
752 * shutdown => unpin and reclaim
753 * pinned, delwri => requeue
754 * pinned, sync => unpin
755 * stale => reclaim
756 * clean => reclaim
757 * dirty, delwri => flush and requeue
758 * dirty, sync => flush, wait and reclaim
759 */
715STATIC int 760STATIC int
716xfs_reclaim_inode( 761xfs_reclaim_inode(
717 struct xfs_inode *ip, 762 struct xfs_inode *ip,
718 struct xfs_perag *pag, 763 struct xfs_perag *pag,
719 int sync_mode) 764 int sync_mode)
720{ 765{
766 int error = 0;
767
721 /* 768 /*
722 * The radix tree lock here protects a thread in xfs_iget from racing 769 * The radix tree lock here protects a thread in xfs_iget from racing
723 * with us starting reclaim on the inode. Once we have the 770 * with us starting reclaim on the inode. Once we have the
@@ -735,33 +782,70 @@ xfs_reclaim_inode(
735 spin_unlock(&ip->i_flags_lock); 782 spin_unlock(&ip->i_flags_lock);
736 write_unlock(&pag->pag_ici_lock); 783 write_unlock(&pag->pag_ici_lock);
737 784
738 /*
739 * If the inode is still dirty, then flush it out. If the inode
740 * is not in the AIL, then it will be OK to flush it delwri as
741 * long as xfs_iflush() does not keep any references to the inode.
742 * We leave that decision up to xfs_iflush() since it has the
743 * knowledge of whether it's OK to simply do a delwri flush of
744 * the inode or whether we need to wait until the inode is
745 * pulled from the AIL.
746 * We get the flush lock regardless, though, just to make sure
747 * we don't free it while it is being flushed.
748 */
749 xfs_ilock(ip, XFS_ILOCK_EXCL); 785 xfs_ilock(ip, XFS_ILOCK_EXCL);
750 xfs_iflock(ip); 786 if (!xfs_iflock_nowait(ip)) {
787 if (!(sync_mode & SYNC_WAIT))
788 goto out;
789 xfs_iflock(ip);
790 }
791
792 if (is_bad_inode(VFS_I(ip)))
793 goto reclaim;
794 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
795 xfs_iunpin_wait(ip);
796 goto reclaim;
797 }
798 if (xfs_ipincount(ip)) {
799 if (!(sync_mode & SYNC_WAIT)) {
800 xfs_ifunlock(ip);
801 goto out;
802 }
803 xfs_iunpin_wait(ip);
804 }
805 if (xfs_iflags_test(ip, XFS_ISTALE))
806 goto reclaim;
807 if (xfs_inode_clean(ip))
808 goto reclaim;
809
810 /* Now we have an inode that needs flushing */
811 error = xfs_iflush(ip, sync_mode);
812 if (sync_mode & SYNC_WAIT) {
813 xfs_iflock(ip);
814 goto reclaim;
815 }
751 816
752 /* 817 /*
753 * In the case of a forced shutdown we rely on xfs_iflush() to 818 * When we have to flush an inode but don't have SYNC_WAIT set, we
754 * wait for the inode to be unpinned before returning an error. 819 * flush the inode out using a delwri buffer and wait for the next
820 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error
823 * is permanent then the next sync reclaim will relcaim the inode and
824 * pass on the error.
755 */ 825 */
756 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { 826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
757 /* synchronize with xfs_iflush_done */ 827 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
758 xfs_iflock(ip); 828 "inode 0x%llx background reclaim flush failed with %d",
759 xfs_ifunlock(ip); 829 (long long)ip->i_ino, error);
760 } 830 }
831out:
832 xfs_iflags_clear(ip, XFS_IRECLAIM);
833 xfs_iunlock(ip, XFS_ILOCK_EXCL);
834 /*
835 * We could return EAGAIN here to make reclaim rescan the inode tree in
836 * a short while. However, this just burns CPU time scanning the tree
837 * waiting for IO to complete and xfssyncd never goes back to the idle
838 * state. Instead, return 0 to let the next scheduled background reclaim
839 * attempt to reclaim the inode again.
840 */
841 return 0;
761 842
843reclaim:
844 xfs_ifunlock(ip);
762 xfs_iunlock(ip, XFS_ILOCK_EXCL); 845 xfs_iunlock(ip, XFS_ILOCK_EXCL);
763 xfs_ireclaim(ip); 846 xfs_ireclaim(ip);
764 return 0; 847 return error;
848
765} 849}
766 850
767int 851int
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index ea932b43335d..d480c346cabb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -37,7 +37,6 @@ void xfs_syncd_stop(struct xfs_mount *mp);
37 37
38int xfs_sync_attr(struct xfs_mount *mp, int flags); 38int xfs_sync_attr(struct xfs_mount *mp, int flags);
39int xfs_sync_data(struct xfs_mount *mp, int flags); 39int xfs_sync_data(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41 40
42int xfs_quiesce_data(struct xfs_mount *mp); 41int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp); 42void xfs_quiesce_attr(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index c22a608321a3..a4574dcf5065 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -78,6 +78,33 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
78 ) 78 )
79) 79)
80 80
81#define DEFINE_PERAG_REF_EVENT(name) \
82TRACE_EVENT(name, \
83 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
84 unsigned long caller_ip), \
85 TP_ARGS(mp, agno, refcount, caller_ip), \
86 TP_STRUCT__entry( \
87 __field(dev_t, dev) \
88 __field(xfs_agnumber_t, agno) \
89 __field(int, refcount) \
90 __field(unsigned long, caller_ip) \
91 ), \
92 TP_fast_assign( \
93 __entry->dev = mp->m_super->s_dev; \
94 __entry->agno = agno; \
95 __entry->refcount = refcount; \
96 __entry->caller_ip = caller_ip; \
97 ), \
98 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
99 MAJOR(__entry->dev), MINOR(__entry->dev), \
100 __entry->agno, \
101 __entry->refcount, \
102 (char *)__entry->caller_ip) \
103);
104
105DEFINE_PERAG_REF_EVENT(xfs_perag_get)
106DEFINE_PERAG_REF_EVENT(xfs_perag_put)
107
81#define DEFINE_ATTR_LIST_EVENT(name) \ 108#define DEFINE_ATTR_LIST_EVENT(name) \
82DEFINE_EVENT(xfs_attr_list_class, name, \ 109DEFINE_EVENT(xfs_attr_list_class, name, \
83 TP_PROTO(struct xfs_attr_list_context *ctx), \ 110 TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -456,6 +483,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
456DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); 483DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
457DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); 484DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
458DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); 485DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
459DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); 487DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
460DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); 488DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
461DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); 489DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -1414,6 +1442,59 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
1414 __entry->count) 1442 __entry->count)
1415); 1443);
1416 1444
1445#define XFS_SWAPEXT_INODES \
1446 { 0, "target" }, \
1447 { 1, "temp" }
1448
1449#define XFS_INODE_FORMAT_STR \
1450 { 0, "invalid" }, \
1451 { 1, "local" }, \
1452 { 2, "extent" }, \
1453 { 3, "btree" }
1454
1455DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1456 TP_PROTO(struct xfs_inode *ip, int which),
1457 TP_ARGS(ip, which),
1458 TP_STRUCT__entry(
1459 __field(dev_t, dev)
1460 __field(int, which)
1461 __field(xfs_ino_t, ino)
1462 __field(int, format)
1463 __field(int, nex)
1464 __field(int, max_nex)
1465 __field(int, broot_size)
1466 __field(int, fork_off)
1467 ),
1468 TP_fast_assign(
1469 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1470 __entry->which = which;
1471 __entry->ino = ip->i_ino;
1472 __entry->format = ip->i_d.di_format;
1473 __entry->nex = ip->i_d.di_nextents;
1474 __entry->max_nex = ip->i_df.if_ext_max;
1475 __entry->broot_size = ip->i_df.if_broot_bytes;
1476 __entry->fork_off = XFS_IFORK_BOFF(ip);
1477 ),
1478 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
1479 "Max in-fork extents %d, broot size %d, fork offset %d",
1480 MAJOR(__entry->dev), MINOR(__entry->dev),
1481 __entry->ino,
1482 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
1483 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
1484 __entry->nex,
1485 __entry->max_nex,
1486 __entry->broot_size,
1487 __entry->fork_off)
1488)
1489
1490#define DEFINE_SWAPEXT_EVENT(name) \
1491DEFINE_EVENT(xfs_swap_extent_class, name, \
1492 TP_PROTO(struct xfs_inode *ip, int which), \
1493 TP_ARGS(ip, which))
1494
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497
1417#endif /* _TRACE_XFS_H */ 1498#endif /* _TRACE_XFS_H */
1418 1499
1419#undef TRACE_INCLUDE_PATH 1500#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 0b1878857fc3..fa01b9daba6b 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -45,7 +45,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
45 value = NULL; 45 value = NULL;
46 } 46 }
47 47
48 error = -xfs_attr_get(ip, name, value, &asize, xflags); 48 error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
49 if (error) 49 if (error)
50 return error; 50 return error;
51 return asize; 51 return asize;
@@ -67,8 +67,9 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
67 xflags |= ATTR_REPLACE; 67 xflags |= ATTR_REPLACE;
68 68
69 if (!value) 69 if (!value)
70 return -xfs_attr_remove(ip, name, xflags); 70 return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
71 return -xfs_attr_set(ip, name, (void *)value, size, xflags); 71 return -xfs_attr_set(ip, (unsigned char *)name,
72 (void *)value, size, xflags);
72} 73}
73 74
74static struct xattr_handler xfs_xattr_user_handler = { 75static struct xattr_handler xfs_xattr_user_handler = {
@@ -124,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags)
124} 125}
125 126
126static int 127static int
127xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, 128xfs_xattr_put_listent(
128 char *name, int namelen, int valuelen, char *value) 129 struct xfs_attr_list_context *context,
130 int flags,
131 unsigned char *name,
132 int namelen,
133 int valuelen,
134 unsigned char *value)
129{ 135{
130 unsigned int prefix_len = xfs_xattr_prefix_len(flags); 136 unsigned int prefix_len = xfs_xattr_prefix_len(flags);
131 char *offset; 137 char *offset;
@@ -148,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
148 offset = (char *)context->alist + context->count; 154 offset = (char *)context->alist + context->count;
149 strncpy(offset, xfs_xattr_prefix(flags), prefix_len); 155 strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
150 offset += prefix_len; 156 offset += prefix_len;
151 strncpy(offset, name, namelen); /* real name */ 157 strncpy(offset, (char *)name, namelen); /* real name */
152 offset += namelen; 158 offset += namelen;
153 *offset = '\0'; 159 *offset = '\0';
154 context->count += prefix_len + namelen + 1; 160 context->count += prefix_len + namelen + 1;
@@ -156,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
156} 162}
157 163
158static int 164static int
159xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags, 165xfs_xattr_put_listent_sizes(
160 char *name, int namelen, int valuelen, char *value) 166 struct xfs_attr_list_context *context,
167 int flags,
168 unsigned char *name,
169 int namelen,
170 int valuelen,
171 unsigned char *value)
161{ 172{
162 context->count += xfs_xattr_prefix_len(flags) + namelen + 1; 173 context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
163 return 0; 174 return 0;
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d7c7eea09fc2..5f79dd78626b 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1187,7 +1187,7 @@ xfs_qm_dqflush(
1187 * block, nada. 1187 * block, nada.
1188 */ 1188 */
1189 if (!XFS_DQ_IS_DIRTY(dqp) || 1189 if (!XFS_DQ_IS_DIRTY(dqp) ||
1190 (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) { 1190 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
1191 xfs_dqfunlock(dqp); 1191 xfs_dqfunlock(dqp);
1192 return 0; 1192 return 0;
1193 } 1193 }
@@ -1248,23 +1248,20 @@ xfs_qm_dqflush(
1248 */ 1248 */
1249 if (XFS_BUF_ISPINNED(bp)) { 1249 if (XFS_BUF_ISPINNED(bp)) {
1250 trace_xfs_dqflush_force(dqp); 1250 trace_xfs_dqflush_force(dqp);
1251 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 1251 xfs_log_force(mp, 0);
1252 } 1252 }
1253 1253
1254 if (flags & XFS_QMOPT_DELWRI) { 1254 if (flags & SYNC_WAIT)
1255 xfs_bdwrite(mp, bp);
1256 } else if (flags & XFS_QMOPT_ASYNC) {
1257 error = xfs_bawrite(mp, bp);
1258 } else {
1259 error = xfs_bwrite(mp, bp); 1255 error = xfs_bwrite(mp, bp);
1260 } 1256 else
1257 xfs_bdwrite(mp, bp);
1261 1258
1262 trace_xfs_dqflush_done(dqp); 1259 trace_xfs_dqflush_done(dqp);
1263 1260
1264 /* 1261 /*
1265 * dqp is still locked, but caller is free to unlock it now. 1262 * dqp is still locked, but caller is free to unlock it now.
1266 */ 1263 */
1267 return (error); 1264 return error;
1268 1265
1269} 1266}
1270 1267
@@ -1445,7 +1442,7 @@ xfs_qm_dqpurge(
1445 * We don't care about getting disk errors here. We need 1442 * We don't care about getting disk errors here. We need
1446 * to purge this dquot anyway, so we go ahead regardless. 1443 * to purge this dquot anyway, so we go ahead regardless.
1447 */ 1444 */
1448 error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); 1445 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1449 if (error) 1446 if (error)
1450 xfs_fs_cmn_err(CE_WARN, mp, 1447 xfs_fs_cmn_err(CE_WARN, mp,
1451 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1448 "xfs_qm_dqpurge: dquot %p flush failed", dqp);
@@ -1529,25 +1526,17 @@ xfs_qm_dqflock_pushbuf_wait(
1529 * the flush lock when the I/O completes. 1526 * the flush lock when the I/O completes.
1530 */ 1527 */
1531 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
1532 XFS_QI_DQCHUNKLEN(dqp->q_mount), 1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
1533 XFS_INCORE_TRYLOCK); 1530 if (!bp)
1534 if (bp != NULL) { 1531 goto out_lock;
1535 if (XFS_BUF_ISDELAYWRITE(bp)) { 1532
1536 int error; 1533 if (XFS_BUF_ISDELAYWRITE(bp)) {
1537 if (XFS_BUF_ISPINNED(bp)) { 1534 if (XFS_BUF_ISPINNED(bp))
1538 xfs_log_force(dqp->q_mount, 1535 xfs_log_force(dqp->q_mount, 0);
1539 (xfs_lsn_t)0, 1536 xfs_buf_delwri_promote(bp);
1540 XFS_LOG_FORCE); 1537 wake_up_process(bp->b_target->bt_task);
1541 }
1542 error = xfs_bawrite(dqp->q_mount, bp);
1543 if (error)
1544 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
1545 "xfs_qm_dqflock_pushbuf_wait: "
1546 "pushbuf error %d on dqp %p, bp %p",
1547 error, dqp, bp);
1548 } else {
1549 xfs_buf_relse(bp);
1550 }
1551 } 1538 }
1539 xfs_buf_relse(bp);
1540out_lock:
1552 xfs_dqflock(dqp); 1541 xfs_dqflock(dqp);
1553} 1542}
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index d0d4a9a0bbd7..4e4ee9a57194 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -74,11 +74,11 @@ xfs_qm_dquot_logitem_format(
74 74
75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; 75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
76 logvec->i_len = sizeof(xfs_dq_logformat_t); 76 logvec->i_len = sizeof(xfs_dq_logformat_t);
77 XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT); 77 logvec->i_type = XLOG_REG_TYPE_QFORMAT;
78 logvec++; 78 logvec++;
79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; 79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
80 logvec->i_len = sizeof(xfs_disk_dquot_t); 80 logvec->i_len = sizeof(xfs_disk_dquot_t);
81 XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT); 81 logvec->i_type = XLOG_REG_TYPE_DQUOT;
82 82
83 ASSERT(2 == logitem->qli_item.li_desc->lid_size); 83 ASSERT(2 == logitem->qli_item.li_desc->lid_size);
84 logitem->qli_format.qlf_size = 2; 84 logitem->qli_format.qlf_size = 2;
@@ -153,7 +153,7 @@ xfs_qm_dquot_logitem_push(
153 * lock without sleeping, then there must not have been 153 * lock without sleeping, then there must not have been
154 * anyone in the process of flushing the dquot. 154 * anyone in the process of flushing the dquot.
155 */ 155 */
156 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 156 error = xfs_qm_dqflush(dqp, 0);
157 if (error) 157 if (error)
158 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 158 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
159 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 159 "xfs_qm_dquot_logitem_push: push error %d on dqp %p",
@@ -190,7 +190,7 @@ xfs_qm_dqunpin_wait(
190 /* 190 /*
191 * Give the log a push so we don't wait here too long. 191 * Give the log a push so we don't wait here too long.
192 */ 192 */
193 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); 193 xfs_log_force(dqp->q_mount, 0);
194 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); 194 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
195} 195}
196 196
@@ -212,68 +212,31 @@ xfs_qm_dquot_logitem_pushbuf(
212 xfs_dquot_t *dqp; 212 xfs_dquot_t *dqp;
213 xfs_mount_t *mp; 213 xfs_mount_t *mp;
214 xfs_buf_t *bp; 214 xfs_buf_t *bp;
215 uint dopush;
216 215
217 dqp = qip->qli_dquot; 216 dqp = qip->qli_dquot;
218 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 217 ASSERT(XFS_DQ_IS_LOCKED(dqp));
219 218
220 /* 219 /*
221 * The qli_pushbuf_flag keeps others from
222 * trying to duplicate our effort.
223 */
224 ASSERT(qip->qli_pushbuf_flag != 0);
225 ASSERT(qip->qli_push_owner == current_pid());
226
227 /*
228 * If flushlock isn't locked anymore, chances are that the 220 * If flushlock isn't locked anymore, chances are that the
229 * inode flush completed and the inode was taken off the AIL. 221 * inode flush completed and the inode was taken off the AIL.
230 * So, just get out. 222 * So, just get out.
231 */ 223 */
232 if (completion_done(&dqp->q_flush) || 224 if (completion_done(&dqp->q_flush) ||
233 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 225 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
234 qip->qli_pushbuf_flag = 0;
235 xfs_dqunlock(dqp); 226 xfs_dqunlock(dqp);
236 return; 227 return;
237 } 228 }
238 mp = dqp->q_mount; 229 mp = dqp->q_mount;
239 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
240 XFS_QI_DQCHUNKLEN(mp), 231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
241 XFS_INCORE_TRYLOCK); 232 xfs_dqunlock(dqp);
242 if (bp != NULL) { 233 if (!bp)
243 if (XFS_BUF_ISDELAYWRITE(bp)) {
244 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
245 !completion_done(&dqp->q_flush));
246 qip->qli_pushbuf_flag = 0;
247 xfs_dqunlock(dqp);
248
249 if (XFS_BUF_ISPINNED(bp)) {
250 xfs_log_force(mp, (xfs_lsn_t)0,
251 XFS_LOG_FORCE);
252 }
253 if (dopush) {
254 int error;
255#ifdef XFSRACEDEBUG
256 delay_for_intr();
257 delay(300);
258#endif
259 error = xfs_bawrite(mp, bp);
260 if (error)
261 xfs_fs_cmn_err(CE_WARN, mp,
262 "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
263 error, qip, bp);
264 } else {
265 xfs_buf_relse(bp);
266 }
267 } else {
268 qip->qli_pushbuf_flag = 0;
269 xfs_dqunlock(dqp);
270 xfs_buf_relse(bp);
271 }
272 return; 234 return;
273 } 235 if (XFS_BUF_ISDELAYWRITE(bp))
236 xfs_buf_delwri_promote(bp);
237 xfs_buf_relse(bp);
238 return;
274 239
275 qip->qli_pushbuf_flag = 0;
276 xfs_dqunlock(dqp);
277} 240}
278 241
279/* 242/*
@@ -291,50 +254,24 @@ xfs_qm_dquot_logitem_trylock(
291 xfs_dq_logitem_t *qip) 254 xfs_dq_logitem_t *qip)
292{ 255{
293 xfs_dquot_t *dqp; 256 xfs_dquot_t *dqp;
294 uint retval;
295 257
296 dqp = qip->qli_dquot; 258 dqp = qip->qli_dquot;
297 if (atomic_read(&dqp->q_pincount) > 0) 259 if (atomic_read(&dqp->q_pincount) > 0)
298 return (XFS_ITEM_PINNED); 260 return XFS_ITEM_PINNED;
299 261
300 if (! xfs_qm_dqlock_nowait(dqp)) 262 if (! xfs_qm_dqlock_nowait(dqp))
301 return (XFS_ITEM_LOCKED); 263 return XFS_ITEM_LOCKED;
302 264
303 retval = XFS_ITEM_SUCCESS;
304 if (!xfs_dqflock_nowait(dqp)) { 265 if (!xfs_dqflock_nowait(dqp)) {
305 /* 266 /*
306 * The dquot is already being flushed. It may have been 267 * dquot has already been flushed to the backing buffer,
307 * flushed delayed write, however, and we don't want to 268 * leave it locked, pushbuf routine will unlock it.
308 * get stuck waiting for that to complete. So, we want to check
309 * to see if we can lock the dquot's buffer without sleeping.
310 * If we can and it is marked for delayed write, then we
311 * hold it and send it out from the push routine. We don't
312 * want to do that now since we might sleep in the device
313 * strategy routine. We also don't want to grab the buffer lock
314 * here because we'd like not to call into the buffer cache
315 * while holding the AIL lock.
316 * Make sure to only return PUSHBUF if we set pushbuf_flag
317 * ourselves. If someone else is doing it then we don't
318 * want to go to the push routine and duplicate their efforts.
319 */ 269 */
320 if (qip->qli_pushbuf_flag == 0) { 270 return XFS_ITEM_PUSHBUF;
321 qip->qli_pushbuf_flag = 1;
322 ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
323#ifdef DEBUG
324 qip->qli_push_owner = current_pid();
325#endif
326 /*
327 * The dquot is left locked.
328 */
329 retval = XFS_ITEM_PUSHBUF;
330 } else {
331 retval = XFS_ITEM_FLUSHING;
332 xfs_dqunlock_nonotify(dqp);
333 }
334 } 271 }
335 272
336 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); 273 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
337 return (retval); 274 return XFS_ITEM_SUCCESS;
338} 275}
339 276
340 277
@@ -467,7 +404,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf,
467 404
468 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); 405 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
469 log_vector->i_len = sizeof(xfs_qoff_logitem_t); 406 log_vector->i_len = sizeof(xfs_qoff_logitem_t);
470 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF); 407 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
471 qf->qql_format.qf_size = 1; 408 qf->qql_format.qf_size = 1;
472} 409}
473 410
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
index 5a632531f843..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem {
27 xfs_log_item_t qli_item; /* common portion */ 27 xfs_log_item_t qli_item; /* common portion */
28 struct xfs_dquot *qli_dquot; /* dquot ptr */ 28 struct xfs_dquot *qli_dquot; /* dquot ptr */
29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ 29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
30 unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */
31#ifdef DEBUG
32 uint64_t qli_push_owner;
33#endif
34 xfs_dq_logformat_t qli_format; /* logged structure */ 30 xfs_dq_logformat_t qli_format; /* logged structure */
35} xfs_dq_logitem_t; 31} xfs_dq_logitem_t;
36 32
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9e627a8b5b0e..417e61e3d9dd 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -118,9 +118,14 @@ xfs_Gqm_init(void)
118 */ 118 */
119 udqhash = kmem_zalloc_greedy(&hsize, 119 udqhash = kmem_zalloc_greedy(&hsize,
120 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), 120 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
121 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t), 121 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
122 KM_SLEEP | KM_MAYFAIL | KM_LARGE); 122 if (!udqhash)
123 gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE); 123 goto out;
124
125 gdqhash = kmem_zalloc_large(hsize);
126 if (!gdqhash)
127 goto out_free_udqhash;
128
124 hsize /= sizeof(xfs_dqhash_t); 129 hsize /= sizeof(xfs_dqhash_t);
125 ndquot = hsize << 8; 130 ndquot = hsize << 8;
126 131
@@ -170,6 +175,11 @@ xfs_Gqm_init(void)
170 mutex_init(&qcheck_lock); 175 mutex_init(&qcheck_lock);
171#endif 176#endif
172 return xqm; 177 return xqm;
178
179 out_free_udqhash:
180 kmem_free_large(udqhash);
181 out:
182 return NULL;
173} 183}
174 184
175/* 185/*
@@ -189,8 +199,8 @@ xfs_qm_destroy(
189 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); 199 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
190 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); 200 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
191 } 201 }
192 kmem_free(xqm->qm_usr_dqhtable); 202 kmem_free_large(xqm->qm_usr_dqhtable);
193 kmem_free(xqm->qm_grp_dqhtable); 203 kmem_free_large(xqm->qm_grp_dqhtable);
194 xqm->qm_usr_dqhtable = NULL; 204 xqm->qm_usr_dqhtable = NULL;
195 xqm->qm_grp_dqhtable = NULL; 205 xqm->qm_grp_dqhtable = NULL;
196 xqm->qm_dqhashmask = 0; 206 xqm->qm_dqhashmask = 0;
@@ -219,8 +229,12 @@ xfs_qm_hold_quotafs_ref(
219 */ 229 */
220 mutex_lock(&xfs_Gqm_lock); 230 mutex_lock(&xfs_Gqm_lock);
221 231
222 if (xfs_Gqm == NULL) 232 if (!xfs_Gqm) {
223 xfs_Gqm = xfs_Gqm_init(); 233 xfs_Gqm = xfs_Gqm_init();
234 if (!xfs_Gqm)
235 return ENOMEM;
236 }
237
224 /* 238 /*
225 * We can keep a list of all filesystems with quotas mounted for 239 * We can keep a list of all filesystems with quotas mounted for
226 * debugging and statistical purposes, but ... 240 * debugging and statistical purposes, but ...
@@ -436,7 +450,7 @@ xfs_qm_unmount_quotas(
436STATIC int 450STATIC int
437xfs_qm_dqflush_all( 451xfs_qm_dqflush_all(
438 xfs_mount_t *mp, 452 xfs_mount_t *mp,
439 int flags) 453 int sync_mode)
440{ 454{
441 int recl; 455 int recl;
442 xfs_dquot_t *dqp; 456 xfs_dquot_t *dqp;
@@ -472,7 +486,7 @@ again:
472 * across a disk write. 486 * across a disk write.
473 */ 487 */
474 xfs_qm_mplist_unlock(mp); 488 xfs_qm_mplist_unlock(mp);
475 error = xfs_qm_dqflush(dqp, flags); 489 error = xfs_qm_dqflush(dqp, sync_mode);
476 xfs_dqunlock(dqp); 490 xfs_dqunlock(dqp);
477 if (error) 491 if (error)
478 return error; 492 return error;
@@ -912,13 +926,11 @@ xfs_qm_sync(
912{ 926{
913 int recl, restarts; 927 int recl, restarts;
914 xfs_dquot_t *dqp; 928 xfs_dquot_t *dqp;
915 uint flush_flags;
916 int error; 929 int error;
917 930
918 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
919 return 0; 932 return 0;
920 933
921 flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
922 restarts = 0; 934 restarts = 0;
923 935
924 again: 936 again:
@@ -978,7 +990,7 @@ xfs_qm_sync(
978 * across a disk write 990 * across a disk write
979 */ 991 */
980 xfs_qm_mplist_unlock(mp); 992 xfs_qm_mplist_unlock(mp);
981 error = xfs_qm_dqflush(dqp, flush_flags); 993 error = xfs_qm_dqflush(dqp, flags);
982 xfs_dqunlock(dqp); 994 xfs_dqunlock(dqp);
983 if (error && XFS_FORCED_SHUTDOWN(mp)) 995 if (error && XFS_FORCED_SHUTDOWN(mp))
984 return 0; /* Need to prevent umount failure */ 996 return 0; /* Need to prevent umount failure */
@@ -1782,7 +1794,7 @@ xfs_qm_quotacheck(
1782 * successfully. 1794 * successfully.
1783 */ 1795 */
1784 if (!error) 1796 if (!error)
1785 error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); 1797 error = xfs_qm_dqflush_all(mp, 0);
1786 1798
1787 /* 1799 /*
1788 * We can get this error if we couldn't do a dquot allocation inside 1800 * We can get this error if we couldn't do a dquot allocation inside
@@ -2004,7 +2016,7 @@ xfs_qm_shake_freelist(
2004 * We flush it delayed write, so don't bother 2016 * We flush it delayed write, so don't bother
2005 * releasing the mplock. 2017 * releasing the mplock.
2006 */ 2018 */
2007 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2019 error = xfs_qm_dqflush(dqp, 0);
2008 if (error) { 2020 if (error) {
2009 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2010 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp);
@@ -2187,7 +2199,7 @@ xfs_qm_dqreclaim_one(void)
2187 * We flush it delayed write, so don't bother 2199 * We flush it delayed write, so don't bother
2188 * releasing the freelist lock. 2200 * releasing the freelist lock.
2189 */ 2201 */
2190 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2202 error = xfs_qm_dqflush(dqp, 0);
2191 if (error) { 2203 if (error) {
2192 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2193 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index a5346630dfae..97b410c12794 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
59 be64_to_cpu(dp->d_blk_hardlimit); 59 be64_to_cpu(dp->d_blk_hardlimit);
60 if (limit && statp->f_blocks > limit) { 60 if (limit && statp->f_blocks > limit) {
61 statp->f_blocks = limit; 61 statp->f_blocks = limit;
62 statp->f_bfree = 62 statp->f_bfree = statp->f_bavail =
63 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? 63 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
64 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; 64 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
65 } 65 }
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 873e07e29074..5d0ee8d492db 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1192,9 +1192,9 @@ xfs_qm_internalqcheck(
1192 if (! XFS_IS_QUOTA_ON(mp)) 1192 if (! XFS_IS_QUOTA_ON(mp))
1193 return XFS_ERROR(ESRCH); 1193 return XFS_ERROR(ESRCH);
1194 1194
1195 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1195 xfs_log_force(mp, XFS_LOG_SYNC);
1196 XFS_bflush(mp->m_ddev_targp); 1196 XFS_bflush(mp->m_ddev_targp);
1197 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1197 xfs_log_force(mp, XFS_LOG_SYNC);
1198 XFS_bflush(mp->m_ddev_targp); 1198 XFS_bflush(mp->m_ddev_targp);
1199 1199
1200 mutex_lock(&qcheck_lock); 1200 mutex_lock(&qcheck_lock);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 97ac9640be98..c3ab75cb1d9a 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -589,12 +589,18 @@ xfs_trans_unreserve_and_mod_dquots(
589 } 589 }
590} 590}
591 591
592STATIC int 592STATIC void
593xfs_quota_error(uint flags) 593xfs_quota_warn(
594 struct xfs_mount *mp,
595 struct xfs_dquot *dqp,
596 int type)
594{ 597{
595 if (flags & XFS_QMOPT_ENOSPC) 598 /* no warnings for project quotas - we just return ENOSPC later */
596 return ENOSPC; 599 if (dqp->dq_flags & XFS_DQ_PROJ)
597 return EDQUOT; 600 return;
601 quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
602 be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
603 type);
598} 604}
599 605
600/* 606/*
@@ -612,7 +618,6 @@ xfs_trans_dqresv(
612 long ninos, 618 long ninos,
613 uint flags) 619 uint flags)
614{ 620{
615 int error;
616 xfs_qcnt_t hardlimit; 621 xfs_qcnt_t hardlimit;
617 xfs_qcnt_t softlimit; 622 xfs_qcnt_t softlimit;
618 time_t timer; 623 time_t timer;
@@ -649,7 +654,6 @@ xfs_trans_dqresv(
649 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount);
650 resbcountp = &dqp->q_res_rtbcount; 655 resbcountp = &dqp->q_res_rtbcount;
651 } 656 }
652 error = 0;
653 657
654 if ((flags & XFS_QMOPT_FORCE_RES) == 0 && 658 if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
655 dqp->q_core.d_id && 659 dqp->q_core.d_id &&
@@ -667,18 +671,20 @@ xfs_trans_dqresv(
667 * nblks. 671 * nblks.
668 */ 672 */
669 if (hardlimit > 0ULL && 673 if (hardlimit > 0ULL &&
670 (hardlimit <= nblks + *resbcountp)) { 674 hardlimit <= nblks + *resbcountp) {
671 error = xfs_quota_error(flags); 675 xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
672 goto error_return; 676 goto error_return;
673 } 677 }
674
675 if (softlimit > 0ULL && 678 if (softlimit > 0ULL &&
676 (softlimit <= nblks + *resbcountp)) { 679 softlimit <= nblks + *resbcountp) {
677 if ((timer != 0 && get_seconds() > timer) || 680 if ((timer != 0 && get_seconds() > timer) ||
678 (warns != 0 && warns >= warnlimit)) { 681 (warns != 0 && warns >= warnlimit)) {
679 error = xfs_quota_error(flags); 682 xfs_quota_warn(mp, dqp,
683 QUOTA_NL_BSOFTLONGWARN);
680 goto error_return; 684 goto error_return;
681 } 685 }
686
687 xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
682 } 688 }
683 } 689 }
684 if (ninos > 0) { 690 if (ninos > 0) {
@@ -692,15 +698,19 @@ xfs_trans_dqresv(
692 softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); 698 softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
693 if (!softlimit) 699 if (!softlimit)
694 softlimit = q->qi_isoftlimit; 700 softlimit = q->qi_isoftlimit;
701
695 if (hardlimit > 0ULL && count >= hardlimit) { 702 if (hardlimit > 0ULL && count >= hardlimit) {
696 error = xfs_quota_error(flags); 703 xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
697 goto error_return; 704 goto error_return;
698 } else if (softlimit > 0ULL && count >= softlimit) { 705 }
699 if ((timer != 0 && get_seconds() > timer) || 706 if (softlimit > 0ULL && count >= softlimit) {
707 if ((timer != 0 && get_seconds() > timer) ||
700 (warns != 0 && warns >= warnlimit)) { 708 (warns != 0 && warns >= warnlimit)) {
701 error = xfs_quota_error(flags); 709 xfs_quota_warn(mp, dqp,
710 QUOTA_NL_ISOFTLONGWARN);
702 goto error_return; 711 goto error_return;
703 } 712 }
713 xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
704 } 714 }
705 } 715 }
706 } 716 }
@@ -736,9 +746,14 @@ xfs_trans_dqresv(
736 ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); 746 ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
737 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); 747 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
738 748
749 xfs_dqunlock(dqp);
750 return 0;
751
739error_return: 752error_return:
740 xfs_dqunlock(dqp); 753 xfs_dqunlock(dqp);
741 return error; 754 if (flags & XFS_QMOPT_ENOSPC)
755 return ENOSPC;
756 return EDQUOT;
742} 757}
743 758
744 759
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 00fd357c3e46..d13eeba2c8f8 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -36,8 +36,8 @@ struct xfs_acl {
36}; 36};
37 37
38/* On-disk XFS extended attribute names */ 38/* On-disk XFS extended attribute names */
39#define SGI_ACL_FILE "SGI_ACL_FILE" 39#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
40#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" 40#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
41#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) 41#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
43 43
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 6702bd865811..b1a5a1ff88ea 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,17 +187,13 @@ typedef struct xfs_perag_busy {
187/* 187/*
188 * Per-ag incore structure, copies of information in agf and agi, 188 * Per-ag incore structure, copies of information in agf and agi,
189 * to improve the performance of allocation group selection. 189 * to improve the performance of allocation group selection.
190 *
191 * pick sizes which fit in allocation buckets well
192 */ 190 */
193#if (BITS_PER_LONG == 32)
194#define XFS_PAGB_NUM_SLOTS 84
195#elif (BITS_PER_LONG == 64)
196#define XFS_PAGB_NUM_SLOTS 128 191#define XFS_PAGB_NUM_SLOTS 128
197#endif
198 192
199typedef struct xfs_perag 193typedef struct xfs_perag {
200{ 194 struct xfs_mount *pag_mount; /* owner filesystem */
195 xfs_agnumber_t pag_agno; /* AG this structure belongs to */
196 atomic_t pag_ref; /* perag reference count */
201 char pagf_init; /* this agf's entry is initialized */ 197 char pagf_init; /* this agf's entry is initialized */
202 char pagi_init; /* this agi's entry is initialized */ 198 char pagi_init; /* this agi's entry is initialized */
203 char pagf_metadata; /* the agf is preferred to be metadata */ 199 char pagf_metadata; /* the agf is preferred to be metadata */
@@ -210,8 +206,6 @@ typedef struct xfs_perag
210 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ 206 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
211 xfs_agino_t pagi_freecount; /* number of free inodes */ 207 xfs_agino_t pagi_freecount; /* number of free inodes */
212 xfs_agino_t pagi_count; /* number of allocated inodes */ 208 xfs_agino_t pagi_count; /* number of allocated inodes */
213 int pagb_count; /* pagb slots in use */
214 xfs_perag_busy_t *pagb_list; /* unstable blocks */
215 209
216 /* 210 /*
217 * Inode allocation search lookup optimisation. 211 * Inode allocation search lookup optimisation.
@@ -230,6 +224,8 @@ typedef struct xfs_perag
230 rwlock_t pag_ici_lock; /* incore inode lock */ 224 rwlock_t pag_ici_lock; /* incore inode lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 225 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232#endif 226#endif
227 int pagb_count; /* pagb slots in use */
228 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
233} xfs_perag_t; 229} xfs_perag_t;
234 230
235/* 231/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 275b1f4f9430..94cddbfb2560 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1662,11 +1662,13 @@ xfs_free_ag_extent(
1662 xfs_agf_t *agf; 1662 xfs_agf_t *agf;
1663 xfs_perag_t *pag; /* per allocation group data */ 1663 xfs_perag_t *pag; /* per allocation group data */
1664 1664
1665 pag = xfs_perag_get(mp, agno);
1666 pag->pagf_freeblks += len;
1667 xfs_perag_put(pag);
1668
1665 agf = XFS_BUF_TO_AGF(agbp); 1669 agf = XFS_BUF_TO_AGF(agbp);
1666 pag = &mp->m_perag[agno];
1667 be32_add_cpu(&agf->agf_freeblks, len); 1670 be32_add_cpu(&agf->agf_freeblks, len);
1668 xfs_trans_agblocks_delta(tp, len); 1671 xfs_trans_agblocks_delta(tp, len);
1669 pag->pagf_freeblks += len;
1670 XFS_WANT_CORRUPTED_GOTO( 1672 XFS_WANT_CORRUPTED_GOTO(
1671 be32_to_cpu(agf->agf_freeblks) <= 1673 be32_to_cpu(agf->agf_freeblks) <=
1672 be32_to_cpu(agf->agf_length), 1674 be32_to_cpu(agf->agf_length),
@@ -1969,10 +1971,12 @@ xfs_alloc_get_freelist(
1969 xfs_trans_brelse(tp, agflbp); 1971 xfs_trans_brelse(tp, agflbp);
1970 if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) 1972 if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
1971 agf->agf_flfirst = 0; 1973 agf->agf_flfirst = 0;
1972 pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; 1974
1975 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
1973 be32_add_cpu(&agf->agf_flcount, -1); 1976 be32_add_cpu(&agf->agf_flcount, -1);
1974 xfs_trans_agflist_delta(tp, -1); 1977 xfs_trans_agflist_delta(tp, -1);
1975 pag->pagf_flcount--; 1978 pag->pagf_flcount--;
1979 xfs_perag_put(pag);
1976 1980
1977 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 1981 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
1978 if (btreeblk) { 1982 if (btreeblk) {
@@ -2078,7 +2082,8 @@ xfs_alloc_put_freelist(
2078 be32_add_cpu(&agf->agf_fllast, 1); 2082 be32_add_cpu(&agf->agf_fllast, 1);
2079 if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) 2083 if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
2080 agf->agf_fllast = 0; 2084 agf->agf_fllast = 0;
2081 pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; 2085
2086 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
2082 be32_add_cpu(&agf->agf_flcount, 1); 2087 be32_add_cpu(&agf->agf_flcount, 1);
2083 xfs_trans_agflist_delta(tp, 1); 2088 xfs_trans_agflist_delta(tp, 1);
2084 pag->pagf_flcount++; 2089 pag->pagf_flcount++;
@@ -2089,6 +2094,7 @@ xfs_alloc_put_freelist(
2089 pag->pagf_btreeblks--; 2094 pag->pagf_btreeblks--;
2090 logflags |= XFS_AGF_BTREEBLKS; 2095 logflags |= XFS_AGF_BTREEBLKS;
2091 } 2096 }
2097 xfs_perag_put(pag);
2092 2098
2093 xfs_alloc_log_agf(tp, agbp, logflags); 2099 xfs_alloc_log_agf(tp, agbp, logflags);
2094 2100
@@ -2152,7 +2158,6 @@ xfs_read_agf(
2152 xfs_trans_brelse(tp, *bpp); 2158 xfs_trans_brelse(tp, *bpp);
2153 return XFS_ERROR(EFSCORRUPTED); 2159 return XFS_ERROR(EFSCORRUPTED);
2154 } 2160 }
2155
2156 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); 2161 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
2157 return 0; 2162 return 0;
2158} 2163}
@@ -2175,7 +2180,7 @@ xfs_alloc_read_agf(
2175 ASSERT(agno != NULLAGNUMBER); 2180 ASSERT(agno != NULLAGNUMBER);
2176 2181
2177 error = xfs_read_agf(mp, tp, agno, 2182 error = xfs_read_agf(mp, tp, agno,
2178 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0, 2183 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2179 bpp); 2184 bpp);
2180 if (error) 2185 if (error)
2181 return error; 2186 return error;
@@ -2184,7 +2189,7 @@ xfs_alloc_read_agf(
2184 ASSERT(!XFS_BUF_GETERROR(*bpp)); 2189 ASSERT(!XFS_BUF_GETERROR(*bpp));
2185 2190
2186 agf = XFS_BUF_TO_AGF(*bpp); 2191 agf = XFS_BUF_TO_AGF(*bpp);
2187 pag = &mp->m_perag[agno]; 2192 pag = xfs_perag_get(mp, agno);
2188 if (!pag->pagf_init) { 2193 if (!pag->pagf_init) {
2189 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 2194 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
2190 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 2195 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -2195,8 +2200,8 @@ xfs_alloc_read_agf(
2195 pag->pagf_levels[XFS_BTNUM_CNTi] = 2200 pag->pagf_levels[XFS_BTNUM_CNTi] =
2196 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2197 spin_lock_init(&pag->pagb_lock); 2202 spin_lock_init(&pag->pagb_lock);
2198 pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS * 2203 pag->pagb_count = 0;
2199 sizeof(xfs_perag_busy_t), KM_SLEEP); 2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
2200 pag->pagf_init = 1; 2205 pag->pagf_init = 1;
2201 } 2206 }
2202#ifdef DEBUG 2207#ifdef DEBUG
@@ -2211,6 +2216,7 @@ xfs_alloc_read_agf(
2211 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 2216 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2212 } 2217 }
2213#endif 2218#endif
2219 xfs_perag_put(pag);
2214 return 0; 2220 return 0;
2215} 2221}
2216 2222
@@ -2270,8 +2276,7 @@ xfs_alloc_vextent(
2270 * These three force us into a single a.g. 2276 * These three force us into a single a.g.
2271 */ 2277 */
2272 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); 2278 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2273 down_read(&mp->m_peraglock); 2279 args->pag = xfs_perag_get(mp, args->agno);
2274 args->pag = &mp->m_perag[args->agno];
2275 args->minleft = 0; 2280 args->minleft = 0;
2276 error = xfs_alloc_fix_freelist(args, 0); 2281 error = xfs_alloc_fix_freelist(args, 0);
2277 args->minleft = minleft; 2282 args->minleft = minleft;
@@ -2280,14 +2285,12 @@ xfs_alloc_vextent(
2280 goto error0; 2285 goto error0;
2281 } 2286 }
2282 if (!args->agbp) { 2287 if (!args->agbp) {
2283 up_read(&mp->m_peraglock);
2284 trace_xfs_alloc_vextent_noagbp(args); 2288 trace_xfs_alloc_vextent_noagbp(args);
2285 break; 2289 break;
2286 } 2290 }
2287 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); 2291 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
2288 if ((error = xfs_alloc_ag_vextent(args))) 2292 if ((error = xfs_alloc_ag_vextent(args)))
2289 goto error0; 2293 goto error0;
2290 up_read(&mp->m_peraglock);
2291 break; 2294 break;
2292 case XFS_ALLOCTYPE_START_BNO: 2295 case XFS_ALLOCTYPE_START_BNO:
2293 /* 2296 /*
@@ -2339,9 +2342,8 @@ xfs_alloc_vextent(
2339 * Loop over allocation groups twice; first time with 2342 * Loop over allocation groups twice; first time with
2340 * trylock set, second time without. 2343 * trylock set, second time without.
2341 */ 2344 */
2342 down_read(&mp->m_peraglock);
2343 for (;;) { 2345 for (;;) {
2344 args->pag = &mp->m_perag[args->agno]; 2346 args->pag = xfs_perag_get(mp, args->agno);
2345 if (no_min) args->minleft = 0; 2347 if (no_min) args->minleft = 0;
2346 error = xfs_alloc_fix_freelist(args, flags); 2348 error = xfs_alloc_fix_freelist(args, flags);
2347 args->minleft = minleft; 2349 args->minleft = minleft;
@@ -2400,8 +2402,8 @@ xfs_alloc_vextent(
2400 } 2402 }
2401 } 2403 }
2402 } 2404 }
2405 xfs_perag_put(args->pag);
2403 } 2406 }
2404 up_read(&mp->m_peraglock);
2405 if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { 2407 if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
2406 if (args->agno == sagno) 2408 if (args->agno == sagno)
2407 mp->m_agfrotor = (mp->m_agfrotor + 1) % 2409 mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2427,9 +2429,10 @@ xfs_alloc_vextent(
2427 args->len); 2429 args->len);
2428#endif 2430#endif
2429 } 2431 }
2432 xfs_perag_put(args->pag);
2430 return 0; 2433 return 0;
2431error0: 2434error0:
2432 up_read(&mp->m_peraglock); 2435 xfs_perag_put(args->pag);
2433 return error; 2436 return error;
2434} 2437}
2435 2438
@@ -2454,8 +2457,7 @@ xfs_free_extent(
2454 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2457 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2455 ASSERT(args.agno < args.mp->m_sb.sb_agcount); 2458 ASSERT(args.agno < args.mp->m_sb.sb_agcount);
2456 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2459 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2457 down_read(&args.mp->m_peraglock); 2460 args.pag = xfs_perag_get(args.mp, args.agno);
2458 args.pag = &args.mp->m_perag[args.agno];
2459 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) 2461 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
2460 goto error0; 2462 goto error0;
2461#ifdef DEBUG 2463#ifdef DEBUG
@@ -2465,7 +2467,7 @@ xfs_free_extent(
2465#endif 2467#endif
2466 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2468 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2467error0: 2469error0:
2468 up_read(&args.mp->m_peraglock); 2470 xfs_perag_put(args.pag);
2469 return error; 2471 return error;
2470} 2472}
2471 2473
@@ -2486,15 +2488,15 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
2486 xfs_agblock_t bno, 2488 xfs_agblock_t bno,
2487 xfs_extlen_t len) 2489 xfs_extlen_t len)
2488{ 2490{
2489 xfs_mount_t *mp;
2490 xfs_perag_busy_t *bsy; 2491 xfs_perag_busy_t *bsy;
2492 struct xfs_perag *pag;
2491 int n; 2493 int n;
2492 2494
2493 mp = tp->t_mountp; 2495 pag = xfs_perag_get(tp->t_mountp, agno);
2494 spin_lock(&mp->m_perag[agno].pagb_lock); 2496 spin_lock(&pag->pagb_lock);
2495 2497
2496 /* search pagb_list for an open slot */ 2498 /* search pagb_list for an open slot */
2497 for (bsy = mp->m_perag[agno].pagb_list, n = 0; 2499 for (bsy = pag->pagb_list, n = 0;
2498 n < XFS_PAGB_NUM_SLOTS; 2500 n < XFS_PAGB_NUM_SLOTS;
2499 bsy++, n++) { 2501 bsy++, n++) {
2500 if (bsy->busy_tp == NULL) { 2502 if (bsy->busy_tp == NULL) {
@@ -2502,11 +2504,11 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
2502 } 2504 }
2503 } 2505 }
2504 2506
2505 trace_xfs_alloc_busy(mp, agno, bno, len, n); 2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
2506 2508
2507 if (n < XFS_PAGB_NUM_SLOTS) { 2509 if (n < XFS_PAGB_NUM_SLOTS) {
2508 bsy = &mp->m_perag[agno].pagb_list[n]; 2510 bsy = &pag->pagb_list[n];
2509 mp->m_perag[agno].pagb_count++; 2511 pag->pagb_count++;
2510 bsy->busy_start = bno; 2512 bsy->busy_start = bno;
2511 bsy->busy_length = len; 2513 bsy->busy_length = len;
2512 bsy->busy_tp = tp; 2514 bsy->busy_tp = tp;
@@ -2521,7 +2523,8 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
2521 xfs_trans_set_sync(tp); 2523 xfs_trans_set_sync(tp);
2522 } 2524 }
2523 2525
2524 spin_unlock(&mp->m_perag[agno].pagb_lock); 2526 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag);
2525} 2528}
2526 2529
2527void 2530void
@@ -2529,24 +2532,23 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
2529 xfs_agnumber_t agno, 2532 xfs_agnumber_t agno,
2530 int idx) 2533 int idx)
2531{ 2534{
2532 xfs_mount_t *mp; 2535 struct xfs_perag *pag;
2533 xfs_perag_busy_t *list; 2536 xfs_perag_busy_t *list;
2534 2537
2535 mp = tp->t_mountp;
2536
2537 spin_lock(&mp->m_perag[agno].pagb_lock);
2538 list = mp->m_perag[agno].pagb_list;
2539
2540 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2541 2542
2542 trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp); 2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
2543 2544
2544 if (list[idx].busy_tp == tp) { 2545 if (list[idx].busy_tp == tp) {
2545 list[idx].busy_tp = NULL; 2546 list[idx].busy_tp = NULL;
2546 mp->m_perag[agno].pagb_count--; 2547 pag->pagb_count--;
2547 } 2548 }
2548 2549
2549 spin_unlock(&mp->m_perag[agno].pagb_lock); 2550 spin_unlock(&pag->pagb_lock);
2551 xfs_perag_put(pag);
2550} 2552}
2551 2553
2552 2554
@@ -2560,17 +2562,15 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
2560 xfs_agblock_t bno, 2562 xfs_agblock_t bno,
2561 xfs_extlen_t len) 2563 xfs_extlen_t len)
2562{ 2564{
2563 xfs_mount_t *mp; 2565 struct xfs_perag *pag;
2564 xfs_perag_busy_t *bsy; 2566 xfs_perag_busy_t *bsy;
2565 xfs_agblock_t uend, bend; 2567 xfs_agblock_t uend, bend;
2566 xfs_lsn_t lsn = 0; 2568 xfs_lsn_t lsn = 0;
2567 int cnt; 2569 int cnt;
2568 2570
2569 mp = tp->t_mountp; 2571 pag = xfs_perag_get(tp->t_mountp, agno);
2570 2572 spin_lock(&pag->pagb_lock);
2571 spin_lock(&mp->m_perag[agno].pagb_lock); 2573 cnt = pag->pagb_count;
2572
2573 uend = bno + len - 1;
2574 2574
2575 /* 2575 /*
2576 * search pagb_list for this slot, skipping open slots. We have to 2576 * search pagb_list for this slot, skipping open slots. We have to
@@ -2578,8 +2578,9 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
2578 * we have to get the most recent LSN for the log force to push out 2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range. 2579 * all the transactions that span the range.
2580 */ 2580 */
2581 for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) { 2581 uend = bno + len - 1;
2582 bsy = &mp->m_perag[agno].pagb_list[cnt]; 2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2583 if (!bsy->busy_tp) 2584 if (!bsy->busy_tp)
2584 continue; 2585 continue;
2585 2586
@@ -2591,7 +2592,8 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
2591 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
2592 lsn = bsy->busy_tp->t_commit_lsn; 2593 lsn = bsy->busy_tp->t_commit_lsn;
2593 } 2594 }
2594 spin_unlock(&mp->m_perag[agno].pagb_lock); 2595 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag);
2595 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); 2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2596 2598
2597 /* 2599 /*
@@ -2599,5 +2601,5 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
2599 * transaction that freed the block 2601 * transaction that freed the block
2600 */ 2602 */
2601 if (lsn) 2603 if (lsn)
2602 xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); 2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2603} 2605}
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index adbd9141aea1..b726e10d2c1c 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -61,12 +61,14 @@ xfs_allocbt_set_root(
61 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 61 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
62 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); 62 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
63 int btnum = cur->bc_btnum; 63 int btnum = cur->bc_btnum;
64 struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
64 65
65 ASSERT(ptr->s != 0); 66 ASSERT(ptr->s != 0);
66 67
67 agf->agf_roots[btnum] = ptr->s; 68 agf->agf_roots[btnum] = ptr->s;
68 be32_add_cpu(&agf->agf_levels[btnum], inc); 69 be32_add_cpu(&agf->agf_levels[btnum], inc);
69 cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc; 70 pag->pagf_levels[btnum] += inc;
71 xfs_perag_put(pag);
70 72
71 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); 73 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
72} 74}
@@ -150,6 +152,7 @@ xfs_allocbt_update_lastrec(
150{ 152{
151 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); 153 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
152 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); 154 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
155 struct xfs_perag *pag;
153 __be32 len; 156 __be32 len;
154 int numrecs; 157 int numrecs;
155 158
@@ -193,7 +196,9 @@ xfs_allocbt_update_lastrec(
193 } 196 }
194 197
195 agf->agf_longest = len; 198 agf->agf_longest = len;
196 cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len); 199 pag = xfs_perag_get(cur->bc_mp, seqno);
200 pag->pagf_longest = be32_to_cpu(len);
201 xfs_perag_put(pag);
197 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); 202 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
198} 203}
199 204
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e953b6cfb2a8..b9c196a53c42 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -93,12 +93,12 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
93STATIC int 93STATIC int
94xfs_attr_name_to_xname( 94xfs_attr_name_to_xname(
95 struct xfs_name *xname, 95 struct xfs_name *xname,
96 const char *aname) 96 const unsigned char *aname)
97{ 97{
98 if (!aname) 98 if (!aname)
99 return EINVAL; 99 return EINVAL;
100 xname->name = aname; 100 xname->name = aname;
101 xname->len = strlen(aname); 101 xname->len = strlen((char *)aname);
102 if (xname->len >= MAXNAMELEN) 102 if (xname->len >= MAXNAMELEN)
103 return EFAULT; /* match IRIX behaviour */ 103 return EFAULT; /* match IRIX behaviour */
104 104
@@ -124,7 +124,7 @@ STATIC int
124xfs_attr_get_int( 124xfs_attr_get_int(
125 struct xfs_inode *ip, 125 struct xfs_inode *ip,
126 struct xfs_name *name, 126 struct xfs_name *name,
127 char *value, 127 unsigned char *value,
128 int *valuelenp, 128 int *valuelenp,
129 int flags) 129 int flags)
130{ 130{
@@ -171,8 +171,8 @@ xfs_attr_get_int(
171int 171int
172xfs_attr_get( 172xfs_attr_get(
173 xfs_inode_t *ip, 173 xfs_inode_t *ip,
174 const char *name, 174 const unsigned char *name,
175 char *value, 175 unsigned char *value,
176 int *valuelenp, 176 int *valuelenp,
177 int flags) 177 int flags)
178{ 178{
@@ -197,7 +197,7 @@ xfs_attr_get(
197/* 197/*
198 * Calculate how many blocks we need for the new attribute, 198 * Calculate how many blocks we need for the new attribute,
199 */ 199 */
200int 200STATIC int
201xfs_attr_calc_size( 201xfs_attr_calc_size(
202 struct xfs_inode *ip, 202 struct xfs_inode *ip,
203 int namelen, 203 int namelen,
@@ -235,8 +235,12 @@ xfs_attr_calc_size(
235} 235}
236 236
237STATIC int 237STATIC int
238xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, 238xfs_attr_set_int(
239 char *value, int valuelen, int flags) 239 struct xfs_inode *dp,
240 struct xfs_name *name,
241 unsigned char *value,
242 int valuelen,
243 int flags)
240{ 244{
241 xfs_da_args_t args; 245 xfs_da_args_t args;
242 xfs_fsblock_t firstblock; 246 xfs_fsblock_t firstblock;
@@ -452,8 +456,8 @@ out:
452int 456int
453xfs_attr_set( 457xfs_attr_set(
454 xfs_inode_t *dp, 458 xfs_inode_t *dp,
455 const char *name, 459 const unsigned char *name,
456 char *value, 460 unsigned char *value,
457 int valuelen, 461 int valuelen,
458 int flags) 462 int flags)
459{ 463{
@@ -600,7 +604,7 @@ out:
600int 604int
601xfs_attr_remove( 605xfs_attr_remove(
602 xfs_inode_t *dp, 606 xfs_inode_t *dp,
603 const char *name, 607 const unsigned char *name,
604 int flags) 608 int flags)
605{ 609{
606 int error; 610 int error;
@@ -669,9 +673,13 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
669 */ 673 */
670/*ARGSUSED*/ 674/*ARGSUSED*/
671STATIC int 675STATIC int
672xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags, 676xfs_attr_put_listent(
673 char *name, int namelen, 677 xfs_attr_list_context_t *context,
674 int valuelen, char *value) 678 int flags,
679 unsigned char *name,
680 int namelen,
681 int valuelen,
682 unsigned char *value)
675{ 683{
676 struct attrlist *alist = (struct attrlist *)context->alist; 684 struct attrlist *alist = (struct attrlist *)context->alist;
677 attrlist_ent_t *aep; 685 attrlist_ent_t *aep;
@@ -1980,7 +1988,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1980 xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; 1988 xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
1981 xfs_mount_t *mp; 1989 xfs_mount_t *mp;
1982 xfs_daddr_t dblkno; 1990 xfs_daddr_t dblkno;
1983 xfs_caddr_t dst; 1991 void *dst;
1984 xfs_buf_t *bp; 1992 xfs_buf_t *bp;
1985 int nmap, error, tmp, valuelen, blkcnt, i; 1993 int nmap, error, tmp, valuelen, blkcnt, i;
1986 xfs_dablk_t lblkno; 1994 xfs_dablk_t lblkno;
@@ -2007,15 +2015,14 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
2007 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); 2015 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
2008 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 2016 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
2009 error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, 2017 error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
2010 blkcnt, 2018 blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
2011 XFS_BUF_LOCK | XBF_DONT_BLOCK,
2012 &bp); 2019 &bp);
2013 if (error) 2020 if (error)
2014 return(error); 2021 return(error);
2015 2022
2016 tmp = (valuelen < XFS_BUF_SIZE(bp)) 2023 tmp = (valuelen < XFS_BUF_SIZE(bp))
2017 ? valuelen : XFS_BUF_SIZE(bp); 2024 ? valuelen : XFS_BUF_SIZE(bp);
2018 xfs_biomove(bp, 0, tmp, dst, XFS_B_READ); 2025 xfs_biomove(bp, 0, tmp, dst, XBF_READ);
2019 xfs_buf_relse(bp); 2026 xfs_buf_relse(bp);
2020 dst += tmp; 2027 dst += tmp;
2021 valuelen -= tmp; 2028 valuelen -= tmp;
@@ -2039,7 +2046,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2039 xfs_inode_t *dp; 2046 xfs_inode_t *dp;
2040 xfs_bmbt_irec_t map; 2047 xfs_bmbt_irec_t map;
2041 xfs_daddr_t dblkno; 2048 xfs_daddr_t dblkno;
2042 xfs_caddr_t src; 2049 void *src;
2043 xfs_buf_t *bp; 2050 xfs_buf_t *bp;
2044 xfs_dablk_t lblkno; 2051 xfs_dablk_t lblkno;
2045 int blkcnt, valuelen, nmap, error, tmp, committed; 2052 int blkcnt, valuelen, nmap, error, tmp, committed;
@@ -2141,13 +2148,13 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2141 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); 2148 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
2142 2149
2143 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 2150 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
2144 XFS_BUF_LOCK | XBF_DONT_BLOCK); 2151 XBF_LOCK | XBF_DONT_BLOCK);
2145 ASSERT(bp); 2152 ASSERT(bp);
2146 ASSERT(!XFS_BUF_GETERROR(bp)); 2153 ASSERT(!XFS_BUF_GETERROR(bp));
2147 2154
2148 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2155 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2149 XFS_BUF_SIZE(bp); 2156 XFS_BUF_SIZE(bp);
2150 xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE); 2157 xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
2151 if (tmp < XFS_BUF_SIZE(bp)) 2158 if (tmp < XFS_BUF_SIZE(bp))
2152 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2159 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2153 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2160 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
@@ -2208,8 +2215,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2208 /* 2215 /*
2209 * If the "remote" value is in the cache, remove it. 2216 * If the "remote" value is in the cache, remove it.
2210 */ 2217 */
2211 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, 2218 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
2212 XFS_INCORE_TRYLOCK);
2213 if (bp) { 2219 if (bp) {
2214 XFS_BUF_STALE(bp); 2220 XFS_BUF_STALE(bp);
2215 XFS_BUF_UNDELAYWRITE(bp); 2221 XFS_BUF_UNDELAYWRITE(bp);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 59b410ce69a1..e920d68ef509 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -113,7 +113,7 @@ typedef struct attrlist_cursor_kern {
113 113
114 114
115typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, 115typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
116 char *, int, int, char *); 116 unsigned char *, int, int, unsigned char *);
117 117
118typedef struct xfs_attr_list_context { 118typedef struct xfs_attr_list_context {
119 struct xfs_inode *dp; /* inode */ 119 struct xfs_inode *dp; /* inode */
@@ -139,7 +139,6 @@ typedef struct xfs_attr_list_context {
139/* 139/*
140 * Overall external interface routines. 140 * Overall external interface routines.
141 */ 141 */
142int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
143int xfs_attr_inactive(struct xfs_inode *dp); 142int xfs_attr_inactive(struct xfs_inode *dp);
144int xfs_attr_rmtval_get(struct xfs_da_args *args); 143int xfs_attr_rmtval_get(struct xfs_da_args *args);
145int xfs_attr_list_int(struct xfs_attr_list_context *); 144int xfs_attr_list_int(struct xfs_attr_list_context *);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index baf41b5af756..a90ce74fc256 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -521,11 +521,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
521 521
522 sfe = &sf->list[0]; 522 sfe = &sf->list[0];
523 for (i = 0; i < sf->hdr.count; i++) { 523 for (i = 0; i < sf->hdr.count; i++) {
524 nargs.name = (char *)sfe->nameval; 524 nargs.name = sfe->nameval;
525 nargs.namelen = sfe->namelen; 525 nargs.namelen = sfe->namelen;
526 nargs.value = (char *)&sfe->nameval[nargs.namelen]; 526 nargs.value = &sfe->nameval[nargs.namelen];
527 nargs.valuelen = sfe->valuelen; 527 nargs.valuelen = sfe->valuelen;
528 nargs.hashval = xfs_da_hashname((char *)sfe->nameval, 528 nargs.hashval = xfs_da_hashname(sfe->nameval,
529 sfe->namelen); 529 sfe->namelen);
530 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); 530 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
531 error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ 531 error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
@@ -612,10 +612,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
612 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { 612 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
613 error = context->put_listent(context, 613 error = context->put_listent(context,
614 sfe->flags, 614 sfe->flags,
615 (char *)sfe->nameval, 615 sfe->nameval,
616 (int)sfe->namelen, 616 (int)sfe->namelen,
617 (int)sfe->valuelen, 617 (int)sfe->valuelen,
618 (char*)&sfe->nameval[sfe->namelen]); 618 &sfe->nameval[sfe->namelen]);
619 619
620 /* 620 /*
621 * Either search callback finished early or 621 * Either search callback finished early or
@@ -659,8 +659,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
659 } 659 }
660 660
661 sbp->entno = i; 661 sbp->entno = i;
662 sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); 662 sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
663 sbp->name = (char *)sfe->nameval; 663 sbp->name = sfe->nameval;
664 sbp->namelen = sfe->namelen; 664 sbp->namelen = sfe->namelen;
665 /* These are bytes, and both on-disk, don't endian-flip */ 665 /* These are bytes, and both on-disk, don't endian-flip */
666 sbp->valuelen = sfe->valuelen; 666 sbp->valuelen = sfe->valuelen;
@@ -818,9 +818,9 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
818 continue; 818 continue;
819 ASSERT(entry->flags & XFS_ATTR_LOCAL); 819 ASSERT(entry->flags & XFS_ATTR_LOCAL);
820 name_loc = xfs_attr_leaf_name_local(leaf, i); 820 name_loc = xfs_attr_leaf_name_local(leaf, i);
821 nargs.name = (char *)name_loc->nameval; 821 nargs.name = name_loc->nameval;
822 nargs.namelen = name_loc->namelen; 822 nargs.namelen = name_loc->namelen;
823 nargs.value = (char *)&name_loc->nameval[nargs.namelen]; 823 nargs.value = &name_loc->nameval[nargs.namelen];
824 nargs.valuelen = be16_to_cpu(name_loc->valuelen); 824 nargs.valuelen = be16_to_cpu(name_loc->valuelen);
825 nargs.hashval = be32_to_cpu(entry->hashval); 825 nargs.hashval = be32_to_cpu(entry->hashval);
826 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); 826 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
@@ -2370,10 +2370,10 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2370 2370
2371 retval = context->put_listent(context, 2371 retval = context->put_listent(context,
2372 entry->flags, 2372 entry->flags,
2373 (char *)name_loc->nameval, 2373 name_loc->nameval,
2374 (int)name_loc->namelen, 2374 (int)name_loc->namelen,
2375 be16_to_cpu(name_loc->valuelen), 2375 be16_to_cpu(name_loc->valuelen),
2376 (char *)&name_loc->nameval[name_loc->namelen]); 2376 &name_loc->nameval[name_loc->namelen]);
2377 if (retval) 2377 if (retval)
2378 return retval; 2378 return retval;
2379 } else { 2379 } else {
@@ -2397,15 +2397,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2397 return retval; 2397 return retval;
2398 retval = context->put_listent(context, 2398 retval = context->put_listent(context,
2399 entry->flags, 2399 entry->flags,
2400 (char *)name_rmt->name, 2400 name_rmt->name,
2401 (int)name_rmt->namelen, 2401 (int)name_rmt->namelen,
2402 valuelen, 2402 valuelen,
2403 (char*)args.value); 2403 args.value);
2404 kmem_free(args.value); 2404 kmem_free(args.value);
2405 } else { 2405 } else {
2406 retval = context->put_listent(context, 2406 retval = context->put_listent(context,
2407 entry->flags, 2407 entry->flags,
2408 (char *)name_rmt->name, 2408 name_rmt->name,
2409 (int)name_rmt->namelen, 2409 (int)name_rmt->namelen,
2410 valuelen, 2410 valuelen,
2411 NULL); 2411 NULL);
@@ -2950,7 +2950,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2950 map.br_blockcount); 2950 map.br_blockcount);
2951 bp = xfs_trans_get_buf(*trans, 2951 bp = xfs_trans_get_buf(*trans,
2952 dp->i_mount->m_ddev_targp, 2952 dp->i_mount->m_ddev_targp,
2953 dblkno, dblkcnt, XFS_BUF_LOCK); 2953 dblkno, dblkcnt, XBF_LOCK);
2954 xfs_trans_binval(*trans, bp); 2954 xfs_trans_binval(*trans, bp);
2955 /* 2955 /*
2956 * Roll to next transaction. 2956 * Roll to next transaction.
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index 76ab7b0cbb3a..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -52,7 +52,7 @@ typedef struct xfs_attr_sf_sort {
52 __uint8_t valuelen; /* length of value */ 52 __uint8_t valuelen; /* length of value */
53 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ 53 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
54 xfs_dahash_t hash; /* this entry's hash value */ 54 xfs_dahash_t hash; /* this entry's hash value */
55 char *name; /* name value, pointer into buffer */ 55 unsigned char *name; /* name value, pointer into buffer */
56} xfs_attr_sf_sort_t; 56} xfs_attr_sf_sort_t;
57 57
58#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ 58#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 98251cdc52aa..1869fb973819 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2629,13 +2629,12 @@ xfs_bmap_btalloc(
2629 if (startag == NULLAGNUMBER) 2629 if (startag == NULLAGNUMBER)
2630 startag = ag = 0; 2630 startag = ag = 0;
2631 notinit = 0; 2631 notinit = 0;
2632 down_read(&mp->m_peraglock); 2632 pag = xfs_perag_get(mp, ag);
2633 while (blen < ap->alen) { 2633 while (blen < ap->alen) {
2634 pag = &mp->m_perag[ag];
2635 if (!pag->pagf_init && 2634 if (!pag->pagf_init &&
2636 (error = xfs_alloc_pagf_init(mp, args.tp, 2635 (error = xfs_alloc_pagf_init(mp, args.tp,
2637 ag, XFS_ALLOC_FLAG_TRYLOCK))) { 2636 ag, XFS_ALLOC_FLAG_TRYLOCK))) {
2638 up_read(&mp->m_peraglock); 2637 xfs_perag_put(pag);
2639 return error; 2638 return error;
2640 } 2639 }
2641 /* 2640 /*
@@ -2667,13 +2666,13 @@ xfs_bmap_btalloc(
2667 break; 2666 break;
2668 2667
2669 error = xfs_filestream_new_ag(ap, &ag); 2668 error = xfs_filestream_new_ag(ap, &ag);
2670 if (error) { 2669 xfs_perag_put(pag);
2671 up_read(&mp->m_peraglock); 2670 if (error)
2672 return error; 2671 return error;
2673 }
2674 2672
2675 /* loop again to set 'blen'*/ 2673 /* loop again to set 'blen'*/
2676 startag = NULLAGNUMBER; 2674 startag = NULLAGNUMBER;
2675 pag = xfs_perag_get(mp, ag);
2677 continue; 2676 continue;
2678 } 2677 }
2679 } 2678 }
@@ -2681,8 +2680,10 @@ xfs_bmap_btalloc(
2681 ag = 0; 2680 ag = 0;
2682 if (ag == startag) 2681 if (ag == startag)
2683 break; 2682 break;
2683 xfs_perag_put(pag);
2684 pag = xfs_perag_get(mp, ag);
2684 } 2685 }
2685 up_read(&mp->m_peraglock); 2686 xfs_perag_put(pag);
2686 /* 2687 /*
2687 * Since the above loop did a BUF_TRYLOCK, it is 2688 * Since the above loop did a BUF_TRYLOCK, it is
2688 * possible that there is space for this request. 2689 * possible that there is space for this request.
@@ -4470,7 +4471,7 @@ xfs_bmapi(
4470 xfs_fsblock_t abno; /* allocated block number */ 4471 xfs_fsblock_t abno; /* allocated block number */
4471 xfs_extlen_t alen; /* allocated extent length */ 4472 xfs_extlen_t alen; /* allocated extent length */
4472 xfs_fileoff_t aoff; /* allocated file offset */ 4473 xfs_fileoff_t aoff; /* allocated file offset */
4473 xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ 4474 xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */
4474 xfs_btree_cur_t *cur; /* bmap btree cursor */ 4475 xfs_btree_cur_t *cur; /* bmap btree cursor */
4475 xfs_fileoff_t end; /* end of mapped file region */ 4476 xfs_fileoff_t end; /* end of mapped file region */
4476 int eof; /* we've hit the end of extents */ 4477 int eof; /* we've hit the end of extents */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 38751d5fac6f..416e47e54b83 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -334,7 +334,7 @@ xfs_bmbt_disk_set_allf(
334/* 334/*
335 * Set all the fields in a bmap extent record from the uncompressed form. 335 * Set all the fields in a bmap extent record from the uncompressed form.
336 */ 336 */
337void 337STATIC void
338xfs_bmbt_disk_set_all( 338xfs_bmbt_disk_set_all(
339 xfs_bmbt_rec_t *r, 339 xfs_bmbt_rec_t *r,
340 xfs_bmbt_irec_t *s) 340 xfs_bmbt_irec_t *s)
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cf07ca7c22e7..0e66c4ea0f85 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -223,7 +223,6 @@ extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
223extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); 223extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
224extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); 224extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
225 225
226extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
227extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, 226extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
228 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); 227 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
229 228
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 36a0992dd669..96be4b0f2496 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -977,7 +977,7 @@ xfs_btree_get_buf_block(
977 xfs_daddr_t d; 977 xfs_daddr_t d;
978 978
979 /* need to sort out how callers deal with failures first */ 979 /* need to sort out how callers deal with failures first */
980 ASSERT(!(flags & XFS_BUF_TRYLOCK)); 980 ASSERT(!(flags & XBF_TRYLOCK));
981 981
982 d = xfs_btree_ptr_to_daddr(cur, ptr); 982 d = xfs_btree_ptr_to_daddr(cur, ptr);
983 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, 983 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
@@ -1008,7 +1008,7 @@ xfs_btree_read_buf_block(
1008 int error; 1008 int error;
1009 1009
1010 /* need to sort out how callers deal with failures first */ 1010 /* need to sort out how callers deal with failures first */
1011 ASSERT(!(flags & XFS_BUF_TRYLOCK)); 1011 ASSERT(!(flags & XBF_TRYLOCK));
1012 1012
1013 d = xfs_btree_ptr_to_daddr(cur, ptr); 1013 d = xfs_btree_ptr_to_daddr(cur, ptr);
1014 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1014 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a30f7e9eb2b9..f3c49e69eab9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -250,7 +250,7 @@ xfs_buf_item_format(
250 ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); 250 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
251 vecp->i_addr = (xfs_caddr_t)&bip->bli_format; 251 vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
252 vecp->i_len = base_size; 252 vecp->i_len = base_size;
253 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT); 253 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
@@ -297,14 +297,14 @@ xfs_buf_item_format(
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 297 buffer_offset = first_bit * XFS_BLI_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 299 vecp->i_len = nbits * XFS_BLI_CHUNK;
300 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 300 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 301 nvecs++;
302 break; 302 break;
303 } else if (next_bit != last_bit + 1) { 303 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 304 buffer_offset = first_bit * XFS_BLI_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 306 vecp->i_len = nbits * XFS_BLI_CHUNK;
307 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 307 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 308 nvecs++;
309 vecp++; 309 vecp++;
310 first_bit = next_bit; 310 first_bit = next_bit;
@@ -316,7 +316,7 @@ xfs_buf_item_format(
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 316 buffer_offset = first_bit * XFS_BLI_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 318 vecp->i_len = nbits * XFS_BLI_CHUNK;
319 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 319 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not 320/* You would think we need to bump the nvecs here too, but we do not
321 * this number is used by recovery, and it gets confused by the boundary 321 * this number is used by recovery, and it gets confused by the boundary
322 * split here 322 * split here
@@ -467,8 +467,10 @@ xfs_buf_item_unpin_remove(
467/* 467/*
468 * This is called to attempt to lock the buffer associated with this 468 * This is called to attempt to lock the buffer associated with this
469 * buf log item. Don't sleep on the buffer lock. If we can't get 469 * buf log item. Don't sleep on the buffer lock. If we can't get
470 * the lock right away, return 0. If we can get the lock, pull the 470 * the lock right away, return 0. If we can get the lock, take a
471 * buffer from the free list, mark it busy, and return 1. 471 * reference to the buffer. If this is a delayed write buffer that
472 * needs AIL help to be written back, invoke the pushbuf routine
473 * rather than the normal success path.
472 */ 474 */
473STATIC uint 475STATIC uint
474xfs_buf_item_trylock( 476xfs_buf_item_trylock(
@@ -477,24 +479,18 @@ xfs_buf_item_trylock(
477 xfs_buf_t *bp; 479 xfs_buf_t *bp;
478 480
479 bp = bip->bli_buf; 481 bp = bip->bli_buf;
480 482 if (XFS_BUF_ISPINNED(bp))
481 if (XFS_BUF_ISPINNED(bp)) {
482 return XFS_ITEM_PINNED; 483 return XFS_ITEM_PINNED;
483 } 484 if (!XFS_BUF_CPSEMA(bp))
484
485 if (!XFS_BUF_CPSEMA(bp)) {
486 return XFS_ITEM_LOCKED; 485 return XFS_ITEM_LOCKED;
487 }
488 486
489 /* 487 /* take a reference to the buffer. */
490 * Remove the buffer from the free list. Only do this
491 * if it's on the free list. Private buffers like the
492 * superblock buffer are not.
493 */
494 XFS_BUF_HOLD(bp); 488 XFS_BUF_HOLD(bp);
495 489
496 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 490 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
497 trace_xfs_buf_item_trylock(bip); 491 trace_xfs_buf_item_trylock(bip);
492 if (XFS_BUF_ISDELAYWRITE(bp))
493 return XFS_ITEM_PUSHBUF;
498 return XFS_ITEM_SUCCESS; 494 return XFS_ITEM_SUCCESS;
499} 495}
500 496
@@ -626,11 +622,9 @@ xfs_buf_item_committed(
626} 622}
627 623
628/* 624/*
629 * This is called to asynchronously write the buffer associated with this 625 * The buffer is locked, but is not a delayed write buffer. This happens
630 * buf log item out to disk. The buffer will already have been locked by 626 * if we race with IO completion and hence we don't want to try to write it
631 * a successful call to xfs_buf_item_trylock(). If the buffer still has 627 * again. Just release the buffer.
632 * B_DELWRI set, then get it going out to disk with a call to bawrite().
633 * If not, then just release the buffer.
634 */ 628 */
635STATIC void 629STATIC void
636xfs_buf_item_push( 630xfs_buf_item_push(
@@ -642,17 +636,29 @@ xfs_buf_item_push(
642 trace_xfs_buf_item_push(bip); 636 trace_xfs_buf_item_push(bip);
643 637
644 bp = bip->bli_buf; 638 bp = bip->bli_buf;
639 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
640 xfs_buf_relse(bp);
641}
645 642
646 if (XFS_BUF_ISDELAYWRITE(bp)) { 643/*
647 int error; 644 * The buffer is locked and is a delayed write buffer. Promote the buffer
648 error = xfs_bawrite(bip->bli_item.li_mountp, bp); 645 * in the delayed write queue as the caller knows that they must invoke
649 if (error) 646 * the xfsbufd to get this buffer written. We have to unlock the buffer
650 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, 647 * to allow the xfsbufd to write it, too.
651 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", 648 */
652 error, bip, bp); 649STATIC void
653 } else { 650xfs_buf_item_pushbuf(
654 xfs_buf_relse(bp); 651 xfs_buf_log_item_t *bip)
655 } 652{
653 xfs_buf_t *bp;
654
655 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
656 trace_xfs_buf_item_pushbuf(bip);
657
658 bp = bip->bli_buf;
659 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
660 xfs_buf_delwri_promote(bp);
661 xfs_buf_relse(bp);
656} 662}
657 663
658/* ARGSUSED */ 664/* ARGSUSED */
@@ -677,7 +683,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
677 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 683 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
678 xfs_buf_item_committed, 684 xfs_buf_item_committed,
679 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, 685 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
680 .iop_pushbuf = NULL, 686 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
681 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 687 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
682 xfs_buf_item_committing 688 xfs_buf_item_committing
683}; 689};
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c0c8869115b1..0ca556b4bf31 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1534,8 +1534,8 @@ xfs_da_hashname(const __uint8_t *name, int namelen)
1534enum xfs_dacmp 1534enum xfs_dacmp
1535xfs_da_compname( 1535xfs_da_compname(
1536 struct xfs_da_args *args, 1536 struct xfs_da_args *args,
1537 const char *name, 1537 const unsigned char *name,
1538 int len) 1538 int len)
1539{ 1539{
1540 return (args->namelen == len && memcmp(args->name, name, len) == 0) ? 1540 return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
1541 XFS_CMP_EXACT : XFS_CMP_DIFFERENT; 1541 XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 30cd08f56a3a..fe9f5a8c1d2a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -209,7 +209,8 @@ typedef struct xfs_da_state {
209 */ 209 */
210struct xfs_nameops { 210struct xfs_nameops {
211 xfs_dahash_t (*hashname)(struct xfs_name *); 211 xfs_dahash_t (*hashname)(struct xfs_name *);
212 enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int); 212 enum xfs_dacmp (*compname)(struct xfs_da_args *,
213 const unsigned char *, int);
213}; 214};
214 215
215 216
@@ -260,7 +261,7 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
260 261
261uint xfs_da_hashname(const __uint8_t *name_string, int name_length); 262uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
262enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, 263enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
263 const char *name, int len); 264 const unsigned char *name, int len);
264 265
265 266
266xfs_da_state_t *xfs_da_state_alloc(void); 267xfs_da_state_t *xfs_da_state_alloc(void);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 84ca1cf16a1e..cd27c9d6c71f 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -45,15 +45,21 @@
45#include "xfs_vnodeops.h" 45#include "xfs_vnodeops.h"
46#include "xfs_trace.h" 46#include "xfs_trace.h"
47 47
48
49static int xfs_swap_extents(
50 xfs_inode_t *ip, /* target inode */
51 xfs_inode_t *tip, /* tmp inode */
52 xfs_swapext_t *sxp);
53
48/* 54/*
49 * Syssgi interface for swapext 55 * ioctl interface for swapext
50 */ 56 */
51int 57int
52xfs_swapext( 58xfs_swapext(
53 xfs_swapext_t *sxp) 59 xfs_swapext_t *sxp)
54{ 60{
55 xfs_inode_t *ip, *tip; 61 xfs_inode_t *ip, *tip;
56 struct file *file, *target_file; 62 struct file *file, *tmp_file;
57 int error = 0; 63 int error = 0;
58 64
59 /* Pull information for the target fd */ 65 /* Pull information for the target fd */
@@ -68,46 +74,46 @@ xfs_swapext(
68 goto out_put_file; 74 goto out_put_file;
69 } 75 }
70 76
71 target_file = fget((int)sxp->sx_fdtmp); 77 tmp_file = fget((int)sxp->sx_fdtmp);
72 if (!target_file) { 78 if (!tmp_file) {
73 error = XFS_ERROR(EINVAL); 79 error = XFS_ERROR(EINVAL);
74 goto out_put_file; 80 goto out_put_file;
75 } 81 }
76 82
77 if (!(target_file->f_mode & FMODE_WRITE) || 83 if (!(tmp_file->f_mode & FMODE_WRITE) ||
78 (target_file->f_flags & O_APPEND)) { 84 (tmp_file->f_flags & O_APPEND)) {
79 error = XFS_ERROR(EBADF); 85 error = XFS_ERROR(EBADF);
80 goto out_put_target_file; 86 goto out_put_tmp_file;
81 } 87 }
82 88
83 if (IS_SWAPFILE(file->f_path.dentry->d_inode) || 89 if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
84 IS_SWAPFILE(target_file->f_path.dentry->d_inode)) { 90 IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
85 error = XFS_ERROR(EINVAL); 91 error = XFS_ERROR(EINVAL);
86 goto out_put_target_file; 92 goto out_put_tmp_file;
87 } 93 }
88 94
89 ip = XFS_I(file->f_path.dentry->d_inode); 95 ip = XFS_I(file->f_path.dentry->d_inode);
90 tip = XFS_I(target_file->f_path.dentry->d_inode); 96 tip = XFS_I(tmp_file->f_path.dentry->d_inode);
91 97
92 if (ip->i_mount != tip->i_mount) { 98 if (ip->i_mount != tip->i_mount) {
93 error = XFS_ERROR(EINVAL); 99 error = XFS_ERROR(EINVAL);
94 goto out_put_target_file; 100 goto out_put_tmp_file;
95 } 101 }
96 102
97 if (ip->i_ino == tip->i_ino) { 103 if (ip->i_ino == tip->i_ino) {
98 error = XFS_ERROR(EINVAL); 104 error = XFS_ERROR(EINVAL);
99 goto out_put_target_file; 105 goto out_put_tmp_file;
100 } 106 }
101 107
102 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 108 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
103 error = XFS_ERROR(EIO); 109 error = XFS_ERROR(EIO);
104 goto out_put_target_file; 110 goto out_put_tmp_file;
105 } 111 }
106 112
107 error = xfs_swap_extents(ip, tip, sxp); 113 error = xfs_swap_extents(ip, tip, sxp);
108 114
109 out_put_target_file: 115 out_put_tmp_file:
110 fput(target_file); 116 fput(tmp_file);
111 out_put_file: 117 out_put_file:
112 fput(file); 118 fput(file);
113 out: 119 out:
@@ -186,7 +192,7 @@ xfs_swap_extents_check_format(
186 return 0; 192 return 0;
187} 193}
188 194
189int 195static int
190xfs_swap_extents( 196xfs_swap_extents(
191 xfs_inode_t *ip, /* target inode */ 197 xfs_inode_t *ip, /* target inode */
192 xfs_inode_t *tip, /* tmp inode */ 198 xfs_inode_t *tip, /* tmp inode */
@@ -254,6 +260,9 @@ xfs_swap_extents(
254 goto out_unlock; 260 goto out_unlock;
255 } 261 }
256 262
263 trace_xfs_swap_extent_before(ip, 0);
264 trace_xfs_swap_extent_before(tip, 1);
265
257 /* check inode formats now that data is flushed */ 266 /* check inode formats now that data is flushed */
258 error = xfs_swap_extents_check_format(ip, tip); 267 error = xfs_swap_extents_check_format(ip, tip);
259 if (error) { 268 if (error) {
@@ -421,6 +430,8 @@ xfs_swap_extents(
421 430
422 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 431 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
423 432
433 trace_xfs_swap_extent_after(ip, 0);
434 trace_xfs_swap_extent_after(tip, 1);
424out: 435out:
425 kmem_free(tempifp); 436 kmem_free(tempifp);
426 return error; 437 return error;
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index 4f55a6306558..20bdd935c121 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,9 +48,6 @@ typedef struct xfs_swapext
48 */ 48 */
49int xfs_swapext(struct xfs_swapext *sx); 49int xfs_swapext(struct xfs_swapext *sx);
50 50
51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
52 struct xfs_swapext *sxp);
53
54#endif /* __KERNEL__ */ 51#endif /* __KERNEL__ */
55 52
56#endif /* __XFS_DFRAG_H__ */ 53#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 93634a7e90e9..42520f041265 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,7 +44,7 @@
44#include "xfs_vnodeops.h" 44#include "xfs_vnodeops.h"
45#include "xfs_trace.h" 45#include "xfs_trace.h"
46 46
47struct xfs_name xfs_name_dotdot = {"..", 2}; 47struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
48 48
49/* 49/*
50 * ASCII case-insensitive (ie. A-Z) support for directories that was 50 * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -66,8 +66,8 @@ xfs_ascii_ci_hashname(
66STATIC enum xfs_dacmp 66STATIC enum xfs_dacmp
67xfs_ascii_ci_compname( 67xfs_ascii_ci_compname(
68 struct xfs_da_args *args, 68 struct xfs_da_args *args,
69 const char *name, 69 const unsigned char *name,
70 int len) 70 int len)
71{ 71{
72 enum xfs_dacmp result; 72 enum xfs_dacmp result;
73 int i; 73 int i;
@@ -247,7 +247,7 @@ xfs_dir_createname(
247int 247int
248xfs_dir_cilookup_result( 248xfs_dir_cilookup_result(
249 struct xfs_da_args *args, 249 struct xfs_da_args *args,
250 const char *name, 250 const unsigned char *name,
251 int len) 251 int len)
252{ 252{
253 if (args->cmpresult == XFS_CMP_DIFFERENT) 253 if (args->cmpresult == XFS_CMP_DIFFERENT)
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 1d9ef96f33aa..74a3b1057685 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -100,7 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
100extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, 100extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
101 struct xfs_dabuf *bp); 101 struct xfs_dabuf *bp);
102 102
103extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name, 103extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
104 int len); 104 const unsigned char *name, int len);
105 105
106#endif /* __XFS_DIR2_H__ */ 106#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index ddc4ecc7807f..779a267b0a84 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -57,8 +57,8 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
57void 57void
58xfs_dir_startup(void) 58xfs_dir_startup(void)
59{ 59{
60 xfs_dir_hash_dot = xfs_da_hashname(".", 1); 60 xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
61 xfs_dir_hash_dotdot = xfs_da_hashname("..", 2); 61 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
62} 62}
63 63
64/* 64/*
@@ -513,8 +513,9 @@ xfs_dir2_block_getdents(
513 /* 513 /*
514 * If it didn't fit, set the final offset to here & return. 514 * If it didn't fit, set the final offset to here & return.
515 */ 515 */
516 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, 516 if (filldir(dirent, (char *)dep->name, dep->namelen,
517 be64_to_cpu(dep->inumber), DT_UNKNOWN)) { 517 cook & 0x7fffffff, be64_to_cpu(dep->inumber),
518 DT_UNKNOWN)) {
518 *offset = cook & 0x7fffffff; 519 *offset = cook & 0x7fffffff;
519 xfs_da_brelse(NULL, bp); 520 xfs_da_brelse(NULL, bp);
520 return 0; 521 return 0;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 29f484c11b3a..e2d89854ec9e 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1081,7 +1081,7 @@ xfs_dir2_leaf_getdents(
1081 dep = (xfs_dir2_data_entry_t *)ptr; 1081 dep = (xfs_dir2_data_entry_t *)ptr;
1082 length = xfs_dir2_data_entsize(dep->namelen); 1082 length = xfs_dir2_data_entsize(dep->namelen);
1083 1083
1084 if (filldir(dirent, dep->name, dep->namelen, 1084 if (filldir(dirent, (char *)dep->name, dep->namelen,
1085 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1085 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
1086 be64_to_cpu(dep->inumber), DT_UNKNOWN)) 1086 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1087 break; 1087 break;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index ce6e355199b5..78fc4d9ae756 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -65,7 +65,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
65/* 65/*
66 * Log entries from a freespace block. 66 * Log entries from a freespace block.
67 */ 67 */
68void 68STATIC void
69xfs_dir2_free_log_bests( 69xfs_dir2_free_log_bests(
70 xfs_trans_t *tp, /* transaction pointer */ 70 xfs_trans_t *tp, /* transaction pointer */
71 xfs_dabuf_t *bp, /* freespace buffer */ 71 xfs_dabuf_t *bp, /* freespace buffer */
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
index dde72db3d695..82dfe7147195 100644
--- a/fs/xfs/xfs_dir2_node.h
+++ b/fs/xfs/xfs_dir2_node.h
@@ -75,8 +75,6 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
75 return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); 75 return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp));
76} 76}
77 77
78extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
79 int first, int last);
80extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, 78extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
81 struct xfs_dabuf *lbp); 79 struct xfs_dabuf *lbp);
82extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); 80extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 9d4f17a69676..c1a5945d463a 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -782,7 +782,7 @@ xfs_dir2_sf_getdents(
782 } 782 }
783 783
784 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); 784 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
785 if (filldir(dirent, sfep->name, sfep->namelen, 785 if (filldir(dirent, (char *)sfep->name, sfep->namelen,
786 off & 0x7fffffff, ino, DT_UNKNOWN)) { 786 off & 0x7fffffff, ino, DT_UNKNOWN)) {
787 *offset = off & 0x7fffffff; 787 *offset = off & 0x7fffffff;
788 return 0; 788 return 0;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 05a4bdd4be39..6f35ed1b39b9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
82 82
83 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); 83 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
84 log_vector->i_len = size; 84 log_vector->i_len = size;
85 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT); 85 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
86 ASSERT(size >= sizeof(xfs_efi_log_format_t)); 86 ASSERT(size >= sizeof(xfs_efi_log_format_t));
87} 87}
88 88
@@ -406,7 +406,7 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp,
406 406
407 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); 407 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
408 log_vector->i_len = size; 408 log_vector->i_len = size;
409 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT); 409 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
410 ASSERT(size >= sizeof(xfs_efd_log_format_t)); 410 ASSERT(size >= sizeof(xfs_efd_log_format_t));
411} 411}
412 412
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a631e1451abb..390850ee6603 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,6 +140,7 @@ _xfs_filestream_pick_ag(
140 int flags, 140 int flags,
141 xfs_extlen_t minlen) 141 xfs_extlen_t minlen)
142{ 142{
143 int streams, max_streams;
143 int err, trylock, nscan; 144 int err, trylock, nscan;
144 xfs_extlen_t longest, free, minfree, maxfree = 0; 145 xfs_extlen_t longest, free, minfree, maxfree = 0;
145 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 146 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
@@ -155,15 +156,15 @@ _xfs_filestream_pick_ag(
155 trylock = XFS_ALLOC_FLAG_TRYLOCK; 156 trylock = XFS_ALLOC_FLAG_TRYLOCK;
156 157
157 for (nscan = 0; 1; nscan++) { 158 for (nscan = 0; 1; nscan++) {
158 159 pag = xfs_perag_get(mp, ag);
159 TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag)); 160 TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
160
161 pag = mp->m_perag + ag;
162 161
163 if (!pag->pagf_init) { 162 if (!pag->pagf_init) {
164 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); 163 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
165 if (err && !trylock) 164 if (err && !trylock) {
165 xfs_perag_put(pag);
166 return err; 166 return err;
167 }
167 } 168 }
168 169
169 /* Might fail sometimes during the 1st pass with trylock set. */ 170 /* Might fail sometimes during the 1st pass with trylock set. */
@@ -173,6 +174,7 @@ _xfs_filestream_pick_ag(
173 /* Keep track of the AG with the most free blocks. */ 174 /* Keep track of the AG with the most free blocks. */
174 if (pag->pagf_freeblks > maxfree) { 175 if (pag->pagf_freeblks > maxfree) {
175 maxfree = pag->pagf_freeblks; 176 maxfree = pag->pagf_freeblks;
177 max_streams = atomic_read(&pag->pagf_fstrms);
176 max_ag = ag; 178 max_ag = ag;
177 } 179 }
178 180
@@ -195,6 +197,8 @@ _xfs_filestream_pick_ag(
195 197
196 /* Break out, retaining the reference on the AG. */ 198 /* Break out, retaining the reference on the AG. */
197 free = pag->pagf_freeblks; 199 free = pag->pagf_freeblks;
200 streams = atomic_read(&pag->pagf_fstrms);
201 xfs_perag_put(pag);
198 *agp = ag; 202 *agp = ag;
199 break; 203 break;
200 } 204 }
@@ -202,6 +206,7 @@ _xfs_filestream_pick_ag(
202 /* Drop the reference on this AG, it's not usable. */ 206 /* Drop the reference on this AG, it's not usable. */
203 xfs_filestream_put_ag(mp, ag); 207 xfs_filestream_put_ag(mp, ag);
204next_ag: 208next_ag:
209 xfs_perag_put(pag);
205 /* Move to the next AG, wrapping to AG 0 if necessary. */ 210 /* Move to the next AG, wrapping to AG 0 if necessary. */
206 if (++ag >= mp->m_sb.sb_agcount) 211 if (++ag >= mp->m_sb.sb_agcount)
207 ag = 0; 212 ag = 0;
@@ -229,6 +234,7 @@ next_ag:
229 if (max_ag != NULLAGNUMBER) { 234 if (max_ag != NULLAGNUMBER) {
230 xfs_filestream_get_ag(mp, max_ag); 235 xfs_filestream_get_ag(mp, max_ag);
231 TRACE_AG_PICK1(mp, max_ag, maxfree); 236 TRACE_AG_PICK1(mp, max_ag, maxfree);
237 streams = max_streams;
232 free = maxfree; 238 free = maxfree;
233 *agp = max_ag; 239 *agp = max_ag;
234 break; 240 break;
@@ -240,16 +246,14 @@ next_ag:
240 return 0; 246 return 0;
241 } 247 }
242 248
243 TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp), 249 TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
244 free, nscan, flags);
245 250
246 return 0; 251 return 0;
247} 252}
248 253
249/* 254/*
250 * Set the allocation group number for a file or a directory, updating inode 255 * Set the allocation group number for a file or a directory, updating inode
251 * references and per-AG references as appropriate. Must be called with the 256 * references and per-AG references as appropriate.
252 * m_peraglock held in read mode.
253 */ 257 */
254static int 258static int
255_xfs_filestream_update_ag( 259_xfs_filestream_update_ag(
@@ -451,20 +455,6 @@ xfs_filestream_unmount(
451} 455}
452 456
453/* 457/*
454 * If the mount point's m_perag array is going to be reallocated, all
455 * outstanding cache entries must be flushed to avoid accessing reference count
456 * addresses that have been freed. The call to xfs_filestream_flush() must be
457 * made inside the block that holds the m_peraglock in write mode to do the
458 * reallocation.
459 */
460void
461xfs_filestream_flush(
462 xfs_mount_t *mp)
463{
464 xfs_mru_cache_flush(mp->m_filestream);
465}
466
467/*
468 * Return the AG of the filestream the file or directory belongs to, or 458 * Return the AG of the filestream the file or directory belongs to, or
469 * NULLAGNUMBER otherwise. 459 * NULLAGNUMBER otherwise.
470 */ 460 */
@@ -526,7 +516,6 @@ xfs_filestream_associate(
526 516
527 mp = pip->i_mount; 517 mp = pip->i_mount;
528 cache = mp->m_filestream; 518 cache = mp->m_filestream;
529 down_read(&mp->m_peraglock);
530 519
531 /* 520 /*
532 * We have a problem, Houston. 521 * We have a problem, Houston.
@@ -543,10 +532,8 @@ xfs_filestream_associate(
543 * 532 *
544 * So, if we can't get the iolock without sleeping then just give up 533 * So, if we can't get the iolock without sleeping then just give up
545 */ 534 */
546 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { 535 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
547 up_read(&mp->m_peraglock);
548 return 1; 536 return 1;
549 }
550 537
551 /* If the parent directory is already in the cache, use its AG. */ 538 /* If the parent directory is already in the cache, use its AG. */
552 item = xfs_mru_cache_lookup(cache, pip->i_ino); 539 item = xfs_mru_cache_lookup(cache, pip->i_ino);
@@ -601,7 +588,6 @@ exit_did_pick:
601 588
602exit: 589exit:
603 xfs_iunlock(pip, XFS_IOLOCK_EXCL); 590 xfs_iunlock(pip, XFS_IOLOCK_EXCL);
604 up_read(&mp->m_peraglock);
605 return -err; 591 return -err;
606} 592}
607 593
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 4aba67c5f64f..260f757bbc5d 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -79,12 +79,21 @@ extern ktrace_t *xfs_filestreams_trace_buf;
79 * the cache that reference per-ag array elements that have since been 79 * the cache that reference per-ag array elements that have since been
80 * reallocated. 80 * reallocated.
81 */ 81 */
82/*
83 * xfs_filestream_peek_ag is only used in tracing code
84 */
82static inline int 85static inline int
83xfs_filestream_peek_ag( 86xfs_filestream_peek_ag(
84 xfs_mount_t *mp, 87 xfs_mount_t *mp,
85 xfs_agnumber_t agno) 88 xfs_agnumber_t agno)
86{ 89{
87 return atomic_read(&mp->m_perag[agno].pagf_fstrms); 90 struct xfs_perag *pag;
91 int ret;
92
93 pag = xfs_perag_get(mp, agno);
94 ret = atomic_read(&pag->pagf_fstrms);
95 xfs_perag_put(pag);
96 return ret;
88} 97}
89 98
90static inline int 99static inline int
@@ -92,7 +101,13 @@ xfs_filestream_get_ag(
92 xfs_mount_t *mp, 101 xfs_mount_t *mp,
93 xfs_agnumber_t agno) 102 xfs_agnumber_t agno)
94{ 103{
95 return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); 104 struct xfs_perag *pag;
105 int ret;
106
107 pag = xfs_perag_get(mp, agno);
108 ret = atomic_inc_return(&pag->pagf_fstrms);
109 xfs_perag_put(pag);
110 return ret;
96} 111}
97 112
98static inline int 113static inline int
@@ -100,7 +115,13 @@ xfs_filestream_put_ag(
100 xfs_mount_t *mp, 115 xfs_mount_t *mp,
101 xfs_agnumber_t agno) 116 xfs_agnumber_t agno)
102{ 117{
103 return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); 118 struct xfs_perag *pag;
119 int ret;
120
121 pag = xfs_perag_get(mp, agno);
122 ret = atomic_dec_return(&pag->pagf_fstrms);
123 xfs_perag_put(pag);
124 return ret;
104} 125}
105 126
106/* allocation selection flags */ 127/* allocation selection flags */
@@ -114,7 +135,6 @@ int xfs_filestream_init(void);
114void xfs_filestream_uninit(void); 135void xfs_filestream_uninit(void);
115int xfs_filestream_mount(struct xfs_mount *mp); 136int xfs_filestream_mount(struct xfs_mount *mp);
116void xfs_filestream_unmount(struct xfs_mount *mp); 137void xfs_filestream_unmount(struct xfs_mount *mp);
117void xfs_filestream_flush(struct xfs_mount *mp);
118xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); 138xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
119int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); 139int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
120void xfs_filestream_deassociate(struct xfs_inode *ip); 140void xfs_filestream_deassociate(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a13919a6a364..37a6f62c57b6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,27 +167,14 @@ xfs_growfs_data_private(
167 } 167 }
168 new = nb - mp->m_sb.sb_dblocks; 168 new = nb - mp->m_sb.sb_dblocks;
169 oagcount = mp->m_sb.sb_agcount; 169 oagcount = mp->m_sb.sb_agcount;
170 if (nagcount > oagcount) {
171 void *new_perag, *old_perag;
172
173 xfs_filestream_flush(mp);
174
175 new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
176 KM_MAYFAIL);
177 if (!new_perag)
178 return XFS_ERROR(ENOMEM);
179
180 down_write(&mp->m_peraglock);
181 memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
182 old_perag = mp->m_perag;
183 mp->m_perag = new_perag;
184
185 mp->m_flags |= XFS_MOUNT_32BITINODES;
186 nagimax = xfs_initialize_perag(mp, nagcount);
187 up_write(&mp->m_peraglock);
188 170
189 kmem_free(old_perag); 171 /* allocate the new per-ag structures */
172 if (nagcount > oagcount) {
173 error = xfs_initialize_perag(mp, nagcount, &nagimax);
174 if (error)
175 return error;
190 } 176 }
177
191 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); 178 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
192 tp->t_flags |= XFS_TRANS_RESERVE; 179 tp->t_flags |= XFS_TRANS_RESERVE;
193 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), 180 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
@@ -196,6 +183,11 @@ xfs_growfs_data_private(
196 return error; 183 return error;
197 } 184 }
198 185
186 /*
187 * Write new AG headers to disk. Non-transactional, but written
188 * synchronously so they are completed prior to the growfs transaction
189 * being logged.
190 */
199 nfree = 0; 191 nfree = 0;
200 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { 192 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
201 /* 193 /*
@@ -359,6 +351,12 @@ xfs_growfs_data_private(
359 goto error0; 351 goto error0;
360 } 352 }
361 } 353 }
354
355 /*
356 * Update changed superblock fields transactionally. These are not
357 * seen by the rest of the world until the transaction commit applies
358 * them atomically to the superblock.
359 */
362 if (nagcount > oagcount) 360 if (nagcount > oagcount)
363 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); 361 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
364 if (nb > mp->m_sb.sb_dblocks) 362 if (nb > mp->m_sb.sb_dblocks)
@@ -369,9 +367,9 @@ xfs_growfs_data_private(
369 if (dpct) 367 if (dpct)
370 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); 368 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
371 error = xfs_trans_commit(tp, 0); 369 error = xfs_trans_commit(tp, 0);
372 if (error) { 370 if (error)
373 return error; 371 return error;
374 } 372
375 /* New allocation groups fully initialized, so update mount struct */ 373 /* New allocation groups fully initialized, so update mount struct */
376 if (nagimax) 374 if (nagimax)
377 mp->m_maxagi = nagimax; 375 mp->m_maxagi = nagimax;
@@ -381,6 +379,8 @@ xfs_growfs_data_private(
381 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 379 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
382 } else 380 } else
383 mp->m_maxicount = 0; 381 mp->m_maxicount = 0;
382
383 /* update secondary superblocks. */
384 for (agno = 1; agno < nagcount; agno++) { 384 for (agno = 1; agno < nagcount; agno++) {
385 error = xfs_read_buf(mp, mp->m_ddev_targp, 385 error = xfs_read_buf(mp, mp->m_ddev_targp,
386 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 386 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index cb907ba69c4c..9d884c127bb9 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -205,7 +205,7 @@ xfs_ialloc_inode_init(
205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); 205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
207 mp->m_bsize * blks_per_cluster, 207 mp->m_bsize * blks_per_cluster,
208 XFS_BUF_LOCK); 208 XBF_LOCK);
209 ASSERT(fbuf); 209 ASSERT(fbuf);
210 ASSERT(!XFS_BUF_GETERROR(fbuf)); 210 ASSERT(!XFS_BUF_GETERROR(fbuf));
211 211
@@ -253,6 +253,7 @@ xfs_ialloc_ag_alloc(
253 xfs_agino_t thisino; /* current inode number, for loop */ 253 xfs_agino_t thisino; /* current inode number, for loop */
254 int isaligned = 0; /* inode allocation at stripe unit */ 254 int isaligned = 0; /* inode allocation at stripe unit */
255 /* boundary */ 255 /* boundary */
256 struct xfs_perag *pag;
256 257
257 args.tp = tp; 258 args.tp = tp;
258 args.mp = tp->t_mountp; 259 args.mp = tp->t_mountp;
@@ -382,9 +383,9 @@ xfs_ialloc_ag_alloc(
382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); 383 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
383 be32_add_cpu(&agi->agi_count, newlen); 384 be32_add_cpu(&agi->agi_count, newlen);
384 be32_add_cpu(&agi->agi_freecount, newlen); 385 be32_add_cpu(&agi->agi_freecount, newlen);
385 down_read(&args.mp->m_peraglock); 386 pag = xfs_perag_get(args.mp, agno);
386 args.mp->m_perag[agno].pagi_freecount += newlen; 387 pag->pagi_freecount += newlen;
387 up_read(&args.mp->m_peraglock); 388 xfs_perag_put(pag);
388 agi->agi_newino = cpu_to_be32(newino); 389 agi->agi_newino = cpu_to_be32(newino);
389 390
390 /* 391 /*
@@ -486,9 +487,8 @@ xfs_ialloc_ag_select(
486 */ 487 */
487 agno = pagno; 488 agno = pagno;
488 flags = XFS_ALLOC_FLAG_TRYLOCK; 489 flags = XFS_ALLOC_FLAG_TRYLOCK;
489 down_read(&mp->m_peraglock);
490 for (;;) { 490 for (;;) {
491 pag = &mp->m_perag[agno]; 491 pag = xfs_perag_get(mp, agno);
492 if (!pag->pagi_init) { 492 if (!pag->pagi_init) {
493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { 493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
494 agbp = NULL; 494 agbp = NULL;
@@ -527,7 +527,7 @@ xfs_ialloc_ag_select(
527 agbp = NULL; 527 agbp = NULL;
528 goto nextag; 528 goto nextag;
529 } 529 }
530 up_read(&mp->m_peraglock); 530 xfs_perag_put(pag);
531 return agbp; 531 return agbp;
532 } 532 }
533 } 533 }
@@ -535,22 +535,19 @@ unlock_nextag:
535 if (agbp) 535 if (agbp)
536 xfs_trans_brelse(tp, agbp); 536 xfs_trans_brelse(tp, agbp);
537nextag: 537nextag:
538 xfs_perag_put(pag);
538 /* 539 /*
539 * No point in iterating over the rest, if we're shutting 540 * No point in iterating over the rest, if we're shutting
540 * down. 541 * down.
541 */ 542 */
542 if (XFS_FORCED_SHUTDOWN(mp)) { 543 if (XFS_FORCED_SHUTDOWN(mp))
543 up_read(&mp->m_peraglock);
544 return NULL; 544 return NULL;
545 }
546 agno++; 545 agno++;
547 if (agno >= agcount) 546 if (agno >= agcount)
548 agno = 0; 547 agno = 0;
549 if (agno == pagno) { 548 if (agno == pagno) {
550 if (flags == 0) { 549 if (flags == 0)
551 up_read(&mp->m_peraglock);
552 return NULL; 550 return NULL;
553 }
554 flags = 0; 551 flags = 0;
555 } 552 }
556 } 553 }
@@ -672,6 +669,7 @@ xfs_dialloc(
672 xfs_agnumber_t tagno; /* testing allocation group number */ 669 xfs_agnumber_t tagno; /* testing allocation group number */
673 xfs_btree_cur_t *tcur; /* temp cursor */ 670 xfs_btree_cur_t *tcur; /* temp cursor */
674 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ 671 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */
672 struct xfs_perag *pag;
675 673
676 674
677 if (*IO_agbp == NULL) { 675 if (*IO_agbp == NULL) {
@@ -771,13 +769,13 @@ nextag:
771 *inop = NULLFSINO; 769 *inop = NULLFSINO;
772 return noroom ? ENOSPC : 0; 770 return noroom ? ENOSPC : 0;
773 } 771 }
774 down_read(&mp->m_peraglock); 772 pag = xfs_perag_get(mp, tagno);
775 if (mp->m_perag[tagno].pagi_inodeok == 0) { 773 if (pag->pagi_inodeok == 0) {
776 up_read(&mp->m_peraglock); 774 xfs_perag_put(pag);
777 goto nextag; 775 goto nextag;
778 } 776 }
779 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); 777 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
780 up_read(&mp->m_peraglock); 778 xfs_perag_put(pag);
781 if (error) 779 if (error)
782 goto nextag; 780 goto nextag;
783 agi = XFS_BUF_TO_AGI(agbp); 781 agi = XFS_BUF_TO_AGI(agbp);
@@ -790,6 +788,7 @@ nextag:
790 */ 788 */
791 agno = tagno; 789 agno = tagno;
792 *IO_agbp = NULL; 790 *IO_agbp = NULL;
791 pag = xfs_perag_get(mp, agno);
793 792
794 restart_pagno: 793 restart_pagno:
795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 794 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
@@ -808,7 +807,6 @@ nextag:
808 * If in the same AG as the parent, try to get near the parent. 807 * If in the same AG as the parent, try to get near the parent.
809 */ 808 */
810 if (pagno == agno) { 809 if (pagno == agno) {
811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */ 810 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */ 811 int doneright; /* done, to the right */
814 int searchdistance = 10; 812 int searchdistance = 10;
@@ -1006,9 +1004,7 @@ alloc_inode:
1006 goto error0; 1004 goto error0;
1007 be32_add_cpu(&agi->agi_freecount, -1); 1005 be32_add_cpu(&agi->agi_freecount, -1);
1008 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1006 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1009 down_read(&mp->m_peraglock); 1007 pag->pagi_freecount--;
1010 mp->m_perag[tagno].pagi_freecount--;
1011 up_read(&mp->m_peraglock);
1012 1008
1013 error = xfs_check_agi_freecount(cur, agi); 1009 error = xfs_check_agi_freecount(cur, agi);
1014 if (error) 1010 if (error)
@@ -1016,12 +1012,14 @@ alloc_inode:
1016 1012
1017 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1013 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1018 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1014 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1015 xfs_perag_put(pag);
1019 *inop = ino; 1016 *inop = ino;
1020 return 0; 1017 return 0;
1021error1: 1018error1:
1022 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 1019 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1023error0: 1020error0:
1024 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 1021 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1022 xfs_perag_put(pag);
1025 return error; 1023 return error;
1026} 1024}
1027 1025
@@ -1052,6 +1050,7 @@ xfs_difree(
1052 xfs_mount_t *mp; /* mount structure for filesystem */ 1050 xfs_mount_t *mp; /* mount structure for filesystem */
1053 int off; /* offset of inode in inode chunk */ 1051 int off; /* offset of inode in inode chunk */
1054 xfs_inobt_rec_incore_t rec; /* btree record */ 1052 xfs_inobt_rec_incore_t rec; /* btree record */
1053 struct xfs_perag *pag;
1055 1054
1056 mp = tp->t_mountp; 1055 mp = tp->t_mountp;
1057 1056
@@ -1088,9 +1087,7 @@ xfs_difree(
1088 /* 1087 /*
1089 * Get the allocation group header. 1088 * Get the allocation group header.
1090 */ 1089 */
1091 down_read(&mp->m_peraglock);
1092 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1090 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1093 up_read(&mp->m_peraglock);
1094 if (error) { 1091 if (error) {
1095 cmn_err(CE_WARN, 1092 cmn_err(CE_WARN,
1096 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1093 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
@@ -1157,9 +1154,9 @@ xfs_difree(
1157 be32_add_cpu(&agi->agi_count, -ilen); 1154 be32_add_cpu(&agi->agi_count, -ilen);
1158 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1155 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1159 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1156 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1160 down_read(&mp->m_peraglock); 1157 pag = xfs_perag_get(mp, agno);
1161 mp->m_perag[agno].pagi_freecount -= ilen - 1; 1158 pag->pagi_freecount -= ilen - 1;
1162 up_read(&mp->m_peraglock); 1159 xfs_perag_put(pag);
1163 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1160 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1164 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1161 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1165 1162
@@ -1188,9 +1185,9 @@ xfs_difree(
1188 */ 1185 */
1189 be32_add_cpu(&agi->agi_freecount, 1); 1186 be32_add_cpu(&agi->agi_freecount, 1);
1190 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1187 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1191 down_read(&mp->m_peraglock); 1188 pag = xfs_perag_get(mp, agno);
1192 mp->m_perag[agno].pagi_freecount++; 1189 pag->pagi_freecount++;
1193 up_read(&mp->m_peraglock); 1190 xfs_perag_put(pag);
1194 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1191 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1195 } 1192 }
1196 1193
@@ -1312,9 +1309,7 @@ xfs_imap(
1312 xfs_buf_t *agbp; /* agi buffer */ 1309 xfs_buf_t *agbp; /* agi buffer */
1313 int i; /* temp state */ 1310 int i; /* temp state */
1314 1311
1315 down_read(&mp->m_peraglock);
1316 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1312 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1317 up_read(&mp->m_peraglock);
1318 if (error) { 1313 if (error) {
1319 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1314 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1320 "xfs_ialloc_read_agi() returned " 1315 "xfs_ialloc_read_agi() returned "
@@ -1379,7 +1374,6 @@ xfs_imap(
1379 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1374 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1380 return XFS_ERROR(EINVAL); 1375 return XFS_ERROR(EINVAL);
1381 } 1376 }
1382
1383 return 0; 1377 return 0;
1384} 1378}
1385 1379
@@ -1523,8 +1517,7 @@ xfs_ialloc_read_agi(
1523 return error; 1517 return error;
1524 1518
1525 agi = XFS_BUF_TO_AGI(*bpp); 1519 agi = XFS_BUF_TO_AGI(*bpp);
1526 pag = &mp->m_perag[agno]; 1520 pag = xfs_perag_get(mp, agno);
1527
1528 if (!pag->pagi_init) { 1521 if (!pag->pagi_init) {
1529 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1522 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1530 pag->pagi_count = be32_to_cpu(agi->agi_count); 1523 pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -1537,6 +1530,7 @@ xfs_ialloc_read_agi(
1537 */ 1530 */
1538 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || 1531 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1539 XFS_FORCED_SHUTDOWN(mp)); 1532 XFS_FORCED_SHUTDOWN(mp));
1533 xfs_perag_put(pag);
1540 return 0; 1534 return 0;
1541} 1535}
1542 1536
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 155e798f30a1..e281eb4a1c49 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -374,7 +374,7 @@ xfs_iget(
374 return EINVAL; 374 return EINVAL;
375 375
376 /* get the perag structure and ensure that it's inode capable */ 376 /* get the perag structure and ensure that it's inode capable */
377 pag = xfs_get_perag(mp, ino); 377 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
378 if (!pag->pagi_inodeok) 378 if (!pag->pagi_inodeok)
379 return EINVAL; 379 return EINVAL;
380 ASSERT(pag->pag_ici_init); 380 ASSERT(pag->pag_ici_init);
@@ -398,7 +398,7 @@ again:
398 if (error) 398 if (error)
399 goto out_error_or_again; 399 goto out_error_or_again;
400 } 400 }
401 xfs_put_perag(mp, pag); 401 xfs_perag_put(pag);
402 402
403 *ipp = ip; 403 *ipp = ip;
404 404
@@ -417,7 +417,7 @@ out_error_or_again:
417 delay(1); 417 delay(1);
418 goto again; 418 goto again;
419 } 419 }
420 xfs_put_perag(mp, pag); 420 xfs_perag_put(pag);
421 return error; 421 return error;
422} 422}
423 423
@@ -488,12 +488,12 @@ xfs_ireclaim(
488 * added to the tree assert that it's been there before to catch 488 * added to the tree assert that it's been there before to catch
489 * problems with the inode life time early on. 489 * problems with the inode life time early on.
490 */ 490 */
491 pag = xfs_get_perag(mp, ip->i_ino); 491 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
492 write_lock(&pag->pag_ici_lock); 492 write_lock(&pag->pag_ici_lock);
493 if (!radix_tree_delete(&pag->pag_ici_root, agino)) 493 if (!radix_tree_delete(&pag->pag_ici_root, agino))
494 ASSERT(0); 494 ASSERT(0);
495 write_unlock(&pag->pag_ici_lock); 495 write_unlock(&pag->pag_ici_lock);
496 xfs_put_perag(mp, pag); 496 xfs_perag_put(pag);
497 497
498 /* 498 /*
499 * Here we do an (almost) spurious inode lock in order to coordinate 499 * Here we do an (almost) spurious inode lock in order to coordinate
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ef77fd88c8e3..fa31360046d4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -151,7 +151,7 @@ xfs_imap_to_bp(
151 "an error %d on %s. Returning error.", 151 "an error %d on %s. Returning error.",
152 error, mp->m_fsname); 152 error, mp->m_fsname);
153 } else { 153 } else {
154 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 154 ASSERT(buf_flags & XBF_TRYLOCK);
155 } 155 }
156 return error; 156 return error;
157 } 157 }
@@ -239,7 +239,7 @@ xfs_inotobp(
239 if (error) 239 if (error)
240 return error; 240 return error;
241 241
242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); 242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
243 if (error) 243 if (error)
244 return error; 244 return error;
245 245
@@ -285,7 +285,7 @@ xfs_itobp(
285 return error; 285 return error;
286 286
287 if (!bp) { 287 if (!bp) {
288 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 288 ASSERT(buf_flags & XBF_TRYLOCK);
289 ASSERT(tp == NULL); 289 ASSERT(tp == NULL);
290 *bpp = NULL; 290 *bpp = NULL;
291 return EAGAIN; 291 return EAGAIN;
@@ -807,7 +807,7 @@ xfs_iread(
807 * Get pointers to the on-disk inode and the buffer containing it. 807 * Get pointers to the on-disk inode and the buffer containing it.
808 */ 808 */
809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
810 XFS_BUF_LOCK, iget_flags); 810 XBF_LOCK, iget_flags);
811 if (error) 811 if (error)
812 return error; 812 return error;
813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -1751,7 +1751,7 @@ xfs_iunlink(
1751 * Here we put the head pointer into our next pointer, 1751 * Here we put the head pointer into our next pointer,
1752 * and then we fall through to point the head at us. 1752 * and then we fall through to point the head at us.
1753 */ 1753 */
1754 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1754 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1755 if (error) 1755 if (error)
1756 return error; 1756 return error;
1757 1757
@@ -1833,7 +1833,7 @@ xfs_iunlink_remove(
1833 * of dealing with the buffer when there is no need to 1833 * of dealing with the buffer when there is no need to
1834 * change it. 1834 * change it.
1835 */ 1835 */
1836 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1836 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1837 if (error) { 1837 if (error) {
1838 cmn_err(CE_WARN, 1838 cmn_err(CE_WARN,
1839 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1839 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1895,7 +1895,7 @@ xfs_iunlink_remove(
1895 * Now last_ibp points to the buffer previous to us on 1895 * Now last_ibp points to the buffer previous to us on
1896 * the unlinked list. Pull us from the list. 1896 * the unlinked list. Pull us from the list.
1897 */ 1897 */
1898 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1898 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1899 if (error) { 1899 if (error) {
1900 cmn_err(CE_WARN, 1900 cmn_err(CE_WARN,
1901 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1901 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1946,8 +1946,9 @@ xfs_ifree_cluster(
1946 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip, **ip_found;
1947 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1949 xfs_perag_t *pag = xfs_get_perag(mp, inum); 1949 struct xfs_perag *pag;
1950 1950
1951 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1951 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 1952 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1952 blks_per_cluster = 1; 1953 blks_per_cluster = 1;
1953 ninodes = mp->m_sb.sb_inopblock; 1954 ninodes = mp->m_sb.sb_inopblock;
@@ -2039,7 +2040,7 @@ xfs_ifree_cluster(
2039 2040
2040 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2041 mp->m_bsize * blks_per_cluster, 2042 mp->m_bsize * blks_per_cluster,
2042 XFS_BUF_LOCK); 2043 XBF_LOCK);
2043 2044
2044 pre_flushed = 0; 2045 pre_flushed = 0;
2045 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -2088,7 +2089,7 @@ xfs_ifree_cluster(
2088 } 2089 }
2089 2090
2090 kmem_free(ip_found); 2091 kmem_free(ip_found);
2091 xfs_put_perag(mp, pag); 2092 xfs_perag_put(pag);
2092} 2093}
2093 2094
2094/* 2095/*
@@ -2150,7 +2151,7 @@ xfs_ifree(
2150 2151
2151 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2152 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2152 2153
2153 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 2154 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
2154 if (error) 2155 if (error)
2155 return error; 2156 return error;
2156 2157
@@ -2483,13 +2484,16 @@ __xfs_iunpin_wait(
2483 return; 2484 return;
2484 2485
2485 /* Give the log a push to start the unpinning I/O */ 2486 /* Give the log a push to start the unpinning I/O */
2486 xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? 2487 if (iip && iip->ili_last_lsn)
2487 iip->ili_last_lsn : 0, XFS_LOG_FORCE); 2488 xfs_log_force_lsn(ip->i_mount, iip->ili_last_lsn, 0);
2489 else
2490 xfs_log_force(ip->i_mount, 0);
2491
2488 if (wait) 2492 if (wait)
2489 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); 2493 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2490} 2494}
2491 2495
2492static inline void 2496void
2493xfs_iunpin_wait( 2497xfs_iunpin_wait(
2494 xfs_inode_t *ip) 2498 xfs_inode_t *ip)
2495{ 2499{
@@ -2675,7 +2679,7 @@ xfs_iflush_cluster(
2675 xfs_buf_t *bp) 2679 xfs_buf_t *bp)
2676{ 2680{
2677 xfs_mount_t *mp = ip->i_mount; 2681 xfs_mount_t *mp = ip->i_mount;
2678 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 2682 struct xfs_perag *pag;
2679 unsigned long first_index, mask; 2683 unsigned long first_index, mask;
2680 unsigned long inodes_per_cluster; 2684 unsigned long inodes_per_cluster;
2681 int ilist_size; 2685 int ilist_size;
@@ -2686,6 +2690,7 @@ xfs_iflush_cluster(
2686 int bufwasdelwri; 2690 int bufwasdelwri;
2687 int i; 2691 int i;
2688 2692
2693 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2689 ASSERT(pag->pagi_inodeok); 2694 ASSERT(pag->pagi_inodeok);
2690 ASSERT(pag->pag_ici_init); 2695 ASSERT(pag->pag_ici_init);
2691 2696
@@ -2693,7 +2698,7 @@ xfs_iflush_cluster(
2693 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2698 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2694 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2699 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2695 if (!ilist) 2700 if (!ilist)
2696 return 0; 2701 goto out_put;
2697 2702
2698 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2703 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2699 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2704 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
@@ -2762,6 +2767,8 @@ xfs_iflush_cluster(
2762out_free: 2767out_free:
2763 read_unlock(&pag->pag_ici_lock); 2768 read_unlock(&pag->pag_ici_lock);
2764 kmem_free(ilist); 2769 kmem_free(ilist);
2770out_put:
2771 xfs_perag_put(pag);
2765 return 0; 2772 return 0;
2766 2773
2767 2774
@@ -2805,6 +2812,7 @@ cluster_corrupt_out:
2805 */ 2812 */
2806 xfs_iflush_abort(iq); 2813 xfs_iflush_abort(iq);
2807 kmem_free(ilist); 2814 kmem_free(ilist);
2815 xfs_perag_put(pag);
2808 return XFS_ERROR(EFSCORRUPTED); 2816 return XFS_ERROR(EFSCORRUPTED);
2809} 2817}
2810 2818
@@ -2827,8 +2835,6 @@ xfs_iflush(
2827 xfs_dinode_t *dip; 2835 xfs_dinode_t *dip;
2828 xfs_mount_t *mp; 2836 xfs_mount_t *mp;
2829 int error; 2837 int error;
2830 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
2831 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
2832 2838
2833 XFS_STATS_INC(xs_iflush_count); 2839 XFS_STATS_INC(xs_iflush_count);
2834 2840
@@ -2841,15 +2847,6 @@ xfs_iflush(
2841 mp = ip->i_mount; 2847 mp = ip->i_mount;
2842 2848
2843 /* 2849 /*
2844 * If the inode isn't dirty, then just release the inode flush lock and
2845 * do nothing.
2846 */
2847 if (xfs_inode_clean(ip)) {
2848 xfs_ifunlock(ip);
2849 return 0;
2850 }
2851
2852 /*
2853 * We can't flush the inode until it is unpinned, so wait for it if we 2850 * We can't flush the inode until it is unpinned, so wait for it if we
2854 * are allowed to block. We know noone new can pin it, because we are 2851 * are allowed to block. We know noone new can pin it, because we are
2855 * holding the inode lock shared and you need to hold it exclusively to 2852 * holding the inode lock shared and you need to hold it exclusively to
@@ -2860,7 +2857,7 @@ xfs_iflush(
2860 * in the same cluster are dirty, they will probably write the inode 2857 * in the same cluster are dirty, they will probably write the inode
2861 * out for us if they occur after the log force completes. 2858 * out for us if they occur after the log force completes.
2862 */ 2859 */
2863 if (noblock && xfs_ipincount(ip)) { 2860 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2864 xfs_iunpin_nowait(ip); 2861 xfs_iunpin_nowait(ip);
2865 xfs_ifunlock(ip); 2862 xfs_ifunlock(ip);
2866 return EAGAIN; 2863 return EAGAIN;
@@ -2894,60 +2891,10 @@ xfs_iflush(
2894 } 2891 }
2895 2892
2896 /* 2893 /*
2897 * Decide how buffer will be flushed out. This is done before
2898 * the call to xfs_iflush_int because this field is zeroed by it.
2899 */
2900 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
2901 /*
2902 * Flush out the inode buffer according to the directions
2903 * of the caller. In the cases where the caller has given
2904 * us a choice choose the non-delwri case. This is because
2905 * the inode is in the AIL and we need to get it out soon.
2906 */
2907 switch (flags) {
2908 case XFS_IFLUSH_SYNC:
2909 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2910 flags = 0;
2911 break;
2912 case XFS_IFLUSH_ASYNC_NOBLOCK:
2913 case XFS_IFLUSH_ASYNC:
2914 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2915 flags = INT_ASYNC;
2916 break;
2917 case XFS_IFLUSH_DELWRI:
2918 flags = INT_DELWRI;
2919 break;
2920 default:
2921 ASSERT(0);
2922 flags = 0;
2923 break;
2924 }
2925 } else {
2926 switch (flags) {
2927 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2928 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2929 case XFS_IFLUSH_DELWRI:
2930 flags = INT_DELWRI;
2931 break;
2932 case XFS_IFLUSH_ASYNC_NOBLOCK:
2933 case XFS_IFLUSH_ASYNC:
2934 flags = INT_ASYNC;
2935 break;
2936 case XFS_IFLUSH_SYNC:
2937 flags = 0;
2938 break;
2939 default:
2940 ASSERT(0);
2941 flags = 0;
2942 break;
2943 }
2944 }
2945
2946 /*
2947 * Get the buffer containing the on-disk inode. 2894 * Get the buffer containing the on-disk inode.
2948 */ 2895 */
2949 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2896 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2950 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 2897 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
2951 if (error || !bp) { 2898 if (error || !bp) {
2952 xfs_ifunlock(ip); 2899 xfs_ifunlock(ip);
2953 return error; 2900 return error;
@@ -2965,7 +2912,7 @@ xfs_iflush(
2965 * get stuck waiting in the write for too long. 2912 * get stuck waiting in the write for too long.
2966 */ 2913 */
2967 if (XFS_BUF_ISPINNED(bp)) 2914 if (XFS_BUF_ISPINNED(bp))
2968 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 2915 xfs_log_force(mp, 0);
2969 2916
2970 /* 2917 /*
2971 * inode clustering: 2918 * inode clustering:
@@ -2975,13 +2922,10 @@ xfs_iflush(
2975 if (error) 2922 if (error)
2976 goto cluster_corrupt_out; 2923 goto cluster_corrupt_out;
2977 2924
2978 if (flags & INT_DELWRI) { 2925 if (flags & SYNC_WAIT)
2979 xfs_bdwrite(mp, bp);
2980 } else if (flags & INT_ASYNC) {
2981 error = xfs_bawrite(mp, bp);
2982 } else {
2983 error = xfs_bwrite(mp, bp); 2926 error = xfs_bwrite(mp, bp);
2984 } 2927 else
2928 xfs_bdwrite(mp, bp);
2985 return error; 2929 return error;
2986 2930
2987corrupt_out: 2931corrupt_out:
@@ -3016,16 +2960,6 @@ xfs_iflush_int(
3016 iip = ip->i_itemp; 2960 iip = ip->i_itemp;
3017 mp = ip->i_mount; 2961 mp = ip->i_mount;
3018 2962
3019
3020 /*
3021 * If the inode isn't dirty, then just release the inode
3022 * flush lock and do nothing.
3023 */
3024 if (xfs_inode_clean(ip)) {
3025 xfs_ifunlock(ip);
3026 return 0;
3027 }
3028
3029 /* set *dip = inode's place in the buffer */ 2963 /* set *dip = inode's place in the buffer */
3030 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2964 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3031 2965
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ec1f28c4fc4f..6c912b027596 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
421 421
422/* 422/*
423 * Flags for xfs_iflush()
424 */
425#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
426#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
427#define XFS_IFLUSH_SYNC 3
428#define XFS_IFLUSH_ASYNC 4
429#define XFS_IFLUSH_DELWRI 5
430#define XFS_IFLUSH_ASYNC_NOBLOCK 6
431
432/*
433 * Flags for xfs_itruncate_start(). 423 * Flags for xfs_itruncate_start().
434 */ 424 */
435#define XFS_ITRUNC_DEFINITE 0x1 425#define XFS_ITRUNC_DEFINITE 0x1
@@ -483,6 +473,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
483void xfs_iext_realloc(xfs_inode_t *, int, int); 473void xfs_iext_realloc(xfs_inode_t *, int, int);
484void xfs_ipin(xfs_inode_t *); 474void xfs_ipin(xfs_inode_t *);
485void xfs_iunpin(xfs_inode_t *); 475void xfs_iunpin(xfs_inode_t *);
476void xfs_iunpin_wait(xfs_inode_t *);
486int xfs_iflush(xfs_inode_t *, uint); 477int xfs_iflush(xfs_inode_t *, uint);
487void xfs_ichgtime(xfs_inode_t *, int); 478void xfs_ichgtime(xfs_inode_t *, int);
488void xfs_lock_inodes(xfs_inode_t **, int, uint); 479void xfs_lock_inodes(xfs_inode_t **, int, uint);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f38855d21ea5..d4dc063111f8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -228,7 +228,7 @@ xfs_inode_item_format(
228 228
229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format; 229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
230 vecp->i_len = sizeof(xfs_inode_log_format_t); 230 vecp->i_len = sizeof(xfs_inode_log_format_t);
231 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT); 231 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
232 vecp++; 232 vecp++;
233 nvecs = 1; 233 nvecs = 1;
234 234
@@ -279,7 +279,7 @@ xfs_inode_item_format(
279 279
280 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 280 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
281 vecp->i_len = sizeof(struct xfs_icdinode); 281 vecp->i_len = sizeof(struct xfs_icdinode);
282 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 282 vecp->i_type = XLOG_REG_TYPE_ICORE;
283 vecp++; 283 vecp++;
284 nvecs++; 284 nvecs++;
285 iip->ili_format.ilf_fields |= XFS_ILOG_CORE; 285 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
@@ -336,7 +336,7 @@ xfs_inode_item_format(
336 vecp->i_addr = 336 vecp->i_addr =
337 (char *)(ip->i_df.if_u1.if_extents); 337 (char *)(ip->i_df.if_u1.if_extents);
338 vecp->i_len = ip->i_df.if_bytes; 338 vecp->i_len = ip->i_df.if_bytes;
339 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 339 vecp->i_type = XLOG_REG_TYPE_IEXT;
340 } else 340 } else
341#endif 341#endif
342 { 342 {
@@ -355,7 +355,7 @@ xfs_inode_item_format(
355 vecp->i_addr = (xfs_caddr_t)ext_buffer; 355 vecp->i_addr = (xfs_caddr_t)ext_buffer;
356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
357 XFS_DATA_FORK); 357 XFS_DATA_FORK);
358 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 358 vecp->i_type = XLOG_REG_TYPE_IEXT;
359 } 359 }
360 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
361 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -373,7 +373,7 @@ xfs_inode_item_format(
373 ASSERT(ip->i_df.if_broot != NULL); 373 ASSERT(ip->i_df.if_broot != NULL);
374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; 374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
375 vecp->i_len = ip->i_df.if_broot_bytes; 375 vecp->i_len = ip->i_df.if_broot_bytes;
376 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT); 376 vecp->i_type = XLOG_REG_TYPE_IBROOT;
377 vecp++; 377 vecp++;
378 nvecs++; 378 nvecs++;
379 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 379 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
@@ -399,7 +399,7 @@ xfs_inode_item_format(
399 ASSERT((ip->i_df.if_real_bytes == 0) || 399 ASSERT((ip->i_df.if_real_bytes == 0) ||
400 (ip->i_df.if_real_bytes == data_bytes)); 400 (ip->i_df.if_real_bytes == data_bytes));
401 vecp->i_len = (int)data_bytes; 401 vecp->i_len = (int)data_bytes;
402 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL); 402 vecp->i_type = XLOG_REG_TYPE_ILOCAL;
403 vecp++; 403 vecp++;
404 nvecs++; 404 nvecs++;
405 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 405 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
@@ -477,7 +477,7 @@ xfs_inode_item_format(
477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
478 XFS_ATTR_FORK); 478 XFS_ATTR_FORK);
479#endif 479#endif
480 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT); 480 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
481 iip->ili_format.ilf_asize = vecp->i_len; 481 iip->ili_format.ilf_asize = vecp->i_len;
482 vecp++; 482 vecp++;
483 nvecs++; 483 nvecs++;
@@ -492,7 +492,7 @@ xfs_inode_item_format(
492 ASSERT(ip->i_afp->if_broot != NULL); 492 ASSERT(ip->i_afp->if_broot != NULL);
493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; 493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
494 vecp->i_len = ip->i_afp->if_broot_bytes; 494 vecp->i_len = ip->i_afp->if_broot_bytes;
495 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT); 495 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
496 vecp++; 496 vecp++;
497 nvecs++; 497 nvecs++;
498 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 498 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
@@ -516,7 +516,7 @@ xfs_inode_item_format(
516 ASSERT((ip->i_afp->if_real_bytes == 0) || 516 ASSERT((ip->i_afp->if_real_bytes == 0) ||
517 (ip->i_afp->if_real_bytes == data_bytes)); 517 (ip->i_afp->if_real_bytes == data_bytes));
518 vecp->i_len = (int)data_bytes; 518 vecp->i_len = (int)data_bytes;
519 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL); 519 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
520 vecp++; 520 vecp++;
521 nvecs++; 521 nvecs++;
522 iip->ili_format.ilf_asize = (unsigned)data_bytes; 522 iip->ili_format.ilf_asize = (unsigned)data_bytes;
@@ -602,33 +602,20 @@ xfs_inode_item_trylock(
602 602
603 if (!xfs_iflock_nowait(ip)) { 603 if (!xfs_iflock_nowait(ip)) {
604 /* 604 /*
605 * If someone else isn't already trying to push the inode 605 * inode has already been flushed to the backing buffer,
606 * buffer, we get to do it. 606 * leave it locked in shared mode, pushbuf routine will
607 * unlock it.
607 */ 608 */
608 if (iip->ili_pushbuf_flag == 0) { 609 return XFS_ITEM_PUSHBUF;
609 iip->ili_pushbuf_flag = 1;
610#ifdef DEBUG
611 iip->ili_push_owner = current_pid();
612#endif
613 /*
614 * Inode is left locked in shared mode.
615 * Pushbuf routine gets to unlock it.
616 */
617 return XFS_ITEM_PUSHBUF;
618 } else {
619 /*
620 * We hold the AIL lock, so we must specify the
621 * NONOTIFY flag so that we won't double trip.
622 */
623 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
624 return XFS_ITEM_FLUSHING;
625 }
626 /* NOTREACHED */
627 } 610 }
628 611
629 /* Stale items should force out the iclog */ 612 /* Stale items should force out the iclog */
630 if (ip->i_flags & XFS_ISTALE) { 613 if (ip->i_flags & XFS_ISTALE) {
631 xfs_ifunlock(ip); 614 xfs_ifunlock(ip);
615 /*
616 * we hold the AIL lock - notify the unlock routine of this
617 * so it doesn't try to get the lock again.
618 */
632 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); 619 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
633 return XFS_ITEM_PINNED; 620 return XFS_ITEM_PINNED;
634 } 621 }
@@ -746,11 +733,8 @@ xfs_inode_item_committed(
746 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK 733 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
747 * failed to get the inode flush lock but did get the inode locked SHARED. 734 * failed to get the inode flush lock but did get the inode locked SHARED.
748 * Here we're trying to see if the inode buffer is incore, and if so whether it's 735 * Here we're trying to see if the inode buffer is incore, and if so whether it's
749 * marked delayed write. If that's the case, we'll initiate a bawrite on that 736 * marked delayed write. If that's the case, we'll promote it and that will
750 * buffer to expedite the process. 737 * allow the caller to write the buffer by triggering the xfsbufd to run.
751 *
752 * We aren't holding the AIL lock (or the flush lock) when this gets called,
753 * so it is inherently race-y.
754 */ 738 */
755STATIC void 739STATIC void
756xfs_inode_item_pushbuf( 740xfs_inode_item_pushbuf(
@@ -759,82 +743,30 @@ xfs_inode_item_pushbuf(
759 xfs_inode_t *ip; 743 xfs_inode_t *ip;
760 xfs_mount_t *mp; 744 xfs_mount_t *mp;
761 xfs_buf_t *bp; 745 xfs_buf_t *bp;
762 uint dopush;
763 746
764 ip = iip->ili_inode; 747 ip = iip->ili_inode;
765
766 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 748 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
767 749
768 /* 750 /*
769 * The ili_pushbuf_flag keeps others from
770 * trying to duplicate our effort.
771 */
772 ASSERT(iip->ili_pushbuf_flag != 0);
773 ASSERT(iip->ili_push_owner == current_pid());
774
775 /*
776 * If a flush is not in progress anymore, chances are that the 751 * If a flush is not in progress anymore, chances are that the
777 * inode was taken off the AIL. So, just get out. 752 * inode was taken off the AIL. So, just get out.
778 */ 753 */
779 if (completion_done(&ip->i_flush) || 754 if (completion_done(&ip->i_flush) ||
780 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 755 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
781 iip->ili_pushbuf_flag = 0;
782 xfs_iunlock(ip, XFS_ILOCK_SHARED); 756 xfs_iunlock(ip, XFS_ILOCK_SHARED);
783 return; 757 return;
784 } 758 }
785 759
786 mp = ip->i_mount; 760 mp = ip->i_mount;
787 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, 761 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
788 iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); 762 iip->ili_format.ilf_len, XBF_TRYLOCK);
789 763
790 if (bp != NULL) {
791 if (XFS_BUF_ISDELAYWRITE(bp)) {
792 /*
793 * We were racing with iflush because we don't hold
794 * the AIL lock or the flush lock. However, at this point,
795 * we have the buffer, and we know that it's dirty.
796 * So, it's possible that iflush raced with us, and
797 * this item is already taken off the AIL.
798 * If not, we can flush it async.
799 */
800 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
801 !completion_done(&ip->i_flush));
802 iip->ili_pushbuf_flag = 0;
803 xfs_iunlock(ip, XFS_ILOCK_SHARED);
804
805 trace_xfs_inode_item_push(bp, _RET_IP_);
806
807 if (XFS_BUF_ISPINNED(bp)) {
808 xfs_log_force(mp, (xfs_lsn_t)0,
809 XFS_LOG_FORCE);
810 }
811 if (dopush) {
812 int error;
813 error = xfs_bawrite(mp, bp);
814 if (error)
815 xfs_fs_cmn_err(CE_WARN, mp,
816 "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
817 error, iip, bp);
818 } else {
819 xfs_buf_relse(bp);
820 }
821 } else {
822 iip->ili_pushbuf_flag = 0;
823 xfs_iunlock(ip, XFS_ILOCK_SHARED);
824 xfs_buf_relse(bp);
825 }
826 return;
827 }
828 /*
829 * We have to be careful about resetting pushbuf flag too early (above).
830 * Even though in theory we can do it as soon as we have the buflock,
831 * we don't want others to be doing work needlessly. They'll come to
832 * this function thinking that pushing the buffer is their
833 * responsibility only to find that the buffer is still locked by
834 * another doing the same thing
835 */
836 iip->ili_pushbuf_flag = 0;
837 xfs_iunlock(ip, XFS_ILOCK_SHARED); 764 xfs_iunlock(ip, XFS_ILOCK_SHARED);
765 if (!bp)
766 return;
767 if (XFS_BUF_ISDELAYWRITE(bp))
768 xfs_buf_delwri_promote(bp);
769 xfs_buf_relse(bp);
838 return; 770 return;
839} 771}
840 772
@@ -867,10 +799,14 @@ xfs_inode_item_push(
867 iip->ili_format.ilf_fields != 0); 799 iip->ili_format.ilf_fields != 0);
868 800
869 /* 801 /*
870 * Write out the inode. The completion routine ('iflush_done') will 802 * Push the inode to it's backing buffer. This will not remove the
871 * pull it from the AIL, mark it clean, unlock the flush lock. 803 * inode from the AIL - a further push will be required to trigger a
804 * buffer push. However, this allows all the dirty inodes to be pushed
805 * to the buffer before it is pushed to disk. THe buffer IO completion
806 * will pull th einode from the AIL, mark it clean and unlock the flush
807 * lock.
872 */ 808 */
873 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); 809 (void) xfs_iflush(ip, 0);
874 xfs_iunlock(ip, XFS_ILOCK_SHARED); 810 xfs_iunlock(ip, XFS_ILOCK_SHARED);
875 811
876 return; 812 return;
@@ -934,7 +870,6 @@ xfs_inode_item_init(
934 /* 870 /*
935 We have zeroed memory. No need ... 871 We have zeroed memory. No need ...
936 iip->ili_extents_buf = NULL; 872 iip->ili_extents_buf = NULL;
937 iip->ili_pushbuf_flag = 0;
938 */ 873 */
939 874
940 iip->ili_format.ilf_type = XFS_LI_INODE; 875 iip->ili_format.ilf_type = XFS_LI_INODE;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index cc8df1ac7783..9a467958ecdd 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -144,12 +144,6 @@ typedef struct xfs_inode_log_item {
144 data exts */ 144 data exts */
145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged 145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
146 attr exts */ 146 attr exts */
147 unsigned int ili_pushbuf_flag; /* one bit used in push_ail */
148
149#ifdef DEBUG
150 uint64_t ili_push_owner; /* one who sets pushbuf_flag
151 above gets to push the buf */
152#endif
153#ifdef XFS_TRANS_DEBUG 147#ifdef XFS_TRANS_DEBUG
154 int ili_root_size; 148 int ili_root_size;
155 char *ili_orig_root; 149 char *ili_orig_root;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 62efab2f3839..3af02314c605 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -408,8 +408,10 @@ xfs_bulkstat(
408 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); 408 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
409 nimask = ~(nicluster - 1); 409 nimask = ~(nicluster - 1);
410 nbcluster = nicluster >> mp->m_sb.sb_inopblog; 410 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
411 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4, 411 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
412 KM_SLEEP | KM_MAYFAIL | KM_LARGE); 412 if (!irbuf)
413 return ENOMEM;
414
413 nirbuf = irbsize / sizeof(*irbuf); 415 nirbuf = irbsize / sizeof(*irbuf);
414 416
415 /* 417 /*
@@ -420,9 +422,7 @@ xfs_bulkstat(
420 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 422 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
421 cond_resched(); 423 cond_resched();
422 bp = NULL; 424 bp = NULL;
423 down_read(&mp->m_peraglock);
424 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 425 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
425 up_read(&mp->m_peraglock);
426 if (error) { 426 if (error) {
427 /* 427 /*
428 * Skip this allocation group and go to the next one. 428 * Skip this allocation group and go to the next one.
@@ -729,7 +729,7 @@ xfs_bulkstat(
729 /* 729 /*
730 * Done, we're either out of filesystem or space to put the data. 730 * Done, we're either out of filesystem or space to put the data.
731 */ 731 */
732 kmem_free(irbuf); 732 kmem_free_large(irbuf);
733 *ubcountp = ubelem; 733 *ubcountp = ubelem;
734 /* 734 /*
735 * Found some inodes, return them now and return the error next time. 735 * Found some inodes, return them now and return the error next time.
@@ -849,9 +849,7 @@ xfs_inumbers(
849 agbp = NULL; 849 agbp = NULL;
850 while (left > 0 && agno < mp->m_sb.sb_agcount) { 850 while (left > 0 && agno < mp->m_sb.sb_agcount) {
851 if (agbp == NULL) { 851 if (agbp == NULL) {
852 down_read(&mp->m_peraglock);
853 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 852 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
854 up_read(&mp->m_peraglock);
855 if (error) { 853 if (error) {
856 /* 854 /*
857 * If we can't read the AGI of this ag, 855 * If we can't read the AGI of this ag,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 600b5b06aaeb..4f16be4b6ee5 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -50,7 +50,6 @@ kmem_zone_t *xfs_log_ticket_zone;
50 (off) += (bytes);} 50 (off) += (bytes);}
51 51
52/* Local miscellaneous function prototypes */ 52/* Local miscellaneous function prototypes */
53STATIC int xlog_bdstrat_cb(struct xfs_buf *);
54STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
55 xlog_in_core_t **, xfs_lsn_t *); 54 xlog_in_core_t **, xfs_lsn_t *);
56STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
@@ -80,11 +79,6 @@ STATIC int xlog_state_release_iclog(xlog_t *log,
80STATIC void xlog_state_switch_iclogs(xlog_t *log, 79STATIC void xlog_state_switch_iclogs(xlog_t *log,
81 xlog_in_core_t *iclog, 80 xlog_in_core_t *iclog,
82 int eventual_size); 81 int eventual_size);
83STATIC int xlog_state_sync(xlog_t *log,
84 xfs_lsn_t lsn,
85 uint flags,
86 int *log_flushed);
87STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
88STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); 82STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
89 83
90/* local functions to manipulate grant head */ 84/* local functions to manipulate grant head */
@@ -297,65 +291,6 @@ xfs_log_done(xfs_mount_t *mp,
297 return lsn; 291 return lsn;
298} /* xfs_log_done */ 292} /* xfs_log_done */
299 293
300
301/*
302 * Force the in-core log to disk. If flags == XFS_LOG_SYNC,
303 * the force is done synchronously.
304 *
305 * Asynchronous forces are implemented by setting the WANT_SYNC
306 * bit in the appropriate in-core log and then returning.
307 *
308 * Synchronous forces are implemented with a signal variable. All callers
309 * to force a given lsn to disk will wait on a the sv attached to the
310 * specific in-core log. When given in-core log finally completes its
311 * write to disk, that thread will wake up all threads waiting on the
312 * sv.
313 */
314int
315_xfs_log_force(
316 xfs_mount_t *mp,
317 xfs_lsn_t lsn,
318 uint flags,
319 int *log_flushed)
320{
321 xlog_t *log = mp->m_log;
322 int dummy;
323
324 if (!log_flushed)
325 log_flushed = &dummy;
326
327 ASSERT(flags & XFS_LOG_FORCE);
328
329 XFS_STATS_INC(xs_log_force);
330
331 if (log->l_flags & XLOG_IO_ERROR)
332 return XFS_ERROR(EIO);
333 if (lsn == 0)
334 return xlog_state_sync_all(log, flags, log_flushed);
335 else
336 return xlog_state_sync(log, lsn, flags, log_flushed);
337} /* _xfs_log_force */
338
339/*
340 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
341 * about errors or whether the log was flushed or not. This is the normal
342 * interface to use when trying to unpin items or move the log forward.
343 */
344void
345xfs_log_force(
346 xfs_mount_t *mp,
347 xfs_lsn_t lsn,
348 uint flags)
349{
350 int error;
351 error = _xfs_log_force(mp, lsn, flags, NULL);
352 if (error) {
353 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
354 "error %d returned.", error);
355 }
356}
357
358
359/* 294/*
360 * Attaches a new iclog I/O completion callback routine during 295 * Attaches a new iclog I/O completion callback routine during
361 * transaction commit. If the log is in error state, a non-zero 296 * transaction commit. If the log is in error state, a non-zero
@@ -602,7 +537,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
602 if (mp->m_flags & XFS_MOUNT_RDONLY) 537 if (mp->m_flags & XFS_MOUNT_RDONLY)
603 return 0; 538 return 0;
604 539
605 error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); 540 error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
606 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); 541 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
607 542
608#ifdef DEBUG 543#ifdef DEBUG
@@ -618,7 +553,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
618 if (! (XLOG_FORCED_SHUTDOWN(log))) { 553 if (! (XLOG_FORCED_SHUTDOWN(log))) {
619 reg[0].i_addr = (void*)&magic; 554 reg[0].i_addr = (void*)&magic;
620 reg[0].i_len = sizeof(magic); 555 reg[0].i_len = sizeof(magic);
621 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT); 556 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
622 557
623 error = xfs_log_reserve(mp, 600, 1, &tic, 558 error = xfs_log_reserve(mp, 600, 1, &tic,
624 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 559 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
@@ -988,35 +923,6 @@ xlog_iodone(xfs_buf_t *bp)
988} /* xlog_iodone */ 923} /* xlog_iodone */
989 924
990/* 925/*
991 * The bdstrat callback function for log bufs. This gives us a central
992 * place to trap bufs in case we get hit by a log I/O error and need to
993 * shutdown. Actually, in practice, even when we didn't get a log error,
994 * we transition the iclogs to IOERROR state *after* flushing all existing
995 * iclogs to disk. This is because we don't want anymore new transactions to be
996 * started or completed afterwards.
997 */
998STATIC int
999xlog_bdstrat_cb(struct xfs_buf *bp)
1000{
1001 xlog_in_core_t *iclog;
1002
1003 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1004
1005 if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
1006 /* note for irix bstrat will need struct bdevsw passed
1007 * Fix the following macro if the code ever is merged
1008 */
1009 XFS_bdstrat(bp);
1010 return 0;
1011 }
1012
1013 XFS_BUF_ERROR(bp, EIO);
1014 XFS_BUF_STALE(bp);
1015 xfs_biodone(bp);
1016 return XFS_ERROR(EIO);
1017}
1018
1019/*
1020 * Return size of each in-core log record buffer. 926 * Return size of each in-core log record buffer.
1021 * 927 *
1022 * All machines get 8 x 32kB buffers by default, unless tuned otherwise. 928 * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
@@ -1158,7 +1064,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1158 if (!bp) 1064 if (!bp)
1159 goto out_free_log; 1065 goto out_free_log;
1160 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1066 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1161 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1162 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1067 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1163 ASSERT(XFS_BUF_ISBUSY(bp)); 1068 ASSERT(XFS_BUF_ISBUSY(bp));
1164 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 1069 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
@@ -1196,7 +1101,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1196 if (!XFS_BUF_CPSEMA(bp)) 1101 if (!XFS_BUF_CPSEMA(bp))
1197 ASSERT(0); 1102 ASSERT(0);
1198 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1103 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1199 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1200 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1104 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1201 iclog->ic_bp = bp; 1105 iclog->ic_bp = bp;
1202 iclog->ic_data = bp->b_addr; 1106 iclog->ic_data = bp->b_addr;
@@ -1268,7 +1172,7 @@ xlog_commit_record(xfs_mount_t *mp,
1268 1172
1269 reg[0].i_addr = NULL; 1173 reg[0].i_addr = NULL;
1270 reg[0].i_len = 0; 1174 reg[0].i_len = 0;
1271 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT); 1175 reg[0].i_type = XLOG_REG_TYPE_COMMIT;
1272 1176
1273 ASSERT_ALWAYS(iclog); 1177 ASSERT_ALWAYS(iclog);
1274 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1178 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
@@ -1343,6 +1247,37 @@ xlog_grant_push_ail(xfs_mount_t *mp,
1343 xfs_trans_ail_push(log->l_ailp, threshold_lsn); 1247 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1344} /* xlog_grant_push_ail */ 1248} /* xlog_grant_push_ail */
1345 1249
1250/*
1251 * The bdstrat callback function for log bufs. This gives us a central
1252 * place to trap bufs in case we get hit by a log I/O error and need to
1253 * shutdown. Actually, in practice, even when we didn't get a log error,
1254 * we transition the iclogs to IOERROR state *after* flushing all existing
1255 * iclogs to disk. This is because we don't want anymore new transactions to be
1256 * started or completed afterwards.
1257 */
1258STATIC int
1259xlog_bdstrat(
1260 struct xfs_buf *bp)
1261{
1262 struct xlog_in_core *iclog;
1263
1264 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1265 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1266 XFS_BUF_ERROR(bp, EIO);
1267 XFS_BUF_STALE(bp);
1268 xfs_biodone(bp);
1269 /*
1270 * It would seem logical to return EIO here, but we rely on
1271 * the log state machine to propagate I/O errors instead of
1272 * doing it here.
1273 */
1274 return 0;
1275 }
1276
1277 bp->b_flags |= _XBF_RUN_QUEUES;
1278 xfs_buf_iorequest(bp);
1279 return 0;
1280}
1346 1281
1347/* 1282/*
1348 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 1283 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
@@ -1462,7 +1397,7 @@ xlog_sync(xlog_t *log,
1462 */ 1397 */
1463 XFS_BUF_WRITE(bp); 1398 XFS_BUF_WRITE(bp);
1464 1399
1465 if ((error = XFS_bwrite(bp))) { 1400 if ((error = xlog_bdstrat(bp))) {
1466 xfs_ioerror_alert("xlog_sync", log->l_mp, bp, 1401 xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
1467 XFS_BUF_ADDR(bp)); 1402 XFS_BUF_ADDR(bp));
1468 return error; 1403 return error;
@@ -1502,7 +1437,7 @@ xlog_sync(xlog_t *log,
1502 /* account for internal log which doesn't start at block #0 */ 1437 /* account for internal log which doesn't start at block #0 */
1503 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); 1438 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1504 XFS_BUF_WRITE(bp); 1439 XFS_BUF_WRITE(bp);
1505 if ((error = XFS_bwrite(bp))) { 1440 if ((error = xlog_bdstrat(bp))) {
1506 xfs_ioerror_alert("xlog_sync (split)", log->l_mp, 1441 xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
1507 bp, XFS_BUF_ADDR(bp)); 1442 bp, XFS_BUF_ADDR(bp));
1508 return error; 1443 return error;
@@ -2854,7 +2789,6 @@ xlog_state_switch_iclogs(xlog_t *log,
2854 log->l_iclog = iclog->ic_next; 2789 log->l_iclog = iclog->ic_next;
2855} /* xlog_state_switch_iclogs */ 2790} /* xlog_state_switch_iclogs */
2856 2791
2857
2858/* 2792/*
2859 * Write out all data in the in-core log as of this exact moment in time. 2793 * Write out all data in the in-core log as of this exact moment in time.
2860 * 2794 *
@@ -2882,11 +2816,17 @@ xlog_state_switch_iclogs(xlog_t *log,
2882 * b) when we return from flushing out this iclog, it is still 2816 * b) when we return from flushing out this iclog, it is still
2883 * not in the active nor dirty state. 2817 * not in the active nor dirty state.
2884 */ 2818 */
2885STATIC int 2819int
2886xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) 2820_xfs_log_force(
2821 struct xfs_mount *mp,
2822 uint flags,
2823 int *log_flushed)
2887{ 2824{
2888 xlog_in_core_t *iclog; 2825 struct log *log = mp->m_log;
2889 xfs_lsn_t lsn; 2826 struct xlog_in_core *iclog;
2827 xfs_lsn_t lsn;
2828
2829 XFS_STATS_INC(xs_log_force);
2890 2830
2891 spin_lock(&log->l_icloglock); 2831 spin_lock(&log->l_icloglock);
2892 2832
@@ -2932,7 +2872,9 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
2932 2872
2933 if (xlog_state_release_iclog(log, iclog)) 2873 if (xlog_state_release_iclog(log, iclog))
2934 return XFS_ERROR(EIO); 2874 return XFS_ERROR(EIO);
2935 *log_flushed = 1; 2875
2876 if (log_flushed)
2877 *log_flushed = 1;
2936 spin_lock(&log->l_icloglock); 2878 spin_lock(&log->l_icloglock);
2937 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && 2879 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
2938 iclog->ic_state != XLOG_STATE_DIRTY) 2880 iclog->ic_state != XLOG_STATE_DIRTY)
@@ -2976,19 +2918,37 @@ maybe_sleep:
2976 */ 2918 */
2977 if (iclog->ic_state & XLOG_STATE_IOERROR) 2919 if (iclog->ic_state & XLOG_STATE_IOERROR)
2978 return XFS_ERROR(EIO); 2920 return XFS_ERROR(EIO);
2979 *log_flushed = 1; 2921 if (log_flushed)
2980 2922 *log_flushed = 1;
2981 } else { 2923 } else {
2982 2924
2983no_sleep: 2925no_sleep:
2984 spin_unlock(&log->l_icloglock); 2926 spin_unlock(&log->l_icloglock);
2985 } 2927 }
2986 return 0; 2928 return 0;
2987} /* xlog_state_sync_all */ 2929}
2988 2930
2931/*
2932 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
2933 * about errors or whether the log was flushed or not. This is the normal
2934 * interface to use when trying to unpin items or move the log forward.
2935 */
2936void
2937xfs_log_force(
2938 xfs_mount_t *mp,
2939 uint flags)
2940{
2941 int error;
2942
2943 error = _xfs_log_force(mp, flags, NULL);
2944 if (error) {
2945 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
2946 "error %d returned.", error);
2947 }
2948}
2989 2949
2990/* 2950/*
2991 * Used by code which implements synchronous log forces. 2951 * Force the in-core log to disk for a specific LSN.
2992 * 2952 *
2993 * Find in-core log with lsn. 2953 * Find in-core log with lsn.
2994 * If it is in the DIRTY state, just return. 2954 * If it is in the DIRTY state, just return.
@@ -2996,109 +2956,142 @@ no_sleep:
2996 * state and go to sleep or return. 2956 * state and go to sleep or return.
2997 * If it is in any other state, go to sleep or return. 2957 * If it is in any other state, go to sleep or return.
2998 * 2958 *
2999 * If filesystem activity goes to zero, the iclog will get flushed only by 2959 * Synchronous forces are implemented with a signal variable. All callers
3000 * bdflush(). 2960 * to force a given lsn to disk will wait on a the sv attached to the
2961 * specific in-core log. When given in-core log finally completes its
2962 * write to disk, that thread will wake up all threads waiting on the
2963 * sv.
3001 */ 2964 */
3002STATIC int 2965int
3003xlog_state_sync(xlog_t *log, 2966_xfs_log_force_lsn(
3004 xfs_lsn_t lsn, 2967 struct xfs_mount *mp,
3005 uint flags, 2968 xfs_lsn_t lsn,
3006 int *log_flushed) 2969 uint flags,
2970 int *log_flushed)
3007{ 2971{
3008 xlog_in_core_t *iclog; 2972 struct log *log = mp->m_log;
3009 int already_slept = 0; 2973 struct xlog_in_core *iclog;
2974 int already_slept = 0;
3010 2975
3011try_again: 2976 ASSERT(lsn != 0);
3012 spin_lock(&log->l_icloglock);
3013 iclog = log->l_iclog;
3014 2977
3015 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2978 XFS_STATS_INC(xs_log_force);
3016 spin_unlock(&log->l_icloglock);
3017 return XFS_ERROR(EIO);
3018 }
3019
3020 do {
3021 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3022 iclog = iclog->ic_next;
3023 continue;
3024 }
3025 2979
3026 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2980try_again:
2981 spin_lock(&log->l_icloglock);
2982 iclog = log->l_iclog;
2983 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3027 spin_unlock(&log->l_icloglock); 2984 spin_unlock(&log->l_icloglock);
3028 return 0; 2985 return XFS_ERROR(EIO);
3029 } 2986 }
3030 2987
3031 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 2988 do {
3032 /* 2989 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3033 * We sleep here if we haven't already slept (e.g. 2990 iclog = iclog->ic_next;
3034 * this is the first time we've looked at the correct 2991 continue;
3035 * iclog buf) and the buffer before us is going to 2992 }
3036 * be sync'ed. The reason for this is that if we 2993
3037 * are doing sync transactions here, by waiting for 2994 if (iclog->ic_state == XLOG_STATE_DIRTY) {
3038 * the previous I/O to complete, we can allow a few 2995 spin_unlock(&log->l_icloglock);
3039 * more transactions into this iclog before we close 2996 return 0;
3040 * it down. 2997 }
3041 * 2998
3042 * Otherwise, we mark the buffer WANT_SYNC, and bump 2999 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3043 * up the refcnt so we can release the log (which drops 3000 /*
3044 * the ref count). The state switch keeps new transaction 3001 * We sleep here if we haven't already slept (e.g.
3045 * commits from using this buffer. When the current commits 3002 * this is the first time we've looked at the correct
3046 * finish writing into the buffer, the refcount will drop to 3003 * iclog buf) and the buffer before us is going to
3047 * zero and the buffer will go out then. 3004 * be sync'ed. The reason for this is that if we
3048 */ 3005 * are doing sync transactions here, by waiting for
3049 if (!already_slept && 3006 * the previous I/O to complete, we can allow a few
3050 (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | 3007 * more transactions into this iclog before we close
3051 XLOG_STATE_SYNCING))) { 3008 * it down.
3052 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3009 *
3053 XFS_STATS_INC(xs_log_force_sleep); 3010 * Otherwise, we mark the buffer WANT_SYNC, and bump
3054 sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, 3011 * up the refcnt so we can release the log (which
3055 &log->l_icloglock, s); 3012 * drops the ref count). The state switch keeps new
3056 *log_flushed = 1; 3013 * transaction commits from using this buffer. When
3057 already_slept = 1; 3014 * the current commits finish writing into the buffer,
3058 goto try_again; 3015 * the refcount will drop to zero and the buffer will
3059 } else { 3016 * go out then.
3017 */
3018 if (!already_slept &&
3019 (iclog->ic_prev->ic_state &
3020 (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
3021 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
3022
3023 XFS_STATS_INC(xs_log_force_sleep);
3024
3025 sv_wait(&iclog->ic_prev->ic_write_wait,
3026 PSWP, &log->l_icloglock, s);
3027 if (log_flushed)
3028 *log_flushed = 1;
3029 already_slept = 1;
3030 goto try_again;
3031 }
3060 atomic_inc(&iclog->ic_refcnt); 3032 atomic_inc(&iclog->ic_refcnt);
3061 xlog_state_switch_iclogs(log, iclog, 0); 3033 xlog_state_switch_iclogs(log, iclog, 0);
3062 spin_unlock(&log->l_icloglock); 3034 spin_unlock(&log->l_icloglock);
3063 if (xlog_state_release_iclog(log, iclog)) 3035 if (xlog_state_release_iclog(log, iclog))
3064 return XFS_ERROR(EIO); 3036 return XFS_ERROR(EIO);
3065 *log_flushed = 1; 3037 if (log_flushed)
3038 *log_flushed = 1;
3066 spin_lock(&log->l_icloglock); 3039 spin_lock(&log->l_icloglock);
3067 } 3040 }
3068 }
3069 3041
3070 if ((flags & XFS_LOG_SYNC) && /* sleep */ 3042 if ((flags & XFS_LOG_SYNC) && /* sleep */
3071 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { 3043 !(iclog->ic_state &
3044 (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
3045 /*
3046 * Don't wait on completion if we know that we've
3047 * gotten a log write error.
3048 */
3049 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3050 spin_unlock(&log->l_icloglock);
3051 return XFS_ERROR(EIO);
3052 }
3053 XFS_STATS_INC(xs_log_force_sleep);
3054 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3055 /*
3056 * No need to grab the log lock here since we're
3057 * only deciding whether or not to return EIO
3058 * and the memory read should be atomic.
3059 */
3060 if (iclog->ic_state & XLOG_STATE_IOERROR)
3061 return XFS_ERROR(EIO);
3072 3062
3073 /* 3063 if (log_flushed)
3074 * Don't wait on completion if we know that we've 3064 *log_flushed = 1;
3075 * gotten a log write error. 3065 } else { /* just return */
3076 */
3077 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3078 spin_unlock(&log->l_icloglock); 3066 spin_unlock(&log->l_icloglock);
3079 return XFS_ERROR(EIO);
3080 } 3067 }
3081 XFS_STATS_INC(xs_log_force_sleep);
3082 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3083 /*
3084 * No need to grab the log lock here since we're
3085 * only deciding whether or not to return EIO
3086 * and the memory read should be atomic.
3087 */
3088 if (iclog->ic_state & XLOG_STATE_IOERROR)
3089 return XFS_ERROR(EIO);
3090 *log_flushed = 1;
3091 } else { /* just return */
3092 spin_unlock(&log->l_icloglock);
3093 }
3094 return 0;
3095 3068
3096 } while (iclog != log->l_iclog); 3069 return 0;
3070 } while (iclog != log->l_iclog);
3097 3071
3098 spin_unlock(&log->l_icloglock); 3072 spin_unlock(&log->l_icloglock);
3099 return 0; 3073 return 0;
3100} /* xlog_state_sync */ 3074}
3101 3075
3076/*
3077 * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
3078 * about errors or whether the log was flushed or not. This is the normal
3079 * interface to use when trying to unpin items or move the log forward.
3080 */
3081void
3082xfs_log_force_lsn(
3083 xfs_mount_t *mp,
3084 xfs_lsn_t lsn,
3085 uint flags)
3086{
3087 int error;
3088
3089 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3090 if (error) {
3091 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
3092 "error %d returned.", error);
3093 }
3094}
3102 3095
3103/* 3096/*
3104 * Called when we want to mark the current iclog as being ready to sync to 3097 * Called when we want to mark the current iclog as being ready to sync to
@@ -3463,7 +3456,6 @@ xfs_log_force_umount(
3463 xlog_ticket_t *tic; 3456 xlog_ticket_t *tic;
3464 xlog_t *log; 3457 xlog_t *log;
3465 int retval; 3458 int retval;
3466 int dummy;
3467 3459
3468 log = mp->m_log; 3460 log = mp->m_log;
3469 3461
@@ -3537,13 +3529,14 @@ xfs_log_force_umount(
3537 } 3529 }
3538 spin_unlock(&log->l_grant_lock); 3530 spin_unlock(&log->l_grant_lock);
3539 3531
3540 if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3532 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3541 ASSERT(!logerror); 3533 ASSERT(!logerror);
3542 /* 3534 /*
3543 * Force the incore logs to disk before shutting the 3535 * Force the incore logs to disk before shutting the
3544 * log down completely. 3536 * log down completely.
3545 */ 3537 */
3546 xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); 3538 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3539
3547 spin_lock(&log->l_icloglock); 3540 spin_lock(&log->l_icloglock);
3548 retval = xlog_state_ioerror(log); 3541 retval = xlog_state_ioerror(log);
3549 spin_unlock(&log->l_icloglock); 3542 spin_unlock(&log->l_icloglock);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d0c9baa50b1a..7074be9d13e9 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -70,14 +70,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
70 * Flags to xfs_log_force() 70 * Flags to xfs_log_force()
71 * 71 *
72 * XFS_LOG_SYNC: Synchronous force in-core log to disk 72 * XFS_LOG_SYNC: Synchronous force in-core log to disk
73 * XFS_LOG_FORCE: Start in-core log write now.
74 * XFS_LOG_URGE: Start write within some window of time.
75 *
76 * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
77 */ 73 */
78#define XFS_LOG_SYNC 0x1 74#define XFS_LOG_SYNC 0x1
79#define XFS_LOG_FORCE 0x2
80#define XFS_LOG_URGE 0x4
81 75
82#endif /* __KERNEL__ */ 76#endif /* __KERNEL__ */
83 77
@@ -110,10 +104,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
110#define XLOG_REG_TYPE_TRANSHDR 19 104#define XLOG_REG_TYPE_TRANSHDR 19
111#define XLOG_REG_TYPE_MAX 19 105#define XLOG_REG_TYPE_MAX 19
112 106
113#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
114
115typedef struct xfs_log_iovec { 107typedef struct xfs_log_iovec {
116 xfs_caddr_t i_addr; /* beginning address of region */ 108 xfs_caddr_t i_addr; /* beginning address of region */
117 int i_len; /* length in bytes of region */ 109 int i_len; /* length in bytes of region */
118 uint i_type; /* type of region */ 110 uint i_type; /* type of region */
119} xfs_log_iovec_t; 111} xfs_log_iovec_t;
@@ -140,12 +132,17 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
140 void **iclog, 132 void **iclog,
141 uint flags); 133 uint flags);
142int _xfs_log_force(struct xfs_mount *mp, 134int _xfs_log_force(struct xfs_mount *mp,
143 xfs_lsn_t lsn,
144 uint flags, 135 uint flags,
145 int *log_forced); 136 int *log_forced);
146void xfs_log_force(struct xfs_mount *mp, 137void xfs_log_force(struct xfs_mount *mp,
147 xfs_lsn_t lsn,
148 uint flags); 138 uint flags);
139int _xfs_log_force_lsn(struct xfs_mount *mp,
140 xfs_lsn_t lsn,
141 uint flags,
142 int *log_forced);
143void xfs_log_force_lsn(struct xfs_mount *mp,
144 xfs_lsn_t lsn,
145 uint flags);
149int xfs_log_mount(struct xfs_mount *mp, 146int xfs_log_mount(struct xfs_mount *mp,
150 struct xfs_buftarg *log_target, 147 struct xfs_buftarg *log_target,
151 xfs_daddr_t start_block, 148 xfs_daddr_t start_block,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d55662db7077..fd02a18facd5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -443,14 +443,9 @@ typedef struct log {
443 443
444/* common routines */ 444/* common routines */
445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
446extern int xlog_find_tail(xlog_t *log,
447 xfs_daddr_t *head_blk,
448 xfs_daddr_t *tail_blk);
449extern int xlog_recover(xlog_t *log); 446extern int xlog_recover(xlog_t *log);
450extern int xlog_recover_finish(xlog_t *log); 447extern int xlog_recover_finish(xlog_t *log);
451extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
452extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
453extern void xlog_put_bp(struct xfs_buf *);
454 449
455extern kmem_zone_t *xfs_log_ticket_zone; 450extern kmem_zone_t *xfs_log_ticket_zone;
456 451
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 69ac2e5ef20c..22e6efdc17ea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -50,8 +50,6 @@
50 50
51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); 51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); 52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
53STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
54 xlog_recover_item_t *item);
55#if defined(DEBUG) 53#if defined(DEBUG)
56STATIC void xlog_recover_check_summary(xlog_t *); 54STATIC void xlog_recover_check_summary(xlog_t *);
57#else 55#else
@@ -68,7 +66,7 @@ STATIC void xlog_recover_check_summary(xlog_t *);
68 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
69#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
70 68
71xfs_buf_t * 69STATIC xfs_buf_t *
72xlog_get_bp( 70xlog_get_bp(
73 xlog_t *log, 71 xlog_t *log,
74 int nbblks) 72 int nbblks)
@@ -88,7 +86,7 @@ xlog_get_bp(
88 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
89} 87}
90 88
91void 89STATIC void
92xlog_put_bp( 90xlog_put_bp(
93 xfs_buf_t *bp) 91 xfs_buf_t *bp)
94{ 92{
@@ -805,7 +803,7 @@ xlog_find_head(
805 * We could speed up search by using current head_blk buffer, but it is not 803 * We could speed up search by using current head_blk buffer, but it is not
806 * available. 804 * available.
807 */ 805 */
808int 806STATIC int
809xlog_find_tail( 807xlog_find_tail(
810 xlog_t *log, 808 xlog_t *log,
811 xfs_daddr_t *head_blk, 809 xfs_daddr_t *head_blk,
@@ -1367,36 +1365,45 @@ xlog_clear_stale_blocks(
1367 1365
1368STATIC xlog_recover_t * 1366STATIC xlog_recover_t *
1369xlog_recover_find_tid( 1367xlog_recover_find_tid(
1370 xlog_recover_t *q, 1368 struct hlist_head *head,
1371 xlog_tid_t tid) 1369 xlog_tid_t tid)
1372{ 1370{
1373 xlog_recover_t *p = q; 1371 xlog_recover_t *trans;
1372 struct hlist_node *n;
1374 1373
1375 while (p != NULL) { 1374 hlist_for_each_entry(trans, n, head, r_list) {
1376 if (p->r_log_tid == tid) 1375 if (trans->r_log_tid == tid)
1377 break; 1376 return trans;
1378 p = p->r_next;
1379 } 1377 }
1380 return p; 1378 return NULL;
1381} 1379}
1382 1380
1383STATIC void 1381STATIC void
1384xlog_recover_put_hashq( 1382xlog_recover_new_tid(
1385 xlog_recover_t **q, 1383 struct hlist_head *head,
1386 xlog_recover_t *trans) 1384 xlog_tid_t tid,
1385 xfs_lsn_t lsn)
1387{ 1386{
1388 trans->r_next = *q; 1387 xlog_recover_t *trans;
1389 *q = trans; 1388
1389 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1390 trans->r_log_tid = tid;
1391 trans->r_lsn = lsn;
1392 INIT_LIST_HEAD(&trans->r_itemq);
1393
1394 INIT_HLIST_NODE(&trans->r_list);
1395 hlist_add_head(&trans->r_list, head);
1390} 1396}
1391 1397
1392STATIC void 1398STATIC void
1393xlog_recover_add_item( 1399xlog_recover_add_item(
1394 xlog_recover_item_t **itemq) 1400 struct list_head *head)
1395{ 1401{
1396 xlog_recover_item_t *item; 1402 xlog_recover_item_t *item;
1397 1403
1398 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 1404 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1399 xlog_recover_insert_item_backq(itemq, item); 1405 INIT_LIST_HEAD(&item->ri_list);
1406 list_add_tail(&item->ri_list, head);
1400} 1407}
1401 1408
1402STATIC int 1409STATIC int
@@ -1409,8 +1416,7 @@ xlog_recover_add_to_cont_trans(
1409 xfs_caddr_t ptr, old_ptr; 1416 xfs_caddr_t ptr, old_ptr;
1410 int old_len; 1417 int old_len;
1411 1418
1412 item = trans->r_itemq; 1419 if (list_empty(&trans->r_itemq)) {
1413 if (item == NULL) {
1414 /* finish copying rest of trans header */ 1420 /* finish copying rest of trans header */
1415 xlog_recover_add_item(&trans->r_itemq); 1421 xlog_recover_add_item(&trans->r_itemq);
1416 ptr = (xfs_caddr_t) &trans->r_theader + 1422 ptr = (xfs_caddr_t) &trans->r_theader +
@@ -1418,7 +1424,8 @@ xlog_recover_add_to_cont_trans(
1418 memcpy(ptr, dp, len); /* d, s, l */ 1424 memcpy(ptr, dp, len); /* d, s, l */
1419 return 0; 1425 return 0;
1420 } 1426 }
1421 item = item->ri_prev; 1427 /* take the tail entry */
1428 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1422 1429
1423 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 1430 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1424 old_len = item->ri_buf[item->ri_cnt-1].i_len; 1431 old_len = item->ri_buf[item->ri_cnt-1].i_len;
@@ -1455,8 +1462,7 @@ xlog_recover_add_to_trans(
1455 1462
1456 if (!len) 1463 if (!len)
1457 return 0; 1464 return 0;
1458 item = trans->r_itemq; 1465 if (list_empty(&trans->r_itemq)) {
1459 if (item == NULL) {
1460 /* we need to catch log corruptions here */ 1466 /* we need to catch log corruptions here */
1461 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1467 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1462 xlog_warn("XFS: xlog_recover_add_to_trans: " 1468 xlog_warn("XFS: xlog_recover_add_to_trans: "
@@ -1474,12 +1480,15 @@ xlog_recover_add_to_trans(
1474 memcpy(ptr, dp, len); 1480 memcpy(ptr, dp, len);
1475 in_f = (xfs_inode_log_format_t *)ptr; 1481 in_f = (xfs_inode_log_format_t *)ptr;
1476 1482
1477 if (item->ri_prev->ri_total != 0 && 1483 /* take the tail entry */
1478 item->ri_prev->ri_total == item->ri_prev->ri_cnt) { 1484 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1485 if (item->ri_total != 0 &&
1486 item->ri_total == item->ri_cnt) {
1487 /* tail item is in use, get a new one */
1479 xlog_recover_add_item(&trans->r_itemq); 1488 xlog_recover_add_item(&trans->r_itemq);
1489 item = list_entry(trans->r_itemq.prev,
1490 xlog_recover_item_t, ri_list);
1480 } 1491 }
1481 item = trans->r_itemq;
1482 item = item->ri_prev;
1483 1492
1484 if (item->ri_total == 0) { /* first region to be added */ 1493 if (item->ri_total == 0) { /* first region to be added */
1485 if (in_f->ilf_size == 0 || 1494 if (in_f->ilf_size == 0 ||
@@ -1504,96 +1513,29 @@ xlog_recover_add_to_trans(
1504 return 0; 1513 return 0;
1505} 1514}
1506 1515
1507STATIC void 1516/*
1508xlog_recover_new_tid( 1517 * Sort the log items in the transaction. Cancelled buffers need
1509 xlog_recover_t **q, 1518 * to be put first so they are processed before any items that might
1510 xlog_tid_t tid, 1519 * modify the buffers. If they are cancelled, then the modifications
1511 xfs_lsn_t lsn) 1520 * don't need to be replayed.
1512{ 1521 */
1513 xlog_recover_t *trans;
1514
1515 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1516 trans->r_log_tid = tid;
1517 trans->r_lsn = lsn;
1518 xlog_recover_put_hashq(q, trans);
1519}
1520
1521STATIC int
1522xlog_recover_unlink_tid(
1523 xlog_recover_t **q,
1524 xlog_recover_t *trans)
1525{
1526 xlog_recover_t *tp;
1527 int found = 0;
1528
1529 ASSERT(trans != NULL);
1530 if (trans == *q) {
1531 *q = (*q)->r_next;
1532 } else {
1533 tp = *q;
1534 while (tp) {
1535 if (tp->r_next == trans) {
1536 found = 1;
1537 break;
1538 }
1539 tp = tp->r_next;
1540 }
1541 if (!found) {
1542 xlog_warn(
1543 "XFS: xlog_recover_unlink_tid: trans not found");
1544 ASSERT(0);
1545 return XFS_ERROR(EIO);
1546 }
1547 tp->r_next = tp->r_next->r_next;
1548 }
1549 return 0;
1550}
1551
1552STATIC void
1553xlog_recover_insert_item_backq(
1554 xlog_recover_item_t **q,
1555 xlog_recover_item_t *item)
1556{
1557 if (*q == NULL) {
1558 item->ri_prev = item->ri_next = item;
1559 *q = item;
1560 } else {
1561 item->ri_next = *q;
1562 item->ri_prev = (*q)->ri_prev;
1563 (*q)->ri_prev = item;
1564 item->ri_prev->ri_next = item;
1565 }
1566}
1567
1568STATIC void
1569xlog_recover_insert_item_frontq(
1570 xlog_recover_item_t **q,
1571 xlog_recover_item_t *item)
1572{
1573 xlog_recover_insert_item_backq(q, item);
1574 *q = item;
1575}
1576
1577STATIC int 1522STATIC int
1578xlog_recover_reorder_trans( 1523xlog_recover_reorder_trans(
1579 xlog_recover_t *trans) 1524 xlog_recover_t *trans)
1580{ 1525{
1581 xlog_recover_item_t *first_item, *itemq, *itemq_next; 1526 xlog_recover_item_t *item, *n;
1582 xfs_buf_log_format_t *buf_f; 1527 LIST_HEAD(sort_list);
1583 ushort flags = 0;
1584 1528
1585 first_item = itemq = trans->r_itemq; 1529 list_splice_init(&trans->r_itemq, &sort_list);
1586 trans->r_itemq = NULL; 1530 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1587 do { 1531 xfs_buf_log_format_t *buf_f;
1588 itemq_next = itemq->ri_next;
1589 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1590 1532
1591 switch (ITEM_TYPE(itemq)) { 1533 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
1534
1535 switch (ITEM_TYPE(item)) {
1592 case XFS_LI_BUF: 1536 case XFS_LI_BUF:
1593 flags = buf_f->blf_flags; 1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
1594 if (!(flags & XFS_BLI_CANCEL)) { 1538 list_move(&item->ri_list, &trans->r_itemq);
1595 xlog_recover_insert_item_frontq(&trans->r_itemq,
1596 itemq);
1597 break; 1539 break;
1598 } 1540 }
1599 case XFS_LI_INODE: 1541 case XFS_LI_INODE:
@@ -1601,7 +1543,7 @@ xlog_recover_reorder_trans(
1601 case XFS_LI_QUOTAOFF: 1543 case XFS_LI_QUOTAOFF:
1602 case XFS_LI_EFD: 1544 case XFS_LI_EFD:
1603 case XFS_LI_EFI: 1545 case XFS_LI_EFI:
1604 xlog_recover_insert_item_backq(&trans->r_itemq, itemq); 1546 list_move_tail(&item->ri_list, &trans->r_itemq);
1605 break; 1547 break;
1606 default: 1548 default:
1607 xlog_warn( 1549 xlog_warn(
@@ -1609,8 +1551,8 @@ xlog_recover_reorder_trans(
1609 ASSERT(0); 1551 ASSERT(0);
1610 return XFS_ERROR(EIO); 1552 return XFS_ERROR(EIO);
1611 } 1553 }
1612 itemq = itemq_next; 1554 }
1613 } while (first_item != itemq); 1555 ASSERT(list_empty(&sort_list));
1614 return 0; 1556 return 0;
1615} 1557}
1616 1558
@@ -2242,9 +2184,9 @@ xlog_recover_do_buffer_trans(
2242 } 2184 }
2243 2185
2244 mp = log->l_mp; 2186 mp = log->l_mp;
2245 buf_flags = XFS_BUF_LOCK; 2187 buf_flags = XBF_LOCK;
2246 if (!(flags & XFS_BLI_INODE_BUF)) 2188 if (!(flags & XFS_BLI_INODE_BUF))
2247 buf_flags |= XFS_BUF_MAPPED; 2189 buf_flags |= XBF_MAPPED;
2248 2190
2249 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
2250 if (XFS_BUF_ISERROR(bp)) { 2192 if (XFS_BUF_ISERROR(bp)) {
@@ -2346,7 +2288,7 @@ xlog_recover_do_inode_trans(
2346 } 2288 }
2347 2289
2348 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2349 XFS_BUF_LOCK); 2291 XBF_LOCK);
2350 if (XFS_BUF_ISERROR(bp)) { 2292 if (XFS_BUF_ISERROR(bp)) {
2351 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2293 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2352 bp, in_f->ilf_blkno); 2294 bp, in_f->ilf_blkno);
@@ -2814,14 +2756,13 @@ xlog_recover_do_trans(
2814 int pass) 2756 int pass)
2815{ 2757{
2816 int error = 0; 2758 int error = 0;
2817 xlog_recover_item_t *item, *first_item; 2759 xlog_recover_item_t *item;
2818 2760
2819 error = xlog_recover_reorder_trans(trans); 2761 error = xlog_recover_reorder_trans(trans);
2820 if (error) 2762 if (error)
2821 return error; 2763 return error;
2822 2764
2823 first_item = item = trans->r_itemq; 2765 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2824 do {
2825 switch (ITEM_TYPE(item)) { 2766 switch (ITEM_TYPE(item)) {
2826 case XFS_LI_BUF: 2767 case XFS_LI_BUF:
2827 error = xlog_recover_do_buffer_trans(log, item, pass); 2768 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2854,8 +2795,7 @@ xlog_recover_do_trans(
2854 2795
2855 if (error) 2796 if (error)
2856 return error; 2797 return error;
2857 item = item->ri_next; 2798 }
2858 } while (first_item != item);
2859 2799
2860 return 0; 2800 return 0;
2861} 2801}
@@ -2869,21 +2809,18 @@ STATIC void
2869xlog_recover_free_trans( 2809xlog_recover_free_trans(
2870 xlog_recover_t *trans) 2810 xlog_recover_t *trans)
2871{ 2811{
2872 xlog_recover_item_t *first_item, *item, *free_item; 2812 xlog_recover_item_t *item, *n;
2873 int i; 2813 int i;
2874 2814
2875 item = first_item = trans->r_itemq; 2815 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2876 do { 2816 /* Free the regions in the item. */
2877 free_item = item; 2817 list_del(&item->ri_list);
2878 item = item->ri_next; 2818 for (i = 0; i < item->ri_cnt; i++)
2879 /* Free the regions in the item. */ 2819 kmem_free(item->ri_buf[i].i_addr);
2880 for (i = 0; i < free_item->ri_cnt; i++) {
2881 kmem_free(free_item->ri_buf[i].i_addr);
2882 }
2883 /* Free the item itself */ 2820 /* Free the item itself */
2884 kmem_free(free_item->ri_buf); 2821 kmem_free(item->ri_buf);
2885 kmem_free(free_item); 2822 kmem_free(item);
2886 } while (first_item != item); 2823 }
2887 /* Free the transaction recover structure */ 2824 /* Free the transaction recover structure */
2888 kmem_free(trans); 2825 kmem_free(trans);
2889} 2826}
@@ -2891,14 +2828,12 @@ xlog_recover_free_trans(
2891STATIC int 2828STATIC int
2892xlog_recover_commit_trans( 2829xlog_recover_commit_trans(
2893 xlog_t *log, 2830 xlog_t *log,
2894 xlog_recover_t **q,
2895 xlog_recover_t *trans, 2831 xlog_recover_t *trans,
2896 int pass) 2832 int pass)
2897{ 2833{
2898 int error; 2834 int error;
2899 2835
2900 if ((error = xlog_recover_unlink_tid(q, trans))) 2836 hlist_del(&trans->r_list);
2901 return error;
2902 if ((error = xlog_recover_do_trans(log, trans, pass))) 2837 if ((error = xlog_recover_do_trans(log, trans, pass)))
2903 return error; 2838 return error;
2904 xlog_recover_free_trans(trans); /* no error */ 2839 xlog_recover_free_trans(trans); /* no error */
@@ -2926,7 +2861,7 @@ xlog_recover_unmount_trans(
2926STATIC int 2861STATIC int
2927xlog_recover_process_data( 2862xlog_recover_process_data(
2928 xlog_t *log, 2863 xlog_t *log,
2929 xlog_recover_t *rhash[], 2864 struct hlist_head rhash[],
2930 xlog_rec_header_t *rhead, 2865 xlog_rec_header_t *rhead,
2931 xfs_caddr_t dp, 2866 xfs_caddr_t dp,
2932 int pass) 2867 int pass)
@@ -2960,7 +2895,7 @@ xlog_recover_process_data(
2960 } 2895 }
2961 tid = be32_to_cpu(ohead->oh_tid); 2896 tid = be32_to_cpu(ohead->oh_tid);
2962 hash = XLOG_RHASH(tid); 2897 hash = XLOG_RHASH(tid);
2963 trans = xlog_recover_find_tid(rhash[hash], tid); 2898 trans = xlog_recover_find_tid(&rhash[hash], tid);
2964 if (trans == NULL) { /* not found; add new tid */ 2899 if (trans == NULL) { /* not found; add new tid */
2965 if (ohead->oh_flags & XLOG_START_TRANS) 2900 if (ohead->oh_flags & XLOG_START_TRANS)
2966 xlog_recover_new_tid(&rhash[hash], tid, 2901 xlog_recover_new_tid(&rhash[hash], tid,
@@ -2978,7 +2913,7 @@ xlog_recover_process_data(
2978 switch (flags) { 2913 switch (flags) {
2979 case XLOG_COMMIT_TRANS: 2914 case XLOG_COMMIT_TRANS:
2980 error = xlog_recover_commit_trans(log, 2915 error = xlog_recover_commit_trans(log,
2981 &rhash[hash], trans, pass); 2916 trans, pass);
2982 break; 2917 break;
2983 case XLOG_UNMOUNT_TRANS: 2918 case XLOG_UNMOUNT_TRANS:
2984 error = xlog_recover_unmount_trans(trans); 2919 error = xlog_recover_unmount_trans(trans);
@@ -3211,7 +3146,7 @@ xlog_recover_process_one_iunlink(
3211 /* 3146 /*
3212 * Get the on disk inode to find the next inode in the bucket. 3147 * Get the on disk inode to find the next inode in the bucket.
3213 */ 3148 */
3214 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); 3149 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
3215 if (error) 3150 if (error)
3216 goto fail_iput; 3151 goto fail_iput;
3217 3152
@@ -3517,7 +3452,7 @@ xlog_do_recovery_pass(
3517 int error = 0, h_size; 3452 int error = 0, h_size;
3518 int bblks, split_bblks; 3453 int bblks, split_bblks;
3519 int hblks, split_hblks, wrapped_hblks; 3454 int hblks, split_hblks, wrapped_hblks;
3520 xlog_recover_t *rhash[XLOG_RHASH_SIZE]; 3455 struct hlist_head rhash[XLOG_RHASH_SIZE];
3521 3456
3522 ASSERT(head_blk != tail_blk); 3457 ASSERT(head_blk != tail_blk);
3523 3458
@@ -3978,8 +3913,7 @@ xlog_recover_finish(
3978 * case the unlink transactions would have problems 3913 * case the unlink transactions would have problems
3979 * pushing the EFIs out of the way. 3914 * pushing the EFIs out of the way.
3980 */ 3915 */
3981 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3916 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3982 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3983 3917
3984 xlog_recover_process_iunlinks(log); 3918 xlog_recover_process_iunlinks(log);
3985 3919
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index b22545555301..75d749207258 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -35,22 +35,21 @@
35 * item headers are in ri_buf[0]. Additional buffers follow. 35 * item headers are in ri_buf[0]. Additional buffers follow.
36 */ 36 */
37typedef struct xlog_recover_item { 37typedef struct xlog_recover_item {
38 struct xlog_recover_item *ri_next; 38 struct list_head ri_list;
39 struct xlog_recover_item *ri_prev; 39 int ri_type;
40 int ri_type; 40 int ri_cnt; /* count of regions found */
41 int ri_cnt; /* count of regions found */ 41 int ri_total; /* total regions */
42 int ri_total; /* total regions */ 42 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
43 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
44} xlog_recover_item_t; 43} xlog_recover_item_t;
45 44
46struct xlog_tid; 45struct xlog_tid;
47typedef struct xlog_recover { 46typedef struct xlog_recover {
48 struct xlog_recover *r_next; 47 struct hlist_node r_list;
49 xlog_tid_t r_log_tid; /* log's transaction id */ 48 xlog_tid_t r_log_tid; /* log's transaction id */
50 xfs_trans_header_t r_theader; /* trans header for partial */ 49 xfs_trans_header_t r_theader; /* trans header for partial */
51 int r_state; /* not needed */ 50 int r_state; /* not needed */
52 xfs_lsn_t r_lsn; /* xact lsn */ 51 xfs_lsn_t r_lsn; /* xact lsn */
53 xlog_recover_item_t *r_itemq; /* q for items */ 52 struct list_head r_itemq; /* q for items */
54} xlog_recover_t; 53} xlog_recover_t;
55 54
56#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) 55#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index eb403b40e120..6afaaeb2950a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -201,6 +201,38 @@ xfs_uuid_unmount(
201 201
202 202
203/* 203/*
204 * Reference counting access wrappers to the perag structures.
205 */
206struct xfs_perag *
207xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
208{
209 struct xfs_perag *pag;
210 int ref = 0;
211
212 spin_lock(&mp->m_perag_lock);
213 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
214 if (pag) {
215 ASSERT(atomic_read(&pag->pag_ref) >= 0);
216 /* catch leaks in the positive direction during testing */
217 ASSERT(atomic_read(&pag->pag_ref) < 1000);
218 ref = atomic_inc_return(&pag->pag_ref);
219 }
220 spin_unlock(&mp->m_perag_lock);
221 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
222 return pag;
223}
224
225void
226xfs_perag_put(struct xfs_perag *pag)
227{
228 int ref;
229
230 ASSERT(atomic_read(&pag->pag_ref) > 0);
231 ref = atomic_dec_return(&pag->pag_ref);
232 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
233}
234
235/*
204 * Free up the resources associated with a mount structure. Assume that 236 * Free up the resources associated with a mount structure. Assume that
205 * the structure was initially zeroed, so we can tell which fields got 237 * the structure was initially zeroed, so we can tell which fields got
206 * initialized. 238 * initialized.
@@ -209,13 +241,16 @@ STATIC void
209xfs_free_perag( 241xfs_free_perag(
210 xfs_mount_t *mp) 242 xfs_mount_t *mp)
211{ 243{
212 if (mp->m_perag) { 244 xfs_agnumber_t agno;
213 int agno; 245 struct xfs_perag *pag;
214 246
215 for (agno = 0; agno < mp->m_maxagi; agno++) 247 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
216 if (mp->m_perag[agno].pagb_list) 248 spin_lock(&mp->m_perag_lock);
217 kmem_free(mp->m_perag[agno].pagb_list); 249 pag = radix_tree_delete(&mp->m_perag_tree, agno);
218 kmem_free(mp->m_perag); 250 ASSERT(pag);
251 ASSERT(atomic_read(&pag->pag_ref) == 0);
252 spin_unlock(&mp->m_perag_lock);
253 kmem_free(pag);
219 } 254 }
220} 255}
221 256
@@ -389,22 +424,57 @@ xfs_initialize_perag_icache(
389 } 424 }
390} 425}
391 426
392xfs_agnumber_t 427int
393xfs_initialize_perag( 428xfs_initialize_perag(
394 xfs_mount_t *mp, 429 xfs_mount_t *mp,
395 xfs_agnumber_t agcount) 430 xfs_agnumber_t agcount,
431 xfs_agnumber_t *maxagi)
396{ 432{
397 xfs_agnumber_t index, max_metadata; 433 xfs_agnumber_t index, max_metadata;
434 xfs_agnumber_t first_initialised = 0;
398 xfs_perag_t *pag; 435 xfs_perag_t *pag;
399 xfs_agino_t agino; 436 xfs_agino_t agino;
400 xfs_ino_t ino; 437 xfs_ino_t ino;
401 xfs_sb_t *sbp = &mp->m_sb; 438 xfs_sb_t *sbp = &mp->m_sb;
402 xfs_ino_t max_inum = XFS_MAXINUMBER_32; 439 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
440 int error = -ENOMEM;
403 441
404 /* Check to see if the filesystem can overflow 32 bit inodes */ 442 /* Check to see if the filesystem can overflow 32 bit inodes */
405 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); 443 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
406 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); 444 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
407 445
446 /*
447 * Walk the current per-ag tree so we don't try to initialise AGs
448 * that already exist (growfs case). Allocate and insert all the
449 * AGs we don't find ready for initialisation.
450 */
451 for (index = 0; index < agcount; index++) {
452 pag = xfs_perag_get(mp, index);
453 if (pag) {
454 xfs_perag_put(pag);
455 continue;
456 }
457 if (!first_initialised)
458 first_initialised = index;
459 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
460 if (!pag)
461 goto out_unwind;
462 if (radix_tree_preload(GFP_NOFS))
463 goto out_unwind;
464 spin_lock(&mp->m_perag_lock);
465 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
466 BUG();
467 spin_unlock(&mp->m_perag_lock);
468 radix_tree_preload_end();
469 error = -EEXIST;
470 goto out_unwind;
471 }
472 pag->pag_agno = index;
473 pag->pag_mount = mp;
474 spin_unlock(&mp->m_perag_lock);
475 radix_tree_preload_end();
476 }
477
408 /* Clear the mount flag if no inode can overflow 32 bits 478 /* Clear the mount flag if no inode can overflow 32 bits
409 * on this filesystem, or if specifically requested.. 479 * on this filesystem, or if specifically requested..
410 */ 480 */
@@ -438,21 +508,33 @@ xfs_initialize_perag(
438 } 508 }
439 509
440 /* This ag is preferred for inodes */ 510 /* This ag is preferred for inodes */
441 pag = &mp->m_perag[index]; 511 pag = xfs_perag_get(mp, index);
442 pag->pagi_inodeok = 1; 512 pag->pagi_inodeok = 1;
443 if (index < max_metadata) 513 if (index < max_metadata)
444 pag->pagf_metadata = 1; 514 pag->pagf_metadata = 1;
445 xfs_initialize_perag_icache(pag); 515 xfs_initialize_perag_icache(pag);
516 xfs_perag_put(pag);
446 } 517 }
447 } else { 518 } else {
448 /* Setup default behavior for smaller filesystems */ 519 /* Setup default behavior for smaller filesystems */
449 for (index = 0; index < agcount; index++) { 520 for (index = 0; index < agcount; index++) {
450 pag = &mp->m_perag[index]; 521 pag = xfs_perag_get(mp, index);
451 pag->pagi_inodeok = 1; 522 pag->pagi_inodeok = 1;
452 xfs_initialize_perag_icache(pag); 523 xfs_initialize_perag_icache(pag);
524 xfs_perag_put(pag);
453 } 525 }
454 } 526 }
455 return index; 527 if (maxagi)
528 *maxagi = index;
529 return 0;
530
531out_unwind:
532 kmem_free(pag);
533 for (; index > first_initialised; index--) {
534 pag = radix_tree_delete(&mp->m_perag_tree, index);
535 kmem_free(pag);
536 }
537 return error;
456} 538}
457 539
458void 540void
@@ -583,7 +665,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
583 * access to the superblock. 665 * access to the superblock.
584 */ 666 */
585 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 667 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
586 extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; 668 extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
587 669
588 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 670 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
589 extra_flags); 671 extra_flags);
@@ -731,12 +813,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
731 error = xfs_ialloc_pagi_init(mp, NULL, index); 813 error = xfs_ialloc_pagi_init(mp, NULL, index);
732 if (error) 814 if (error)
733 return error; 815 return error;
734 pag = &mp->m_perag[index]; 816 pag = xfs_perag_get(mp, index);
735 ifree += pag->pagi_freecount; 817 ifree += pag->pagi_freecount;
736 ialloc += pag->pagi_count; 818 ialloc += pag->pagi_count;
737 bfree += pag->pagf_freeblks; 819 bfree += pag->pagf_freeblks;
738 bfreelst += pag->pagf_flcount; 820 bfreelst += pag->pagf_flcount;
739 btree += pag->pagf_btreeblks; 821 btree += pag->pagf_btreeblks;
822 xfs_perag_put(pag);
740 } 823 }
741 /* 824 /*
742 * Overwrite incore superblock counters with just-read data 825 * Overwrite incore superblock counters with just-read data
@@ -1008,6 +1091,22 @@ xfs_mount_reset_sbqflags(
1008 return xfs_trans_commit(tp, 0); 1091 return xfs_trans_commit(tp, 0);
1009} 1092}
1010 1093
1094__uint64_t
1095xfs_default_resblks(xfs_mount_t *mp)
1096{
1097 __uint64_t resblks;
1098
1099 /*
1100 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
1101 * This may drive us straight to ENOSPC on mount, but that implies
1102 * we were already there on the last unmount. Warn if this occurs.
1103 */
1104 resblks = mp->m_sb.sb_dblocks;
1105 do_div(resblks, 20);
1106 resblks = min_t(__uint64_t, resblks, 1024);
1107 return resblks;
1108}
1109
1011/* 1110/*
1012 * This function does the following on an initial mount of a file system: 1111 * This function does the following on an initial mount of a file system:
1013 * - reads the superblock from disk and init the mount struct 1112 * - reads the superblock from disk and init the mount struct
@@ -1152,13 +1251,13 @@ xfs_mountfs(
1152 /* 1251 /*
1153 * Allocate and initialize the per-ag data. 1252 * Allocate and initialize the per-ag data.
1154 */ 1253 */
1155 init_rwsem(&mp->m_peraglock); 1254 spin_lock_init(&mp->m_perag_lock);
1156 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), 1255 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
1157 KM_MAYFAIL); 1256 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1158 if (!mp->m_perag) 1257 if (error) {
1258 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
1159 goto out_remove_uuid; 1259 goto out_remove_uuid;
1160 1260 }
1161 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
1162 1261
1163 if (!sbp->sb_logblocks) { 1262 if (!sbp->sb_logblocks) {
1164 cmn_err(CE_WARN, "XFS: no log defined"); 1263 cmn_err(CE_WARN, "XFS: no log defined");
@@ -1318,18 +1417,14 @@ xfs_mountfs(
1318 * when at ENOSPC. This is needed for operations like create with 1417 * when at ENOSPC. This is needed for operations like create with
1319 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations 1418 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
1320 * are not allowed to use this reserved space. 1419 * are not allowed to use this reserved space.
1321 *
1322 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
1323 * This may drive us straight to ENOSPC on mount, but that implies
1324 * we were already there on the last unmount. Warn if this occurs.
1325 */ 1420 */
1326 resblks = mp->m_sb.sb_dblocks; 1421 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1327 do_div(resblks, 20); 1422 resblks = xfs_default_resblks(mp);
1328 resblks = min_t(__uint64_t, resblks, 1024); 1423 error = xfs_reserve_blocks(mp, &resblks, NULL);
1329 error = xfs_reserve_blocks(mp, &resblks, NULL); 1424 if (error)
1330 if (error) 1425 cmn_err(CE_WARN, "XFS: Unable to allocate reserve "
1331 cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. " 1426 "blocks. Continuing without a reserve pool.");
1332 "Continuing without a reserve pool."); 1427 }
1333 1428
1334 return 0; 1429 return 0;
1335 1430
@@ -1372,8 +1467,19 @@ xfs_unmountfs(
1372 * push out the iclog we will never get that unlocked. hence we 1467 * push out the iclog we will never get that unlocked. hence we
1373 * need to force the log first. 1468 * need to force the log first.
1374 */ 1469 */
1375 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1470 xfs_log_force(mp, XFS_LOG_SYNC);
1376 xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC); 1471
1472 /*
1473 * Do a delwri reclaim pass first so that as many dirty inodes are
1474 * queued up for IO as possible. Then flush the buffers before making
1475 * a synchronous path to catch all the remaining inodes are reclaimed.
1476 * This makes the reclaim process as quick as possible by avoiding
1477 * synchronous writeout and blocking on inodes already in the delwri
1478 * state as much as possible.
1479 */
1480 xfs_reclaim_inodes(mp, 0);
1481 XFS_bflush(mp->m_ddev_targp);
1482 xfs_reclaim_inodes(mp, SYNC_WAIT);
1377 1483
1378 xfs_qm_unmount(mp); 1484 xfs_qm_unmount(mp);
1379 1485
@@ -1382,7 +1488,7 @@ xfs_unmountfs(
1382 * that nothing is pinned. This is important because bflush() 1488 * that nothing is pinned. This is important because bflush()
1383 * will skip pinned buffers. 1489 * will skip pinned buffers.
1384 */ 1490 */
1385 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1491 xfs_log_force(mp, XFS_LOG_SYNC);
1386 1492
1387 xfs_binval(mp->m_ddev_targp); 1493 xfs_binval(mp->m_ddev_targp);
1388 if (mp->m_rtdev_targp) { 1494 if (mp->m_rtdev_targp) {
@@ -1548,15 +1654,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1548 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); 1654 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
1549 1655
1550 /* find modified range */ 1656 /* find modified range */
1657 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1658 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1659 last = xfs_sb_info[f + 1].offset - 1;
1551 1660
1552 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 1661 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1553 ASSERT((1LL << f) & XFS_SB_MOD_BITS); 1662 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1554 first = xfs_sb_info[f].offset; 1663 first = xfs_sb_info[f].offset;
1555 1664
1556 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1557 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1558 last = xfs_sb_info[f + 1].offset - 1;
1559
1560 xfs_trans_log_buf(tp, bp, first, last); 1665 xfs_trans_log_buf(tp, bp, first, last);
1561} 1666}
1562 1667
@@ -1887,7 +1992,7 @@ xfs_getsb(
1887 1992
1888 ASSERT(mp->m_sb_bp != NULL); 1993 ASSERT(mp->m_sb_bp != NULL);
1889 bp = mp->m_sb_bp; 1994 bp = mp->m_sb_bp;
1890 if (flags & XFS_BUF_TRYLOCK) { 1995 if (flags & XBF_TRYLOCK) {
1891 if (!XFS_BUF_CPSEMA(bp)) { 1996 if (!XFS_BUF_CPSEMA(bp)) {
1892 return NULL; 1997 return NULL;
1893 } 1998 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1df7e4502967..14dafd608230 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -78,7 +78,8 @@ typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
78typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, 78typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
79 struct xfs_inode *, dm_right_t, 79 struct xfs_inode *, dm_right_t,
80 struct xfs_inode *, dm_right_t, 80 struct xfs_inode *, dm_right_t,
81 const char *, const char *, mode_t, int, int); 81 const unsigned char *, const unsigned char *,
82 mode_t, int, int);
82typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, 83typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
83 char *, char *); 84 char *, char *);
84typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, 85typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
@@ -207,8 +208,8 @@ typedef struct xfs_mount {
207 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 208 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
208 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 209 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
209 uint m_in_maxlevels; /* max inobt btree levels. */ 210 uint m_in_maxlevels; /* max inobt btree levels. */
210 struct xfs_perag *m_perag; /* per-ag accounting info */ 211 struct radix_tree_root m_perag_tree; /* per-ag accounting info */
211 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ 212 spinlock_t m_perag_lock; /* lock for m_perag_tree */
212 struct mutex m_growlock; /* growfs mutex */ 213 struct mutex m_growlock; /* growfs mutex */
213 int m_fixedfsid[2]; /* unchanged for life of FS */ 214 int m_fixedfsid[2]; /* unchanged for life of FS */
214 uint m_dmevmask; /* DMI events for this FS */ 215 uint m_dmevmask; /* DMI events for this FS */
@@ -224,6 +225,7 @@ typedef struct xfs_mount {
224 __uint64_t m_maxioffset; /* maximum inode offset */ 225 __uint64_t m_maxioffset; /* maximum inode offset */
225 __uint64_t m_resblks; /* total reserved blocks */ 226 __uint64_t m_resblks; /* total reserved blocks */
226 __uint64_t m_resblks_avail;/* available reserved blocks */ 227 __uint64_t m_resblks_avail;/* available reserved blocks */
228 __uint64_t m_resblks_save; /* reserved blks @ remount,ro */
227 int m_dalign; /* stripe unit */ 229 int m_dalign; /* stripe unit */
228 int m_swidth; /* stripe width */ 230 int m_swidth; /* stripe width */
229 int m_sinoalign; /* stripe unit inode alignment */ 231 int m_sinoalign; /* stripe unit inode alignment */
@@ -243,7 +245,7 @@ typedef struct xfs_mount {
243 struct xfs_qmops *m_qm_ops; /* vector of XQM ops */ 245 struct xfs_qmops *m_qm_ops; /* vector of XQM ops */
244 atomic_t m_active_trans; /* number trans frozen */ 246 atomic_t m_active_trans; /* number trans frozen */
245#ifdef HAVE_PERCPU_SB 247#ifdef HAVE_PERCPU_SB
246 xfs_icsb_cnts_t *m_sb_cnts; /* per-cpu superblock counters */ 248 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
247 unsigned long m_icsb_counters; /* disabled per-cpu counters */ 249 unsigned long m_icsb_counters; /* disabled per-cpu counters */
248 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ 250 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
249 struct mutex m_icsb_mutex; /* balancer sync lock */ 251 struct mutex m_icsb_mutex; /* balancer sync lock */
@@ -384,19 +386,10 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
384} 386}
385 387
386/* 388/*
387 * perag get/put wrappers for eventual ref counting 389 * perag get/put wrappers for ref counting
388 */ 390 */
389static inline xfs_perag_t * 391struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
390xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino) 392void xfs_perag_put(struct xfs_perag *pag);
391{
392 return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
393}
394
395static inline void
396xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
397{
398 /* nothing to see here, move along */
399}
400 393
401/* 394/*
402 * Per-cpu superblock locking functions 395 * Per-cpu superblock locking functions
@@ -428,6 +421,7 @@ typedef struct xfs_mod_sb {
428} xfs_mod_sb_t; 421} xfs_mod_sb_t;
429 422
430extern int xfs_log_sbcount(xfs_mount_t *, uint); 423extern int xfs_log_sbcount(xfs_mount_t *, uint);
424extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
431extern int xfs_mountfs(xfs_mount_t *mp); 425extern int xfs_mountfs(xfs_mount_t *mp);
432 426
433extern void xfs_unmountfs(xfs_mount_t *); 427extern void xfs_unmountfs(xfs_mount_t *);
@@ -450,7 +444,8 @@ extern struct xfs_dmops xfs_dmcore_xfs;
450#endif /* __KERNEL__ */ 444#endif /* __KERNEL__ */
451 445
452extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 446extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
453extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t); 447extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
448 xfs_agnumber_t *);
454extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); 449extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
455extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 450extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
456 451
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4b0613d99faa..45ce15dc5b2b 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -398,7 +398,7 @@ exit:
398 * guaranteed that all the free functions for all the elements have finished 398 * guaranteed that all the free functions for all the elements have finished
399 * executing and the reaper is not running. 399 * executing and the reaper is not running.
400 */ 400 */
401void 401static void
402xfs_mru_cache_flush( 402xfs_mru_cache_flush(
403 xfs_mru_cache_t *mru) 403 xfs_mru_cache_t *mru)
404{ 404{
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 5d439f34b0c9..36dd3ec8b4eb 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -42,7 +42,6 @@ void xfs_mru_cache_uninit(void);
42int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, 42int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
43 unsigned int grp_count, 43 unsigned int grp_count,
44 xfs_mru_cache_free_func_t free_func); 44 xfs_mru_cache_free_func_t free_func);
45void xfs_mru_cache_flush(xfs_mru_cache_t *mru);
46void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); 45void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
47int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, 46int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
48 void *value); 47 void *value);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 91bfd60f4c74..fdcab3f81dde 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -223,16 +223,9 @@ typedef struct xfs_qoff_logformat {
223#define XFS_QMOPT_RES_INOS 0x0800000 223#define XFS_QMOPT_RES_INOS 0x0800000
224 224
225/* 225/*
226 * flags for dqflush and dqflush_all.
227 */
228#define XFS_QMOPT_SYNC 0x1000000
229#define XFS_QMOPT_ASYNC 0x2000000
230#define XFS_QMOPT_DELWRI 0x4000000
231
232/*
233 * flags for dqalloc. 226 * flags for dqalloc.
234 */ 227 */
235#define XFS_QMOPT_INHERIT 0x8000000 228#define XFS_QMOPT_INHERIT 0x1000000
236 229
237/* 230/*
238 * flags to xfs_trans_mod_dquot. 231 * flags to xfs_trans_mod_dquot.
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 5aa07caea5f1..e336742a58a4 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -47,48 +47,6 @@
47#include "xfs_trace.h" 47#include "xfs_trace.h"
48 48
49/* 49/*
50 * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
51 * which clears the setuid and setgid bits when a file is written.
52 */
53int
54xfs_write_clear_setuid(
55 xfs_inode_t *ip)
56{
57 xfs_mount_t *mp;
58 xfs_trans_t *tp;
59 int error;
60
61 mp = ip->i_mount;
62 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
63 if ((error = xfs_trans_reserve(tp, 0,
64 XFS_WRITEID_LOG_RES(mp),
65 0, 0, 0))) {
66 xfs_trans_cancel(tp, 0);
67 return error;
68 }
69 xfs_ilock(ip, XFS_ILOCK_EXCL);
70 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
71 xfs_trans_ihold(tp, ip);
72 ip->i_d.di_mode &= ~S_ISUID;
73
74 /*
75 * Note that we don't have to worry about mandatory
76 * file locking being disabled here because we only
77 * clear the S_ISGID bit if the Group execute bit is
78 * on, but if it was on then mandatory locking wouldn't
79 * have been enabled.
80 */
81 if (ip->i_d.di_mode & S_IXGRP) {
82 ip->i_d.di_mode &= ~S_ISGID;
83 }
84 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
85 xfs_trans_set_sync(tp);
86 error = xfs_trans_commit(tp, 0);
87 xfs_iunlock(ip, XFS_ILOCK_EXCL);
88 return 0;
89}
90
91/*
92 * Force a shutdown of the filesystem instantly while keeping 50 * Force a shutdown of the filesystem instantly while keeping
93 * the filesystem consistent. We don't do an unmount here; just shutdown 51 * the filesystem consistent. We don't do an unmount here; just shutdown
94 * the shop, make sure that absolutely nothing persistent happens to 52 * the shop, make sure that absolutely nothing persistent happens to
@@ -153,88 +111,6 @@ xfs_do_force_shutdown(
153 } 111 }
154} 112}
155 113
156
157/*
158 * Called when we want to stop a buffer from getting written or read.
159 * We attach the EIO error, muck with its flags, and call biodone
160 * so that the proper iodone callbacks get called.
161 */
162int
163xfs_bioerror(
164 xfs_buf_t *bp)
165{
166
167#ifdef XFSERRORDEBUG
168 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
169#endif
170
171 /*
172 * No need to wait until the buffer is unpinned.
173 * We aren't flushing it.
174 */
175 XFS_BUF_ERROR(bp, EIO);
176 /*
177 * We're calling biodone, so delete B_DONE flag. Either way
178 * we have to call the iodone callback, and calling biodone
179 * probably is the best way since it takes care of
180 * GRIO as well.
181 */
182 XFS_BUF_UNREAD(bp);
183 XFS_BUF_UNDELAYWRITE(bp);
184 XFS_BUF_UNDONE(bp);
185 XFS_BUF_STALE(bp);
186
187 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
188 xfs_biodone(bp);
189
190 return (EIO);
191}
192
193/*
194 * Same as xfs_bioerror, except that we are releasing the buffer
195 * here ourselves, and avoiding the biodone call.
196 * This is meant for userdata errors; metadata bufs come with
197 * iodone functions attached, so that we can track down errors.
198 */
199int
200xfs_bioerror_relse(
201 xfs_buf_t *bp)
202{
203 int64_t fl;
204
205 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
206 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
207
208 fl = XFS_BUF_BFLAGS(bp);
209 /*
210 * No need to wait until the buffer is unpinned.
211 * We aren't flushing it.
212 *
213 * chunkhold expects B_DONE to be set, whether
214 * we actually finish the I/O or not. We don't want to
215 * change that interface.
216 */
217 XFS_BUF_UNREAD(bp);
218 XFS_BUF_UNDELAYWRITE(bp);
219 XFS_BUF_DONE(bp);
220 XFS_BUF_STALE(bp);
221 XFS_BUF_CLR_IODONE_FUNC(bp);
222 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
223 if (!(fl & XFS_B_ASYNC)) {
224 /*
225 * Mark b_error and B_ERROR _both_.
226 * Lot's of chunkcache code assumes that.
227 * There's no reason to mark error for
228 * ASYNC buffers.
229 */
230 XFS_BUF_ERROR(bp, EIO);
231 XFS_BUF_FINISH_IOWAIT(bp);
232 } else {
233 xfs_buf_relse(bp);
234 }
235 return (EIO);
236}
237
238/* 114/*
239 * Prints out an ALERT message about I/O error. 115 * Prints out an ALERT message about I/O error.
240 */ 116 */
@@ -306,37 +182,6 @@ xfs_read_buf(
306} 182}
307 183
308/* 184/*
309 * Wrapper around bwrite() so that we can trap
310 * write errors, and act accordingly.
311 */
312int
313xfs_bwrite(
314 struct xfs_mount *mp,
315 struct xfs_buf *bp)
316{
317 int error;
318
319 /*
320 * XXXsup how does this work for quotas.
321 */
322 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
323 bp->b_mount = mp;
324 XFS_BUF_WRITE(bp);
325
326 if ((error = XFS_bwrite(bp))) {
327 ASSERT(mp);
328 /*
329 * Cannot put a buftrace here since if the buffer is not
330 * B_HOLD then we will brelse() the buffer before returning
331 * from bwrite and we could be tracing a buffer that has
332 * been reused.
333 */
334 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
335 }
336 return (error);
337}
338
339/*
340 * helper function to extract extent size hint from inode 185 * helper function to extract extent size hint from inode
341 */ 186 */
342xfs_extlen_t 187xfs_extlen_t
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 571f2174435c..11c41ec6ed75 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -39,10 +39,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
39/* 39/*
40 * Prototypes for functions in xfs_rw.c. 40 * Prototypes for functions in xfs_rw.c.
41 */ 41 */
42extern int xfs_write_clear_setuid(struct xfs_inode *ip);
43extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
44extern int xfs_bioerror(struct xfs_buf *bp);
45extern int xfs_bioerror_relse(struct xfs_buf *bp);
46extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, 42extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
47 xfs_daddr_t blkno, int len, uint flags, 43 xfs_daddr_t blkno, int len, uint flags,
48 struct xfs_buf **bpp); 44 struct xfs_buf **bpp);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 237badcbac3b..be942d4e3324 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -981,9 +981,8 @@ shut_us_down:
981 */ 981 */
982 if (sync) { 982 if (sync) {
983 if (!error) { 983 if (!error) {
984 error = _xfs_log_force(mp, commit_lsn, 984 error = _xfs_log_force_lsn(mp, commit_lsn,
985 XFS_LOG_FORCE | XFS_LOG_SYNC, 985 XFS_LOG_SYNC, log_flushed);
986 log_flushed);
987 } 986 }
988 XFS_STATS_INC(xs_trans_sync); 987 XFS_STATS_INC(xs_trans_sync);
989 } else { 988 } else {
@@ -1121,7 +1120,7 @@ xfs_trans_fill_vecs(
1121 tp->t_header.th_num_items = nitems; 1120 tp->t_header.th_num_items = nitems;
1122 log_vector->i_addr = (xfs_caddr_t)&tp->t_header; 1121 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
1123 log_vector->i_len = sizeof(xfs_trans_header_t); 1122 log_vector->i_len = sizeof(xfs_trans_header_t);
1124 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR); 1123 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
1125} 1124}
1126 1125
1127 1126
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ca64f33c63a3..c93e3a102857 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -861,8 +861,7 @@ typedef struct xfs_item_ops {
861#define XFS_ITEM_SUCCESS 0 861#define XFS_ITEM_SUCCESS 0
862#define XFS_ITEM_PINNED 1 862#define XFS_ITEM_PINNED 1
863#define XFS_ITEM_LOCKED 2 863#define XFS_ITEM_LOCKED 2
864#define XFS_ITEM_FLUSHING 3 864#define XFS_ITEM_PUSHBUF 3
865#define XFS_ITEM_PUSHBUF 4
866 865
867/* 866/*
868 * This structure is used to maintain a list of block ranges that have been 867 * This structure is used to maintain a list of block ranges that have been
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ffc570679be..e799824f7245 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -237,14 +237,15 @@ out:
237} 237}
238 238
239/* 239/*
240 * Function that does the work of pushing on the AIL 240 * xfsaild_push does the work of pushing on the AIL. Returning a timeout of
241 * zero indicates that the caller should sleep until woken.
241 */ 242 */
242long 243long
243xfsaild_push( 244xfsaild_push(
244 struct xfs_ail *ailp, 245 struct xfs_ail *ailp,
245 xfs_lsn_t *last_lsn) 246 xfs_lsn_t *last_lsn)
246{ 247{
247 long tout = 1000; /* milliseconds */ 248 long tout = 0;
248 xfs_lsn_t last_pushed_lsn = *last_lsn; 249 xfs_lsn_t last_pushed_lsn = *last_lsn;
249 xfs_lsn_t target = ailp->xa_target; 250 xfs_lsn_t target = ailp->xa_target;
250 xfs_lsn_t lsn; 251 xfs_lsn_t lsn;
@@ -252,6 +253,7 @@ xfsaild_push(
252 int flush_log, count, stuck; 253 int flush_log, count, stuck;
253 xfs_mount_t *mp = ailp->xa_mount; 254 xfs_mount_t *mp = ailp->xa_mount;
254 struct xfs_ail_cursor *cur = &ailp->xa_cursors; 255 struct xfs_ail_cursor *cur = &ailp->xa_cursors;
256 int push_xfsbufd = 0;
255 257
256 spin_lock(&ailp->xa_lock); 258 spin_lock(&ailp->xa_lock);
257 xfs_trans_ail_cursor_init(ailp, cur); 259 xfs_trans_ail_cursor_init(ailp, cur);
@@ -262,7 +264,7 @@ xfsaild_push(
262 */ 264 */
263 xfs_trans_ail_cursor_done(ailp, cur); 265 xfs_trans_ail_cursor_done(ailp, cur);
264 spin_unlock(&ailp->xa_lock); 266 spin_unlock(&ailp->xa_lock);
265 last_pushed_lsn = 0; 267 *last_lsn = 0;
266 return tout; 268 return tout;
267 } 269 }
268 270
@@ -279,7 +281,6 @@ xfsaild_push(
279 * prevents use from spinning when we can't do anything or there is 281 * prevents use from spinning when we can't do anything or there is
280 * lots of contention on the AIL lists. 282 * lots of contention on the AIL lists.
281 */ 283 */
282 tout = 10;
283 lsn = lip->li_lsn; 284 lsn = lip->li_lsn;
284 flush_log = stuck = count = 0; 285 flush_log = stuck = count = 0;
285 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { 286 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
@@ -308,6 +309,7 @@ xfsaild_push(
308 XFS_STATS_INC(xs_push_ail_pushbuf); 309 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 310 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 311 last_pushed_lsn = lsn;
312 push_xfsbufd = 1;
311 break; 313 break;
312 314
313 case XFS_ITEM_PINNED: 315 case XFS_ITEM_PINNED:
@@ -322,12 +324,6 @@ xfsaild_push(
322 stuck++; 324 stuck++;
323 break; 325 break;
324 326
325 case XFS_ITEM_FLUSHING:
326 XFS_STATS_INC(xs_push_ail_flushing);
327 last_pushed_lsn = lsn;
328 stuck++;
329 break;
330
331 default: 327 default:
332 ASSERT(0); 328 ASSERT(0);
333 break; 329 break;
@@ -371,19 +367,24 @@ xfsaild_push(
371 * move forward in the AIL. 367 * move forward in the AIL.
372 */ 368 */
373 XFS_STATS_INC(xs_push_ail_flush); 369 XFS_STATS_INC(xs_push_ail_flush);
374 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 370 xfs_log_force(mp, 0);
371 }
372
373 if (push_xfsbufd) {
374 /* we've got delayed write buffers to flush */
375 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 376 }
376 377
377 if (!count) { 378 if (!count) {
378 /* We're past our target or empty, so idle */ 379 /* We're past our target or empty, so idle */
379 tout = 1000; 380 last_pushed_lsn = 0;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 381 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 382 /*
382 * We reached the target so wait a bit longer for I/O to 383 * We reached the target so wait a bit longer for I/O to
383 * complete and remove pushed items from the AIL before we 384 * complete and remove pushed items from the AIL before we
384 * start the next scan from the start of the AIL. 385 * start the next scan from the start of the AIL.
385 */ 386 */
386 tout += 20; 387 tout = 50;
387 last_pushed_lsn = 0; 388 last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 389 } else if ((stuck * 100) / count > 90) {
389 /* 390 /*
@@ -395,11 +396,14 @@ xfsaild_push(
395 * Backoff a bit more to allow some I/O to complete before 396 * Backoff a bit more to allow some I/O to complete before
396 * continuing from where we were. 397 * continuing from where we were.
397 */ 398 */
398 tout += 10; 399 tout = 20;
400 } else {
401 /* more to do, but wait a short while before continuing */
402 tout = 10;
399 } 403 }
400 *last_lsn = last_pushed_lsn; 404 *last_lsn = last_pushed_lsn;
401 return tout; 405 return tout;
402} /* xfsaild_push */ 406}
403 407
404 408
405/* 409/*
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 49130628d5ef..5ffd544434eb 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -75,13 +75,14 @@ xfs_trans_get_buf(xfs_trans_t *tp,
75 xfs_buf_log_item_t *bip; 75 xfs_buf_log_item_t *bip;
76 76
77 if (flags == 0) 77 if (flags == 0)
78 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; 78 flags = XBF_LOCK | XBF_MAPPED;
79 79
80 /* 80 /*
81 * Default to a normal get_buf() call if the tp is NULL. 81 * Default to a normal get_buf() call if the tp is NULL.
82 */ 82 */
83 if (tp == NULL) 83 if (tp == NULL)
84 return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); 84 return xfs_buf_get(target_dev, blkno, len,
85 flags | XBF_DONT_BLOCK);
85 86
86 /* 87 /*
87 * If we find the buffer in the cache with this transaction 88 * If we find the buffer in the cache with this transaction
@@ -117,14 +118,14 @@ xfs_trans_get_buf(xfs_trans_t *tp,
117 } 118 }
118 119
119 /* 120 /*
120 * We always specify the BUF_BUSY flag within a transaction so 121 * We always specify the XBF_DONT_BLOCK flag within a transaction
121 * that get_buf does not try to push out a delayed write buffer 122 * so that get_buf does not try to push out a delayed write buffer
122 * which might cause another transaction to take place (if the 123 * which might cause another transaction to take place (if the
123 * buffer was delayed alloc). Such recursive transactions can 124 * buffer was delayed alloc). Such recursive transactions can
124 * easily deadlock with our current transaction as well as cause 125 * easily deadlock with our current transaction as well as cause
125 * us to run out of stack space. 126 * us to run out of stack space.
126 */ 127 */
127 bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); 128 bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
128 if (bp == NULL) { 129 if (bp == NULL) {
129 return NULL; 130 return NULL;
130 } 131 }
@@ -290,15 +291,15 @@ xfs_trans_read_buf(
290 int error; 291 int error;
291 292
292 if (flags == 0) 293 if (flags == 0)
293 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; 294 flags = XBF_LOCK | XBF_MAPPED;
294 295
295 /* 296 /*
296 * Default to a normal get_buf() call if the tp is NULL. 297 * Default to a normal get_buf() call if the tp is NULL.
297 */ 298 */
298 if (tp == NULL) { 299 if (tp == NULL) {
299 bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); 300 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
300 if (!bp) 301 if (!bp)
301 return (flags & XFS_BUF_TRYLOCK) ? 302 return (flags & XBF_TRYLOCK) ?
302 EAGAIN : XFS_ERROR(ENOMEM); 303 EAGAIN : XFS_ERROR(ENOMEM);
303 304
304 if (XFS_BUF_GETERROR(bp) != 0) { 305 if (XFS_BUF_GETERROR(bp) != 0) {
@@ -385,14 +386,14 @@ xfs_trans_read_buf(
385 } 386 }
386 387
387 /* 388 /*
388 * We always specify the BUF_BUSY flag within a transaction so 389 * We always specify the XBF_DONT_BLOCK flag within a transaction
389 * that get_buf does not try to push out a delayed write buffer 390 * so that get_buf does not try to push out a delayed write buffer
390 * which might cause another transaction to take place (if the 391 * which might cause another transaction to take place (if the
391 * buffer was delayed alloc). Such recursive transactions can 392 * buffer was delayed alloc). Such recursive transactions can
392 * easily deadlock with our current transaction as well as cause 393 * easily deadlock with our current transaction as well as cause
393 * us to run out of stack space. 394 * us to run out of stack space.
394 */ 395 */
395 bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); 396 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
396 if (bp == NULL) { 397 if (bp == NULL) {
397 *bpp = NULL; 398 *bpp = NULL;
398 return 0; 399 return 0;
@@ -472,8 +473,8 @@ shutdown_abort:
472 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 473 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
473 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 474 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
474#endif 475#endif
475 ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) != 476 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
476 (XFS_B_STALE|XFS_B_DELWRI)); 477 (XBF_STALE|XBF_DELWRI));
477 478
478 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 479 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
479 xfs_buf_relse(bp); 480 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index d725428c9df6..b09904555d07 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -151,8 +151,8 @@ typedef enum {
151} xfs_btnum_t; 151} xfs_btnum_t;
152 152
153struct xfs_name { 153struct xfs_name {
154 const char *name; 154 const unsigned char *name;
155 int len; 155 int len;
156}; 156};
157 157
158#endif /* __XFS_TYPES_H__ */ 158#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 6f268756bf36..ddd2c5d1b854 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -256,7 +256,7 @@ xfs_setattr(
256 iattr->ia_size > ip->i_d.di_size) { 256 iattr->ia_size > ip->i_d.di_size) {
257 code = xfs_flush_pages(ip, 257 code = xfs_flush_pages(ip,
258 ip->i_d.di_size, iattr->ia_size, 258 ip->i_d.di_size, iattr->ia_size,
259 XFS_B_ASYNC, FI_NONE); 259 XBF_ASYNC, FI_NONE);
260 } 260 }
261 261
262 /* wait for all I/O to complete */ 262 /* wait for all I/O to complete */
@@ -597,7 +597,7 @@ xfs_fsync(
597{ 597{
598 xfs_trans_t *tp; 598 xfs_trans_t *tp;
599 int error = 0; 599 int error = 0;
600 int log_flushed = 0, changed = 1; 600 int log_flushed = 0;
601 601
602 xfs_itrace_entry(ip); 602 xfs_itrace_entry(ip);
603 603
@@ -627,19 +627,16 @@ xfs_fsync(
627 * disk yet, the inode will be still be pinned. If it is, 627 * disk yet, the inode will be still be pinned. If it is,
628 * force the log. 628 * force the log.
629 */ 629 */
630
631 xfs_iunlock(ip, XFS_ILOCK_SHARED); 630 xfs_iunlock(ip, XFS_ILOCK_SHARED);
632
633 if (xfs_ipincount(ip)) { 631 if (xfs_ipincount(ip)) {
634 error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, 632 if (ip->i_itemp->ili_last_lsn) {
635 XFS_LOG_FORCE | XFS_LOG_SYNC, 633 error = _xfs_log_force_lsn(ip->i_mount,
636 &log_flushed); 634 ip->i_itemp->ili_last_lsn,
637 } else { 635 XFS_LOG_SYNC, &log_flushed);
638 /* 636 } else {
639 * If the inode is not pinned and nothing has changed 637 error = _xfs_log_force(ip->i_mount,
640 * we don't need to flush the cache. 638 XFS_LOG_SYNC, &log_flushed);
641 */ 639 }
642 changed = 0;
643 } 640 }
644 } else { 641 } else {
645 /* 642 /*
@@ -674,7 +671,7 @@ xfs_fsync(
674 xfs_iunlock(ip, XFS_ILOCK_EXCL); 671 xfs_iunlock(ip, XFS_ILOCK_EXCL);
675 } 672 }
676 673
677 if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) { 674 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
678 /* 675 /*
679 * If the log write didn't issue an ordered tag we need 676 * If the log write didn't issue an ordered tag we need
680 * to flush the disk cache for the data device now. 677 * to flush the disk cache for the data device now.
@@ -1096,7 +1093,7 @@ xfs_release(
1096 */ 1093 */
1097 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 1094 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1098 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) 1095 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
1099 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); 1096 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
1100 } 1097 }
1101 1098
1102 if (ip->i_d.di_nlink != 0) { 1099 if (ip->i_d.di_nlink != 0) {
@@ -2199,7 +2196,8 @@ xfs_symlink(
2199 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { 2196 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
2200 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, 2197 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
2201 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, 2198 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2202 link_name->name, target_path, 0, 0, 0); 2199 link_name->name,
2200 (unsigned char *)target_path, 0, 0, 0);
2203 if (error) 2201 if (error)
2204 return error; 2202 return error;
2205 } 2203 }
@@ -2395,7 +2393,8 @@ std_return:
2395 dp, DM_RIGHT_NULL, 2393 dp, DM_RIGHT_NULL,
2396 error ? NULL : ip, 2394 error ? NULL : ip,
2397 DM_RIGHT_NULL, link_name->name, 2395 DM_RIGHT_NULL, link_name->name,
2398 target_path, 0, error, 0); 2396 (unsigned char *)target_path,
2397 0, error, 0);
2399 } 2398 }
2400 2399
2401 if (!error) 2400 if (!error)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 167a467403a5..774f40729ca1 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -43,11 +43,11 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd,
43int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 43int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
44 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 44 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
45 struct xfs_name *target_name, struct xfs_inode *target_ip); 45 struct xfs_name *target_name, struct xfs_inode *target_ip);
46int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, 46int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
47 int *valuelenp, int flags); 47 unsigned char *value, int *valuelenp, int flags);
48int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, 48int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
49 int valuelen, int flags); 49 unsigned char *value, int valuelen, int flags);
50int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); 50int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
51int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 51int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
52 int flags, struct attrlist_cursor_kern *cursor); 52 int flags, struct attrlist_cursor_kern *cursor);
53ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, 53ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,