author     Artem Bityutskiy <Artem.Bityutskiy@nokia.com>  2009-09-21 05:09:22 -0400
committer  Artem Bityutskiy <Artem.Bityutskiy@nokia.com>  2009-09-21 05:09:22 -0400
commit     7cce2f4cb7f5f641f78c8e3eea4e7b1b96cb71c0 (patch)
tree       b064d077928cf224660ab1e1841cdab2c9fd8b08 /fs
parent     e055f7e873d900925c222cf2d1ec955af4a9ca90 (diff)
parent     ebc79c4f8da0f92efa968e0328f32334a2ce80cf (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into linux-next

Conflicts:
	fs/ubifs/super.c

Merge the upstream tree in order to resolve a conflict with the per-bdi
writeback changes from the linux-2.6-block tree.
Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/v9fs.c | 21
-rw-r--r-- fs/9p/v9fs.h | 1
-rw-r--r-- fs/9p/vfs_inode.c | 126
-rw-r--r-- fs/9p/vfs_super.c | 39
-rw-r--r-- fs/Kconfig | 2
-rw-r--r-- fs/afs/file.c | 18
-rw-r--r-- fs/afs/write.c | 1
-rw-r--r-- fs/autofs4/expire.c | 2
-rw-r--r-- fs/binfmt_elf.c | 28
-rw-r--r-- fs/binfmt_flat.c | 17
-rw-r--r-- fs/block_dev.c | 40
-rw-r--r-- fs/btrfs/async-thread.c | 4
-rw-r--r-- fs/btrfs/ctree.c | 121
-rw-r--r-- fs/btrfs/ctree.h | 27
-rw-r--r-- fs/btrfs/disk-io.c | 17
-rw-r--r-- fs/btrfs/extent-tree.c | 533
-rw-r--r-- fs/btrfs/free-space-cache.c | 1058
-rw-r--r-- fs/btrfs/free-space-cache.h | 8
-rw-r--r-- fs/btrfs/inode.c | 26
-rw-r--r-- fs/btrfs/ordered-data.c | 1
-rw-r--r-- fs/btrfs/print-tree.c | 6
-rw-r--r-- fs/btrfs/relocation.c | 12
-rw-r--r-- fs/btrfs/transaction.c | 56
-rw-r--r-- fs/btrfs/transaction.h | 1
-rw-r--r-- fs/btrfs/tree-log.c | 2
-rw-r--r-- fs/btrfs/volumes.c | 50
-rw-r--r-- fs/btrfs/zlib.c | 6
-rw-r--r-- fs/buffer.c | 9
-rw-r--r-- fs/char_dev.c | 40
-rw-r--r-- fs/cifs/CHANGES | 10
-rw-r--r-- fs/cifs/README | 25
-rw-r--r-- fs/cifs/cifs_dfs_ref.c | 12
-rw-r--r-- fs/cifs/cifs_spnego.c | 2
-rw-r--r-- fs/cifs/cifs_unicode.c | 2
-rw-r--r-- fs/cifs/cifsacl.c | 4
-rw-r--r-- fs/cifs/cifsencrypt.c | 1
-rw-r--r-- fs/cifs/cifsfs.c | 26
-rw-r--r-- fs/cifs/cifsfs.h | 2
-rw-r--r-- fs/cifs/cifsglob.h | 21
-rw-r--r-- fs/cifs/cifssmb.c | 316
-rw-r--r-- fs/cifs/connect.c | 112
-rw-r--r-- fs/cifs/dir.c | 2
-rw-r--r-- fs/cifs/file.c | 43
-rw-r--r-- fs/cifs/inode.c | 15
-rw-r--r-- fs/cifs/transport.c | 17
-rw-r--r-- fs/compat.c | 17
-rw-r--r-- fs/compat_ioctl.c | 1
-rw-r--r-- fs/configfs/inode.c | 1
-rw-r--r-- fs/dcache.c | 1
-rw-r--r-- fs/dlm/lowcomms.c | 26
-rw-r--r-- fs/dlm/netlink.c | 2
-rw-r--r-- fs/ecryptfs/keystore.c | 13
-rw-r--r-- fs/exec.c | 67
-rw-r--r-- fs/ext2/acl.c | 8
-rw-r--r-- fs/ext2/acl.h | 4
-rw-r--r-- fs/ext2/file.c | 2
-rw-r--r-- fs/ext2/inode.c | 2
-rw-r--r-- fs/ext2/namei.c | 8
-rw-r--r-- fs/ext3/Kconfig | 32
-rw-r--r-- fs/ext3/acl.c | 8
-rw-r--r-- fs/ext3/acl.h | 4
-rw-r--r-- fs/ext3/dir.c | 3
-rw-r--r-- fs/ext3/file.c | 63
-rw-r--r-- fs/ext3/fsync.c | 12
-rw-r--r-- fs/ext3/inode.c | 60
-rw-r--r-- fs/ext3/namei.c | 4
-rw-r--r-- fs/ext3/super.c | 40
-rw-r--r-- fs/ext4/Kconfig | 11
-rw-r--r-- fs/ext4/acl.c | 8
-rw-r--r-- fs/ext4/acl.h | 4
-rw-r--r-- fs/ext4/balloc.c | 2
-rw-r--r-- fs/ext4/ext4.h | 91
-rw-r--r-- fs/ext4/ext4_extents.h | 4
-rw-r--r-- fs/ext4/ext4_jbd2.c | 9
-rw-r--r-- fs/ext4/extents.c | 112
-rw-r--r-- fs/ext4/file.c | 55
-rw-r--r-- fs/ext4/fsync.c | 13
-rw-r--r-- fs/ext4/ialloc.c | 2
-rw-r--r-- fs/ext4/inode.c | 150
-rw-r--r-- fs/ext4/ioctl.c | 7
-rw-r--r-- fs/ext4/mballoc.c | 429
-rw-r--r-- fs/ext4/mballoc.h | 22
-rw-r--r-- fs/ext4/migrate.c | 22
-rw-r--r-- fs/ext4/move_extent.c | 334
-rw-r--r-- fs/ext4/namei.c | 26
-rw-r--r-- fs/ext4/resize.c | 7
-rw-r--r-- fs/ext4/super.c | 155
-rw-r--r-- fs/ext4/xattr.c | 15
-rw-r--r-- fs/fat/file.c | 22
-rw-r--r-- fs/fat/misc.c | 4
-rw-r--r-- fs/fs-writeback.c | 1104
-rw-r--r-- fs/fuse/control.c | 138
-rw-r--r-- fs/fuse/dev.c | 10
-rw-r--r-- fs/fuse/fuse_i.h | 18
-rw-r--r-- fs/fuse/inode.c | 83
-rw-r--r-- fs/gfs2/Makefile | 2
-rw-r--r-- fs/gfs2/acl.c | 106
-rw-r--r-- fs/gfs2/aops.c | 39
-rw-r--r-- fs/gfs2/dentry.c | 18
-rw-r--r-- fs/gfs2/eaops.c | 157
-rw-r--r-- fs/gfs2/eaops.h | 30
-rw-r--r-- fs/gfs2/export.c | 36
-rw-r--r-- fs/gfs2/file.c | 1
-rw-r--r-- fs/gfs2/glock.c | 138
-rw-r--r-- fs/gfs2/glock.h | 3
-rw-r--r-- fs/gfs2/glops.c | 21
-rw-r--r-- fs/gfs2/incore.h | 17
-rw-r--r-- fs/gfs2/inode.c | 159
-rw-r--r-- fs/gfs2/ops_fstype.c | 66
-rw-r--r-- fs/gfs2/ops_inode.c | 82
-rw-r--r-- fs/gfs2/rgrp.c | 111
-rw-r--r-- fs/gfs2/rgrp.h | 6
-rw-r--r-- fs/gfs2/super.c | 86
-rw-r--r-- fs/gfs2/super.h | 9
-rw-r--r-- fs/gfs2/sys.c | 49
-rw-r--r-- fs/gfs2/util.c | 41
-rw-r--r-- fs/gfs2/xattr.c (renamed from fs/gfs2/eattr.c) | 425
-rw-r--r-- fs/gfs2/xattr.h (renamed from fs/gfs2/eattr.h) | 54
-rw-r--r-- fs/hugetlbfs/inode.c | 21
-rw-r--r-- fs/inode.c | 44
-rw-r--r-- fs/jbd/checkpoint.c | 6
-rw-r--r-- fs/jbd/commit.c | 2
-rw-r--r-- fs/jbd/journal.c | 54
-rw-r--r-- fs/jbd/recovery.c | 18
-rw-r--r-- fs/jbd/revoke.c | 16
-rw-r--r-- fs/jbd/transaction.c | 77
-rw-r--r-- fs/jbd2/commit.c | 12
-rw-r--r-- fs/jbd2/journal.c | 6
-rw-r--r-- fs/jbd2/transaction.c | 7
-rw-r--r-- fs/jffs2/acl.c | 7
-rw-r--r-- fs/jffs2/acl.h | 4
-rw-r--r-- fs/jffs2/dir.c | 2
-rw-r--r-- fs/jffs2/file.c | 4
-rw-r--r-- fs/jffs2/symlink.c | 2
-rw-r--r-- fs/jffs2/wbuf.c | 10
-rw-r--r-- fs/jfs/acl.c | 11
-rw-r--r-- fs/jfs/file.c | 2
-rw-r--r-- fs/jfs/jfs_acl.h | 2
-rw-r--r-- fs/jfs/namei.c | 2
-rw-r--r-- fs/libfs.c | 2
-rw-r--r-- fs/lockd/host.c | 14
-rw-r--r-- fs/lockd/mon.c | 44
-rw-r--r-- fs/locks.c | 4
-rw-r--r-- fs/namei.c | 110
-rw-r--r-- fs/namespace.c | 3
-rw-r--r-- fs/nfs/Makefile | 3
-rw-r--r-- fs/nfs/cache_lib.c | 140
-rw-r--r-- fs/nfs/cache_lib.h | 27
-rw-r--r-- fs/nfs/callback.c | 26
-rw-r--r-- fs/nfs/client.c | 16
-rw-r--r-- fs/nfs/direct.c | 23
-rw-r--r-- fs/nfs/dns_resolve.c | 335
-rw-r--r-- fs/nfs/dns_resolve.h | 14
-rw-r--r-- fs/nfs/file.c | 49
-rw-r--r-- fs/nfs/idmap.c | 6
-rw-r--r-- fs/nfs/inode.c | 100
-rw-r--r-- fs/nfs/internal.h | 39
-rw-r--r-- fs/nfs/mount_clnt.c | 83
-rw-r--r-- fs/nfs/nfs3proc.c | 1
-rw-r--r-- fs/nfs/nfs4namespace.c | 24
-rw-r--r-- fs/nfs/nfs4proc.c | 40
-rw-r--r-- fs/nfs/nfs4state.c | 4
-rw-r--r-- fs/nfs/nfs4xdr.c | 1460
-rw-r--r-- fs/nfs/read.c | 6
-rw-r--r-- fs/nfs/super.c | 453
-rw-r--r-- fs/nfs/write.c | 98
-rw-r--r-- fs/nfsd/auth.c | 4
-rw-r--r-- fs/nfsd/export.c | 14
-rw-r--r-- fs/nfsd/nfs4idmap.c | 20
-rw-r--r-- fs/nfsd/nfsctl.c | 21
-rw-r--r-- fs/nfsd/nfssvc.c | 2
-rw-r--r-- fs/nfsd/vfs.c | 3
-rw-r--r-- fs/nilfs2/Kconfig | 2
-rw-r--r-- fs/nilfs2/bmap.c | 151
-rw-r--r-- fs/nilfs2/bmap.h | 76
-rw-r--r-- fs/nilfs2/btnode.c | 2
-rw-r--r-- fs/nilfs2/btree.c | 625
-rw-r--r-- fs/nilfs2/cpfile.c | 11
-rw-r--r-- fs/nilfs2/cpfile.h | 2
-rw-r--r-- fs/nilfs2/dat.c | 42
-rw-r--r-- fs/nilfs2/dat.h | 8
-rw-r--r-- fs/nilfs2/direct.c | 161
-rw-r--r-- fs/nilfs2/ifile.h | 1
-rw-r--r-- fs/nilfs2/inode.c | 3
-rw-r--r-- fs/nilfs2/ioctl.c | 26
-rw-r--r-- fs/nilfs2/mdt.c | 42
-rw-r--r-- fs/nilfs2/mdt.h | 3
-rw-r--r-- fs/nilfs2/recovery.c | 3
-rw-r--r-- fs/nilfs2/segbuf.c | 4
-rw-r--r-- fs/nilfs2/segment.c | 23
-rw-r--r-- fs/nilfs2/sufile.h | 1
-rw-r--r-- fs/nilfs2/super.c | 102
-rw-r--r-- fs/nilfs2/the_nilfs.c | 19
-rw-r--r-- fs/nilfs2/the_nilfs.h | 45
-rw-r--r-- fs/notify/Kconfig | 12
-rw-r--r-- fs/notify/dnotify/Kconfig | 2
-rw-r--r-- fs/notify/fsnotify.c | 4
-rw-r--r-- fs/notify/inotify/Kconfig | 2
-rw-r--r-- fs/notify/inotify/inotify_fsnotify.c | 46
-rw-r--r-- fs/notify/inotify/inotify_user.c | 271
-rw-r--r-- fs/notify/notification.c | 30
-rw-r--r-- fs/ntfs/file.c | 16
-rw-r--r-- fs/ntfs/mft.c | 13
-rw-r--r-- fs/ocfs2/alloc.c | 49
-rw-r--r-- fs/ocfs2/aops.c | 69
-rw-r--r-- fs/ocfs2/dcache.c | 46
-rw-r--r-- fs/ocfs2/dcache.h | 3
-rw-r--r-- fs/ocfs2/dlm/dlmast.c | 1
-rw-r--r-- fs/ocfs2/dlm/dlmfs.c | 1
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r-- fs/ocfs2/dlm/dlmunlock.c | 4
-rw-r--r-- fs/ocfs2/file.c | 54
-rw-r--r-- fs/ocfs2/journal.c | 8
-rw-r--r-- fs/ocfs2/journal.h | 19
-rw-r--r-- fs/ocfs2/ocfs2.h | 22
-rw-r--r-- fs/ocfs2/ocfs2_lockid.h | 1
-rw-r--r-- fs/ocfs2/quota.h | 1
-rw-r--r-- fs/ocfs2/quota_global.c | 144
-rw-r--r-- fs/ocfs2/quota_local.c | 110
-rw-r--r-- fs/ocfs2/stack_o2cb.c | 3
-rw-r--r-- fs/ocfs2/super.c | 34
-rw-r--r-- fs/ocfs2/xattr.c | 3
-rw-r--r-- fs/open.c | 12
-rw-r--r-- fs/partitions/check.c | 14
-rw-r--r-- fs/proc/base.c | 46
-rw-r--r-- fs/proc/task_mmu.c | 1
-rw-r--r-- fs/proc/task_nommu.c | 1
-rw-r--r-- fs/quota/dquot.c | 7
-rw-r--r-- fs/ramfs/file-nommu.c | 1
-rw-r--r-- fs/ramfs/inode.c | 1
-rw-r--r-- fs/select.c | 1
-rw-r--r-- fs/splice.c | 30
-rw-r--r-- fs/super.c | 11
-rw-r--r-- fs/sync.c | 85
-rw-r--r-- fs/sysfs/dir.c | 3
-rw-r--r-- fs/sysfs/inode.c | 135
-rw-r--r-- fs/sysfs/symlink.c | 2
-rw-r--r-- fs/sysfs/sysfs.h | 12
-rw-r--r-- fs/ubifs/budget.c | 32
-rw-r--r-- fs/ubifs/super.c | 2
-rw-r--r-- fs/udf/directory.c | 86
-rw-r--r-- fs/udf/file.c | 2
-rw-r--r-- fs/udf/inode.c | 19
-rw-r--r-- fs/udf/lowlevel.c | 4
-rw-r--r-- fs/udf/namei.c | 1
-rw-r--r-- fs/udf/super.c | 12
-rw-r--r-- fs/xattr.c | 55
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c | 9
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c | 2
-rw-r--r-- fs/xfs/linux-2.6/xfs_file.c | 19
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl32.c | 2
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.c | 21
-rw-r--r-- fs/xfs/linux-2.6/xfs_lrw.c | 7
-rw-r--r-- fs/xfs/linux-2.6/xfs_stats.c | 51
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c | 24
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c | 28
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.h | 2
-rw-r--r-- fs/xfs/quota/xfs_qm_stats.c | 78
-rw-r--r-- fs/xfs/xfs_ag.h | 9
-rw-r--r-- fs/xfs/xfs_attr.c | 8
-rw-r--r-- fs/xfs/xfs_bmap.c | 4
-rw-r--r-- fs/xfs/xfs_bmap.h | 11
-rw-r--r-- fs/xfs/xfs_bmap_btree.c | 20
-rw-r--r-- fs/xfs/xfs_bmap_btree.h | 1
-rw-r--r-- fs/xfs/xfs_btree.c | 46
-rw-r--r-- fs/xfs/xfs_btree.h | 15
-rw-r--r-- fs/xfs/xfs_da_btree.c | 6
-rw-r--r-- fs/xfs/xfs_dir2.c | 2
-rw-r--r-- fs/xfs/xfs_fsops.c | 20
-rw-r--r-- fs/xfs/xfs_ialloc.c | 805
-rw-r--r-- fs/xfs/xfs_ialloc.h | 18
-rw-r--r-- fs/xfs/xfs_iget.c | 280
-rw-r--r-- fs/xfs/xfs_inode.c | 18
-rw-r--r-- fs/xfs/xfs_inode.h | 25
-rw-r--r-- fs/xfs/xfs_inode_item.c | 10
-rw-r--r-- fs/xfs/xfs_inode_item.h | 2
-rw-r--r-- fs/xfs/xfs_inum.h | 1
-rw-r--r-- fs/xfs/xfs_itable.c | 98
-rw-r--r-- fs/xfs/xfs_itable.h | 5
-rw-r--r-- fs/xfs/xfs_log.c | 2
-rw-r--r-- fs/xfs/xfs_log_priv.h | 2
-rw-r--r-- fs/xfs/xfs_log_recover.c | 2
-rw-r--r-- fs/xfs/xfs_mount.c | 2
-rw-r--r-- fs/xfs/xfs_mount.h | 3
-rw-r--r-- fs/xfs/xfs_mru_cache.c | 29
-rw-r--r-- fs/xfs/xfs_mru_cache.h | 1
-rw-r--r-- fs/xfs/xfs_rw.c | 84
-rw-r--r-- fs/xfs/xfs_rw.h | 7
-rw-r--r-- fs/xfs/xfs_trans.h | 2
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 4
-rw-r--r-- fs/xfs/xfs_trans_inode.c | 86
-rw-r--r-- fs/xfs/xfs_vnodeops.c | 21
292 files changed, 9459 insertions, 6600 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 332b5ff02fec..f7003cfac63d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -76,7 +76,7 @@ static const match_table_t tokens = {
  * Return 0 upon success, -ERRNO upon failure.
  */
 
-static int v9fs_parse_options(struct v9fs_session_info *v9ses)
+static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 {
 	char *options;
 	substring_t args[MAX_OPT_ARGS];
@@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
 	v9ses->debug = 0;
 	v9ses->cache = 0;
 
-	if (!v9ses->options)
+	if (!opts)
 		return 0;
 
-	options = kstrdup(v9ses->options, GFP_KERNEL);
+	options = kstrdup(opts, GFP_KERNEL);
 	if (!options) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
 			   "failed to allocate copy of option string\n");
@@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	v9ses->uid = ~0;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
-	if (data) {
-		v9ses->options = kstrdup(data, GFP_KERNEL);
-		if (!v9ses->options) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-			       "failed to allocate copy of option string\n");
-			retval = -ENOMEM;
-			goto error;
-		}
-	}
 
-	rc = v9fs_parse_options(v9ses);
+	rc = v9fs_parse_options(v9ses, data);
 	if (rc < 0) {
 		retval = rc;
 		goto error;
 	}
 
-	v9ses->clnt = p9_client_create(dev_name, v9ses->options);
-
+	v9ses->clnt = p9_client_create(dev_name, data);
 	if (IS_ERR(v9ses->clnt)) {
 		retval = PTR_ERR(v9ses->clnt);
 		v9ses->clnt = NULL;
@@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
-	kfree(v9ses->options);
 }
 
 /**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a7d567192998..38762bf102a9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -85,7 +85,6 @@ struct v9fs_session_info {
 	unsigned int afid;
 	unsigned int cache;
 
-	char *options;	/* copy of mount options */
 	char *uname;	/* user name to mount as */
 	char *aname;	/* name of remote hierarchy being mounted */
 	unsigned int maxdata;	/* max data for client interface */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 81f8bbf12f9f..06a223d50a81 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended)
 
 /**
  * v9fs_blank_wstat - helper function to setup a 9P stat structure
- * @v9ses: 9P session info (for determining extended mode)
  * @wstat: structure to initialize
  *
  */
@@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 {
+	int err;
 	struct inode *inode;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
 
 	inode = new_inode(sb);
-	if (inode) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
-		inode->i_rdev = 0;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_mapping->a_ops = &v9fs_addr_operations;
-
-		switch (mode & S_IFMT) {
-		case S_IFIFO:
-		case S_IFBLK:
-		case S_IFCHR:
-		case S_IFSOCK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "special files without extended mode\n");
-				return ERR_PTR(-EINVAL);
-			}
-			init_special_inode(inode, inode->i_mode,
-					   inode->i_rdev);
-			break;
-		case S_IFREG:
-			inode->i_op = &v9fs_file_inode_operations;
-			inode->i_fop = &v9fs_file_operations;
-			break;
-		case S_IFLNK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "extended modes used w/o 9P2000.u\n");
-				return ERR_PTR(-EINVAL);
-			}
-			inode->i_op = &v9fs_symlink_inode_operations;
-			break;
-		case S_IFDIR:
-			inc_nlink(inode);
-			if (v9fs_extended(v9ses))
-				inode->i_op = &v9fs_dir_inode_operations_ext;
-			else
-				inode->i_op = &v9fs_dir_inode_operations;
-			inode->i_fop = &v9fs_dir_operations;
-			break;
-		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "BAD mode 0x%x S_IFMT 0x%x\n",
-				   mode, mode & S_IFMT);
-			return ERR_PTR(-EINVAL);
-		}
-	} else {
+	if (!inode) {
 		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
 		return ERR_PTR(-ENOMEM);
 	}
+
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_blocks = 0;
+	inode->i_rdev = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mapping->a_ops = &v9fs_addr_operations;
+
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFSOCK:
+		if (!v9fs_extended(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "special files without extended mode\n");
+			err = -EINVAL;
+			goto error;
+		}
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	case S_IFREG:
+		inode->i_op = &v9fs_file_inode_operations;
+		inode->i_fop = &v9fs_file_operations;
+		break;
+	case S_IFLNK:
+		if (!v9fs_extended(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "extended modes used w/o 9P2000.u\n");
+			err = -EINVAL;
+			goto error;
+		}
+		inode->i_op = &v9fs_symlink_inode_operations;
+		break;
+	case S_IFDIR:
+		inc_nlink(inode);
+		if (v9fs_extended(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_ext;
+		else
+			inode->i_op = &v9fs_dir_inode_operations;
+		inode->i_fop = &v9fs_dir_operations;
+		break;
+	default:
+		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+			   mode, mode & S_IFMT);
+		err = -EINVAL;
+		goto error;
+	}
+
 	return inode;
+
+error:
+	iput(inode);
+	return ERR_PTR(err);
 }
 
 /*
@@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 
 	ret = NULL;
 	st = p9_client_stat(fid);
-	if (IS_ERR(st)) {
-		err = PTR_ERR(st);
-		st = NULL;
-		goto error;
-	}
+	if (IS_ERR(st))
+		return ERR_CAST(st);
 
 	umode = p9mode2unixmode(v9ses, st->mode);
 	ret = v9fs_get_inode(sb, umode);
 	if (IS_ERR(ret)) {
 		err = PTR_ERR(ret);
-		ret = NULL;
 		goto error;
 	}
 
 	v9fs_stat2inode(st, ret, sb);
 	ret->i_ino = v9fs_qid2ino(&st->qid);
+	p9stat_free(st);
 	kfree(st);
 	return ret;
 
 error:
+	p9stat_free(st);
 	kfree(st);
-	if (ret)
-		iput(ret);
-
 	return ERR_PTR(err);
 }
 
@@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file)
  * @v9ses: session information
  * @dir: directory that dentry is being created in
  * @dentry: dentry that is being created
+ * @extension: 9p2000.u extension string to support devices, etc.
  * @perm: create permissions
  * @mode: open mode
- * @extension: 9p2000.u extension string to support devices, etc.
 *
 */
 static struct p9_fid *
@@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	dentry->d_op = &v9fs_dentry_operations;
 
 	d_instantiate(dentry, inode);
-	v9fs_fid_add(dentry, fid);
+	err = v9fs_fid_add(dentry, fid);
+	if (err < 0)
+		goto error;
+
 	return ofid;
 
 error:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 38d695d66a0b..8961f1a8f668 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
 
 static void
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags)
+		int flags, void *data)
 {
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 		      MS_NOATIME;
+
+	save_mount_options(sb, data);
 }
 
 /**
@@ -113,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	struct v9fs_session_info *v9ses = NULL;
 	struct p9_wstat *st = NULL;
 	int mode = S_IRWXUGO | S_ISVTX;
-	uid_t uid = current_fsuid();
-	gid_t gid = current_fsgid();
 	struct p9_fid *fid;
 	int retval = 0;
 
 	P9_DPRINTK(P9_DEBUG_VFS, " \n");
 
-	st = NULL;
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
 		return -ENOMEM;
@@ -142,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		retval = PTR_ERR(sb);
 		goto free_stat;
 	}
-	v9fs_fill_super(sb, v9ses, flags);
+	v9fs_fill_super(sb, v9ses, flags, data);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
@@ -150,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		goto release_sb;
 	}
 
-	inode->i_uid = uid;
-	inode->i_gid = gid;
-
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
@@ -173,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	simple_set_mnt(mnt, sb);
 	return 0;
 
-release_sb:
-	deactivate_locked_super(sb);
-
 free_stat:
+	p9stat_free(st);
 	kfree(st);
 
 clunk_fid:
@@ -185,7 +179,12 @@ clunk_fid:
 close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	return retval;
 
+release_sb:
+	p9stat_free(st);
+	kfree(st);
+	deactivate_locked_super(sb);
 	return retval;
 }
 
@@ -207,24 +206,10 @@ static void v9fs_kill_super(struct super_block *s)
 
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	s->s_fs_info = NULL;
 	P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
 }
 
-/**
- * v9fs_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- *
- */
-
-static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
-
-	seq_printf(m, "%s", v9ses->options);
-	return 0;
-}
-
 static void
 v9fs_umount_begin(struct super_block *sb)
 {
@@ -237,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb)
 static const struct super_operations v9fs_super_ops = {
 	.statfs = simple_statfs,
 	.clear_inode = v9fs_clear_inode,
-	.show_options = v9fs_show_options,
+	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
 
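Note: the net effect of the v9fs hunks above is to drop the driver-private copy of the mount option string in favor of the generic VFS helpers. A minimal sketch of that pattern, assuming the 2.6.31-era API and hypothetical examplefs_* names (save_mount_options() stashes the option string on the superblock; generic_show_options() emits it for /proc/mounts):

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* keep one generic copy of the options for /proc/mounts */
	save_mount_options(sb, data);
	return 0;
}

static const struct super_operations examplefs_super_ops = {
	.statfs		= simple_statfs,
	/* replaces a hand-rolled helper such as v9fs_show_options() */
	.show_options	= generic_show_options,
};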
diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d93..455aa207e67e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
@@ -186,7 +187,6 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/nilfs2/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0149dab365e7..681c2a7b013f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page)
 
 	inode = page->mapping->host;
 
-	ASSERT(file != NULL);
-	key = file->private_data;
-	ASSERT(key != NULL);
+	if (file) {
+		key = file->private_data;
+		ASSERT(key != NULL);
+	} else {
+		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+			goto error_nokey;
+		}
+	}
 
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
 
@@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page)
 		unlock_page(page);
 	}
 
+	if (!file)
+		key_put(key);
 	_leave(" = 0");
 	return 0;
 
error:
 	SetPageError(page);
 	unlock_page(page);
+	if (!file)
+		key_put(key);
+error_nokey:
 	_leave(" = %d", ret);
 	return ret;
 }
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 		.bdi		= mapping->backing_dev_info,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_to_write	= LONG_MAX,
-		.for_writepages	= 1,
 		.range_cyclic	= 1,
 	};
 	int ret;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index aa39ae83f019..3da18d453488 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	}
 
 	/* Update the expiry counter if fs is busy */
-	if (!may_umount_tree(mnt)) {
+	if (!may_umount_tree(path.mnt)) {
 		struct autofs_info *ino = autofs4_dentry_ino(top);
 		ino->last_used = jiffies;
 		goto done;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..7c1e65d54872 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		}
 	}
 
-	/*
-	 * Now fill out the bss section.  First pad the last page up
-	 * to the page boundary, and then perform a mmap to make sure
-	 * that there are zero-mapped pages up to and including the
-	 * last bss page.
-	 */
-	if (padzero(elf_bss)) {
-		error = -EFAULT;
-		goto out_close;
-	}
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section.  First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out_close;
+		}
 
-	/* What we have mapped so far */
-	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
-	/* Map the last of the bss segment */
-	if (last_bss > elf_bss) {
+		/* Map the last of the bss segment */
 		down_write(&current->mm->mmap_sem);
 		error = do_brk(elf_bss, last_bss - elf_bss);
 		up_write(&current->mm->mmap_sem);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 697f6b5f1313..e92f229e3c6e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 	if (IS_ERR(bprm.file))
 		return res;
 
+	bprm.cred = prepare_exec_creds();
+	res = -ENOMEM;
+	if (!bprm.cred)
+		goto out;
+
 	res = prepare_binprm(&bprm);
 
 	if (res <= (unsigned long)-4096)
 		res = load_flat_file(&bprm, libs, id, NULL);
-	if (bprm.file) {
-		allow_write_access(bprm.file);
-		fput(bprm.file);
-		bprm.file = NULL;
-	}
+
+	abort_creds(bprm.cred);
+
+out:
+	allow_write_access(bprm.file);
+	fput(bprm.file);
+
 	return(res);
 }
 
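Note: the binfmt_flat fix above hinges on a pairing rule: every successful prepare_exec_creds() must be balanced by abort_creds() when the credentials are never committed, with all exits funnelled through a single fput() path. A minimal sketch of that shape (hypothetical helper name, same-era cred API):

static int example_with_temp_creds(struct linux_binprm *bprm)
{
	int res = -ENOMEM;

	bprm->cred = prepare_exec_creds();	/* allocated, never committed */
	if (!bprm->cred)
		goto out;			/* nothing to abort yet */

	res = 0;				/* ... work using bprm->cred ... */
	abort_creds(bprm->cred);		/* balance the allocation */
out:
	allow_write_access(bprm->file);		/* runs on every exit path */
	fput(bprm->file);
	return res;
}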
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3a6d4fb2a329..71e7e03ac343 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
 {
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	bdi->bdev.bd_inode_backing_dev_info = NULL;
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -564,6 +563,16 @@ struct block_device *bdget(dev_t dev)
 
 EXPORT_SYMBOL(bdget);
 
+/**
+ * bdgrab -- Grab a reference to an already referenced block device
+ * @bdev:	Block device to grab a reference to.
+ */
+struct block_device *bdgrab(struct block_device *bdev)
+{
+	atomic_inc(&bdev->bd_inode->i_count);
+	return bdev;
+}
+
 long nr_blockdev_pages(void)
 {
 	struct block_device *bdev;
@@ -1395,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 }
 
 /*
+ * Write data to the block device.  Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	if (ret > 0 || ret == -EIOCBQUEUED) {
+		ssize_t err;
+
+		err = generic_write_sync(file, pos, ret);
+		if (err < 0 && ret > 0)
+			ret = err;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_write);
+
+/*
 * Try to release a page associated with block device when the system
 * is under memory pressure.
 */
@@ -1426,7 +1462,7 @@ const struct file_operations def_blk_fops = {
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= generic_file_aio_write_nolock,
+	.aio_write	= blkdev_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= block_fsync,
 	.unlocked_ioctl	= block_ioctl,
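Note: the new blkdev_aio_write() above is an instance of the write-then-sync pattern: perform the write without taking i_mutex, then let generic_write_sync() honor O_SYNC/O_DSYNC, surfacing a sync error only when data was actually written (or queued). A hedged sketch of the same shape for a hypothetical file, using the same-era helpers:

static ssize_t example_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
{
	ssize_t ret;

	/* plain write; exclusion is the caller's problem, not i_mutex */
	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	if (ret > 0 || ret == -EIOCBQUEUED) {
		/* flush for O_SYNC/O_DSYNC semantics */
		ssize_t err = generic_write_sync(iocb->ki_filp, pos, ret);
		if (err < 0 && ret > 0)
			ret = err;	/* report the sync failure */
	}
	return ret;
}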
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 6e4f6c50a120..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
 	 * list
 	 */
 	if (worker->idle) {
-		spin_lock_irqsave(&worker->workers->lock, flags);
+		spin_lock(&worker->workers->lock);
 		worker->idle = 0;
 		list_move_tail(&worker->worker_list,
 			       &worker->workers->worker_list);
-		spin_unlock_irqrestore(&worker->workers->lock, flags);
+		spin_unlock(&worker->workers->lock);
 	}
 	if (!worker->working) {
 		wake = 1;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e91..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 
 	btrfs_disk_key_to_cpu(&k1, disk);
 
-	if (k1.objectid > k2->objectid)
-		return 1;
-	if (k1.objectid < k2->objectid)
-		return -1;
-	if (k1.type > k2->type)
-		return 1;
-	if (k1.type < k2->type)
-		return -1;
-	if (k1.offset > k2->offset)
-		return 1;
-	if (k1.offset < k2->offset)
-		return -1;
-	return 0;
+	return btrfs_comp_cpu_keys(&k1, k2);
 }
 
 /*
@@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	if (btrfs_header_nritems(mid) > 2)
-		return 0;
-
 	if (btrfs_header_nritems(mid) < 2)
 		err_on_enospc = 1;
 
@@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct extent_buffer *b;
 	int slot;
 	int ret;
+	int err;
 	int level;
 	int lowest_unlock = 1;
 	u8 lowest_level = 0;
@@ -1737,8 +1723,6 @@ again:
 		p->locks[level] = 1;
 
 		if (cow) {
-			int wret;
-
 			/*
 			 * if we don't really need to cow this block
 			 * then we don't want to set the path blocking,
@@ -1749,12 +1733,12 @@ again:
 
 			btrfs_set_path_blocking(p);
 
-			wret = btrfs_cow_block(trans, root, b,
-					       p->nodes[level + 1],
-					       p->slots[level + 1], &b);
-			if (wret) {
+			err = btrfs_cow_block(trans, root, b,
+					      p->nodes[level + 1],
+					      p->slots[level + 1], &b);
+			if (err) {
 				free_extent_buffer(b);
-				ret = wret;
+				ret = err;
 				goto done;
 			}
 		}
@@ -1793,41 +1777,45 @@ cow_done:
 		ret = bin_search(b, key, level, &slot);
 
 		if (level != 0) {
-			if (ret && slot > 0)
+			int dec = 0;
+			if (ret && slot > 0) {
+				dec = 1;
 				slot -= 1;
+			}
 			p->slots[level] = slot;
-			ret = setup_nodes_for_search(trans, root, p, b, level,
+			err = setup_nodes_for_search(trans, root, p, b, level,
 						     ins_len);
-			if (ret == -EAGAIN)
+			if (err == -EAGAIN)
 				goto again;
-			else if (ret)
+			if (err) {
+				ret = err;
 				goto done;
+			}
 			b = p->nodes[level];
 			slot = p->slots[level];
 
 			unlock_up(p, level, lowest_unlock);
 
-			/* this is only true while dropping a snapshot */
 			if (level == lowest_level) {
-				ret = 0;
+				if (dec)
+					p->slots[level]++;
 				goto done;
 			}
 
-			ret = read_block_for_search(trans, root, p,
+			err = read_block_for_search(trans, root, p,
 						    &b, level, slot, key);
-			if (ret == -EAGAIN)
+			if (err == -EAGAIN)
 				goto again;
-
-			if (ret == -EIO)
+			if (err) {
+				ret = err;
 				goto done;
+			}
 
 			if (!p->skip_locking) {
-				int lret;
-
 				btrfs_clear_path_blocking(p, NULL);
-				lret = btrfs_try_spin_lock(b);
+				err = btrfs_try_spin_lock(b);
 
-				if (!lret) {
+				if (!err) {
 					btrfs_set_path_blocking(p);
 					btrfs_tree_lock(b);
 					btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1825,14 @@ cow_done:
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
-				int sret;
-
 				btrfs_set_path_blocking(p);
-				sret = split_leaf(trans, root, key,
-						  p, ins_len, ret == 0);
+				err = split_leaf(trans, root, key,
+						 p, ins_len, ret == 0);
 				btrfs_clear_path_blocking(p, NULL);
 
-				BUG_ON(sret > 0);
-				if (sret) {
-					ret = sret;
+				BUG_ON(err > 0);
+				if (err) {
+					ret = err;
 					goto done;
 				}
 			}
@@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	}
 
 	/* delete the leaf if it is mostly empty */
-	if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
+	if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
 		/* push_leaf_left fixes the path.
 		 * make sure the path still points to our leaf
 		 * for possible call to del_ptr below
@@ -4042,10 +4028,9 @@ out:
 * calling this function.
 */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int lowest_level,
+			struct btrfs_key *key, int level,
 			int cache_only, u64 min_trans)
 {
-	int level = lowest_level;
 	int slot;
 	struct extent_buffer *c;
 
@@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 		c = path->nodes[level];
next:
 		if (slot >= btrfs_header_nritems(c)) {
-			level++;
-			if (level == BTRFS_MAX_LEVEL)
+			int ret;
+			int orig_lowest;
+			struct btrfs_key cur_key;
+			if (level + 1 >= BTRFS_MAX_LEVEL ||
+			    !path->nodes[level + 1])
 				return 1;
-			continue;
+
+			if (path->locks[level + 1]) {
+				level++;
+				continue;
+			}
+
+			slot = btrfs_header_nritems(c) - 1;
+			if (level == 0)
+				btrfs_item_key_to_cpu(c, &cur_key, slot);
+			else
+				btrfs_node_key_to_cpu(c, &cur_key, slot);
+
+			orig_lowest = path->lowest_level;
+			btrfs_release_path(root, path);
+			path->lowest_level = level;
+			ret = btrfs_search_slot(NULL, root, &cur_key, path,
+						0, 0);
+			path->lowest_level = orig_lowest;
+			if (ret < 0)
+				return ret;
+
+			c = path->nodes[level];
+			slot = path->slots[level];
+			if (ret == 0)
+				slot++;
+			goto next;
 		}
+
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
@@ -4146,7 +4160,8 @@ again:
 	 * advance the path if there are now more items available.
 	 */
 	if (nritems > 0 && path->slots[0] < nritems - 1) {
-		path->slots[0]++;
+		if (ret == 0)
+			path->slots[0]++;
 		ret = 0;
 		goto done;
 	}
@@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
 		path->slots[0]--;
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.type == type)
-			return 0;
 		if (found_key.objectid < min_objectid)
 			break;
+		if (found_key.type == type)
+			return 0;
 		if (found_key.objectid == min_objectid &&
 		    found_key.type < type)
 			break;
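Note: the comp_keys() hunk above delegates to btrfs_comp_cpu_keys(). Judging purely from the open-coded comparison it deletes, btrfs keys order by objectid, then type, then offset; a sketch of that comparison (an assumption reconstructed from the removed lines, not code from this patch):

static int comp_cpu_keys_sketch(const struct btrfs_key *k1,
				const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}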
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98a873838717..837435ce84ca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -481,7 +481,7 @@ struct btrfs_shared_data_ref {
 
 struct btrfs_extent_inline_ref {
 	u8 type;
-	u64 offset;
+	__le64 offset;
 } __attribute__ ((__packed__));
 
 /* old style backrefs item */
@@ -689,6 +689,7 @@ struct btrfs_space_info {
 	struct list_head block_groups;
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
+	atomic_t caching_threads;
 };
 
 /*
@@ -707,6 +708,9 @@ struct btrfs_free_cluster {
 	/* first extent starting offset */
 	u64 window_start;
 
+	/* if this cluster simply points at a bitmap in the block group */
+	bool points_to_bitmap;
+
 	struct btrfs_block_group_cache *block_group;
 	/*
 	 * when a cluster is allocated from a block group, we put the
@@ -716,24 +720,37 @@ struct btrfs_free_cluster {
 	struct list_head block_group_list;
 };
 
+enum btrfs_caching_type {
+	BTRFS_CACHE_NO		= 0,
+	BTRFS_CACHE_STARTED	= 1,
+	BTRFS_CACHE_FINISHED	= 2,
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
+	struct btrfs_fs_info *fs_info;
 	spinlock_t lock;
-	struct mutex cache_mutex;
 	u64 pinned;
 	u64 reserved;
 	u64 flags;
-	int cached;
+	u64 sectorsize;
+	int extents_thresh;
+	int free_extents;
+	int total_bitmaps;
 	int ro;
 	int dirty;
 
+	/* cache tracking stuff */
+	wait_queue_head_t caching_q;
+	int cached;
+
 	struct btrfs_space_info *space_info;
 
 	/* free space cache stuff */
 	spinlock_t tree_lock;
-	struct rb_root free_space_bytes;
 	struct rb_root free_space_offset;
+	u64 free_space;
 
 	/* block group cache stuff */
 	struct rb_node cache_node;
@@ -808,6 +825,7 @@ struct btrfs_fs_info {
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
 	struct mutex tree_reloc_mutex;
+	struct rw_semaphore extent_commit_sem;
 
 	/*
 	 * this protects the ordered operations list only while we are
@@ -1988,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 				  u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			       u64 bytes);
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d28d29c95f7c..8b8192790011 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
+	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
 	err = bdi_init(bdi);
 	if (err)
@@ -1599,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
+	sb->s_bdi = &fs_info->bdi;
 
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
@@ -1639,6 +1641,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	mutex_init(&fs_info->tree_reloc_mutex);
+	init_rwsem(&fs_info->extent_commit_sem);
 
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
 	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1799,6 +1802,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   btrfs_super_chunk_root(disk_super),
 					   blocksize, generation);
 	BUG_ON(!chunk_root->node);
+	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+		       sb->s_id);
+		goto fail_chunk_root;
+	}
 	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
 	chunk_root->commit_root = btrfs_root_node(chunk_root);
 
@@ -1826,6 +1834,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					  blocksize, generation);
 	if (!tree_root->node)
 		goto fail_chunk_root;
+	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+		       sb->s_id);
+		goto fail_tree_root;
+	}
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
@@ -2322,6 +2335,9 @@ int close_ctree(struct btrfs_root *root)
 		printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	fs_info->closing = 2;
+	smp_mb();
+
 	if (fs_info->delalloc_bytes) {
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
 		       (unsigned long long)fs_info->delalloc_bytes);
@@ -2343,6 +2359,7 @@ int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(root->fs_info->csum_root->commit_root);
 
 	btrfs_free_block_groups(root->fs_info);
+	btrfs_free_pinned_extents(root->fs_info);
 
 	del_fs_roots(fs_info);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a5aca3997d42..535f85ba104f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h>
24#include "compat.h" 25#include "compat.h"
25#include "hash.h" 26#include "hash.h"
26#include "ctree.h" 27#include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
61 struct btrfs_root *extent_root, u64 alloc_bytes, 62 struct btrfs_root *extent_root, u64 alloc_bytes,
62 u64 flags, int force); 63 u64 flags, int force);
63 64
65static noinline int
66block_group_cache_done(struct btrfs_block_group_cache *cache)
67{
68 smp_mb();
69 return cache->cached == BTRFS_CACHE_FINISHED;
70}
71
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 72static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{ 73{
66 return (cache->flags & bits) == bits; 74 return (cache->flags & bits) == bits;
@@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
146} 154}
147 155
148/* 156/*
157 * We always set EXTENT_LOCKED for the super mirror extents so we don't
158 * overwrite them, so those bits need to be unset. Also, if we are unmounting
159 * with pinned extents still sitting there because we had a block group caching,
160 * we need to clear those now, since we are done.
161 */
162void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
163{
164 u64 start, end, last = 0;
165 int ret;
166
167 while (1) {
168 ret = find_first_extent_bit(&info->pinned_extents, last,
169 &start, &end,
170 EXTENT_LOCKED|EXTENT_DIRTY);
171 if (ret)
172 break;
173
174 clear_extent_bits(&info->pinned_extents, start, end,
175 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
176 last = end+1;
177 }
178}
179
180static int remove_sb_from_cache(struct btrfs_root *root,
181 struct btrfs_block_group_cache *cache)
182{
183 struct btrfs_fs_info *fs_info = root->fs_info;
184 u64 bytenr;
185 u64 *logical;
186 int stripe_len;
187 int i, nr, ret;
188
189 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
190 bytenr = btrfs_sb_offset(i);
191 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
192 cache->key.objectid, bytenr,
193 0, &logical, &nr, &stripe_len);
194 BUG_ON(ret);
195 while (nr--) {
196 try_lock_extent(&fs_info->pinned_extents,
197 logical[nr],
198 logical[nr] + stripe_len - 1, GFP_NOFS);
199 }
200 kfree(logical);
201 }
202
203 return 0;
204}
205
206/*
149 * this is only called by cache_block_group, since we could have freed extents 207 * this is only called by cache_block_group, since we could have freed extents
150 * we need to check the pinned_extents for any extents that can't be used yet 208 * we need to check the pinned_extents for any extents that can't be used yet
151 * since their free space will be released as soon as the transaction commits. 209 * since their free space will be released as soon as the transaction commits.
152 */ 210 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group, 211static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end) 212 struct btrfs_fs_info *info, u64 start, u64 end)
155{ 213{
156 u64 extent_start, extent_end, size; 214 u64 extent_start, extent_end, size, total_added = 0;
157 int ret; 215 int ret;
158 216
159 while (start < end) { 217 while (start < end) {
160 ret = find_first_extent_bit(&info->pinned_extents, start, 218 ret = find_first_extent_bit(&info->pinned_extents, start,
161 &extent_start, &extent_end, 219 &extent_start, &extent_end,
162 EXTENT_DIRTY); 220 EXTENT_DIRTY|EXTENT_LOCKED);
163 if (ret) 221 if (ret)
164 break; 222 break;
165 223
@@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
167 start = extent_end + 1; 225 start = extent_end + 1;
168 } else if (extent_start > start && extent_start < end) { 226 } else if (extent_start > start && extent_start < end) {
169 size = extent_start - start; 227 size = extent_start - start;
228 total_added += size;
170 ret = btrfs_add_free_space(block_group, start, 229 ret = btrfs_add_free_space(block_group, start,
171 size); 230 size);
172 BUG_ON(ret); 231 BUG_ON(ret);
@@ -178,84 +237,93 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
178 237
179 if (start < end) { 238 if (start < end) {
180 size = end - start; 239 size = end - start;
240 total_added += size;
181 ret = btrfs_add_free_space(block_group, start, size); 241 ret = btrfs_add_free_space(block_group, start, size);
182 BUG_ON(ret); 242 BUG_ON(ret);
183 } 243 }
184 244
185 return 0; 245 return total_added;
186}
187
188static int remove_sb_from_cache(struct btrfs_root *root,
189 struct btrfs_block_group_cache *cache)
190{
191 u64 bytenr;
192 u64 *logical;
193 int stripe_len;
194 int i, nr, ret;
195
196 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
197 bytenr = btrfs_sb_offset(i);
198 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
199 cache->key.objectid, bytenr, 0,
200 &logical, &nr, &stripe_len);
201 BUG_ON(ret);
202 while (nr--) {
203 btrfs_remove_free_space(cache, logical[nr],
204 stripe_len);
205 }
206 kfree(logical);
207 }
208 return 0;
209} 246}
210 247
211static int cache_block_group(struct btrfs_root *root, 248static int caching_kthread(void *data)
212 struct btrfs_block_group_cache *block_group)
213{ 249{
250 struct btrfs_block_group_cache *block_group = data;
251 struct btrfs_fs_info *fs_info = block_group->fs_info;
252 u64 last = 0;
214 struct btrfs_path *path; 253 struct btrfs_path *path;
215 int ret = 0; 254 int ret = 0;
216 struct btrfs_key key; 255 struct btrfs_key key;
217 struct extent_buffer *leaf; 256 struct extent_buffer *leaf;
218 int slot; 257 int slot;
219 u64 last; 258 u64 total_found = 0;
220
221 if (!block_group)
222 return 0;
223
224 root = root->fs_info->extent_root;
225 259
226 if (block_group->cached) 260 BUG_ON(!fs_info);
227 return 0;
228 261
229 path = btrfs_alloc_path(); 262 path = btrfs_alloc_path();
230 if (!path) 263 if (!path)
231 return -ENOMEM; 264 return -ENOMEM;
232 265
233 path->reada = 2; 266 atomic_inc(&block_group->space_info->caching_threads);
267 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
234 /* 268 /*
235 * we get into deadlocks with paths held by callers of this function. 269 * We don't want to deadlock with somebody trying to allocate a new
236 * since the alloc_mutex is protecting things right now, just 270 * extent for the extent root while also trying to search the extent
237 * skip the locking here 271 * root to add free space. So we skip locking and search the commit
272 * root, since its read-only
238 */ 273 */
239 path->skip_locking = 1; 274 path->skip_locking = 1;
240 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 275 path->search_commit_root = 1;
276 path->reada = 2;
277
241 key.objectid = last; 278 key.objectid = last;
242 key.offset = 0; 279 key.offset = 0;
243 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 280 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
244 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 281again:
282 /* need to make sure the commit_root doesn't disappear */
283 down_read(&fs_info->extent_commit_sem);
284
285 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
245 if (ret < 0) 286 if (ret < 0)
246 goto err; 287 goto err;
247 288
248 while (1) { 289 while (1) {
290 smp_mb();
291 if (block_group->fs_info->closing > 1) {
292 last = (u64)-1;
293 break;
294 }
295
249 leaf = path->nodes[0]; 296 leaf = path->nodes[0];
250 slot = path->slots[0]; 297 slot = path->slots[0];
251 if (slot >= btrfs_header_nritems(leaf)) { 298 if (slot >= btrfs_header_nritems(leaf)) {
252 ret = btrfs_next_leaf(root, path); 299 ret = btrfs_next_leaf(fs_info->extent_root, path);
253 if (ret < 0) 300 if (ret < 0)
254 goto err; 301 goto err;
255 if (ret == 0) 302 else if (ret)
256 continue;
257 else
258 break; 303 break;
304
305 if (need_resched() ||
306 btrfs_transaction_in_commit(fs_info)) {
307 leaf = path->nodes[0];
308
309 /* this shouldn't happen, but if the
310 * leaf is empty just move on.
311 */
312 if (btrfs_header_nritems(leaf) == 0)
313 break;
314 /*
315 * we need to copy the key out so that
316 * we are sure the next search advances
317 * us forward in the btree.
318 */
319 btrfs_item_key_to_cpu(leaf, &key, 0);
320 btrfs_release_path(fs_info->extent_root, path);
321 up_read(&fs_info->extent_commit_sem);
322 schedule_timeout(1);
323 goto again;
324 }
325
326 continue;
259 } 327 }
260 btrfs_item_key_to_cpu(leaf, &key, slot); 328 btrfs_item_key_to_cpu(leaf, &key, slot);
261 if (key.objectid < block_group->key.objectid) 329 if (key.objectid < block_group->key.objectid)
@@ -266,24 +334,59 @@ static int cache_block_group(struct btrfs_root *root,
266 break; 334 break;
267 335
268 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { 336 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
269 add_new_free_space(block_group, root->fs_info, last, 337 total_found += add_new_free_space(block_group,
270 key.objectid); 338 fs_info, last,
271 339 key.objectid);
272 last = key.objectid + key.offset; 340 last = key.objectid + key.offset;
273 } 341 }
342
343 if (total_found > (1024 * 1024 * 2)) {
344 total_found = 0;
345 wake_up(&block_group->caching_q);
346 }
274next: 347next:
275 path->slots[0]++; 348 path->slots[0]++;
276 } 349 }
350 ret = 0;
277 351
278 add_new_free_space(block_group, root->fs_info, last, 352 total_found += add_new_free_space(block_group, fs_info, last,
279 block_group->key.objectid + 353 block_group->key.objectid +
280 block_group->key.offset); 354 block_group->key.offset);
355
356 spin_lock(&block_group->lock);
357 block_group->cached = BTRFS_CACHE_FINISHED;
358 spin_unlock(&block_group->lock);
281 359
282 block_group->cached = 1;
283 remove_sb_from_cache(root, block_group);
284 ret = 0;
285err: 360err:
286 btrfs_free_path(path); 361 btrfs_free_path(path);
362 up_read(&fs_info->extent_commit_sem);
363 atomic_dec(&block_group->space_info->caching_threads);
364 wake_up(&block_group->caching_q);
365
366 return 0;
367}
368
369static int cache_block_group(struct btrfs_block_group_cache *cache)
370{
371 struct task_struct *tsk;
372 int ret = 0;
373
374 spin_lock(&cache->lock);
375 if (cache->cached != BTRFS_CACHE_NO) {
376 spin_unlock(&cache->lock);
377 return ret;
378 }
379 cache->cached = BTRFS_CACHE_STARTED;
380 spin_unlock(&cache->lock);
381
382 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
383 cache->key.objectid);
384 if (IS_ERR(tsk)) {
385 ret = PTR_ERR(tsk);
386 printk(KERN_ERR "error running thread %d\n", ret);
387 BUG();
388 }
389
287 return ret; 390 return ret;
288} 391}
289 392
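The hunk above turns block group caching into a detached state machine: the caller flips cached from BTRFS_CACHE_NO to BTRFS_CACHE_STARTED under cache->lock, hands the group to a kthread, and everyone else blocks on caching_q until BTRFS_CACHE_FINISHED. A minimal userspace sketch of that handshake, with pthreads standing in for kthread_run() and a condition variable for the waitqueue; every name here (fake_cache, start_caching, ...) is invented for illustration:

#include <pthread.h>

enum cache_state { CACHE_NO, CACHE_STARTED, CACHE_FINISHED };

struct fake_cache {
	pthread_mutex_t lock;	/* stands in for cache->lock */
	pthread_cond_t done;	/* stands in for caching_q */
	enum cache_state cached;
};

static void *caching_thread(void *data)
{
	struct fake_cache *c = data;

	/* ... walk the extent tree and publish free space here ... */

	pthread_mutex_lock(&c->lock);
	c->cached = CACHE_FINISHED;
	pthread_mutex_unlock(&c->lock);
	pthread_cond_broadcast(&c->done);	/* wake_up(&caching_q) */
	return NULL;
}

static int start_caching(struct fake_cache *c)
{
	pthread_t tid;

	pthread_mutex_lock(&c->lock);
	if (c->cached != CACHE_NO) {		/* lost the race: no-op */
		pthread_mutex_unlock(&c->lock);
		return 0;
	}
	c->cached = CACHE_STARTED;
	pthread_mutex_unlock(&c->lock);

	return pthread_create(&tid, NULL, caching_thread, c);
}

static void wait_cached(struct fake_cache *c)
{
	pthread_mutex_lock(&c->lock);
	while (c->cached != CACHE_FINISHED)	/* wait_event(caching_q, done) */
		pthread_cond_wait(&c->done, &c->lock);
	pthread_mutex_unlock(&c->lock);
}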
@@ -1408,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1408static void btrfs_issue_discard(struct block_device *bdev, 1511static void btrfs_issue_discard(struct block_device *bdev,
1409 u64 start, u64 len) 1512 u64 start, u64 len)
1410{ 1513{
1411 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); 1514 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1515 DISCARD_FL_BARRIER);
1412} 1516}
1413#endif 1517#endif
1414 1518
@@ -2387,13 +2491,29 @@ fail:
2387 2491
2388} 2492}
2389 2493
2494static struct btrfs_block_group_cache *
2495next_block_group(struct btrfs_root *root,
2496 struct btrfs_block_group_cache *cache)
2497{
2498 struct rb_node *node;
2499 spin_lock(&root->fs_info->block_group_cache_lock);
2500 node = rb_next(&cache->cache_node);
2501 btrfs_put_block_group(cache);
2502 if (node) {
2503 cache = rb_entry(node, struct btrfs_block_group_cache,
2504 cache_node);
2505 atomic_inc(&cache->count);
2506 } else
2507 cache = NULL;
2508 spin_unlock(&root->fs_info->block_group_cache_lock);
2509 return cache;
2510}
2511
2390int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2512int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2391 struct btrfs_root *root) 2513 struct btrfs_root *root)
2392{ 2514{
2393 struct btrfs_block_group_cache *cache, *entry; 2515 struct btrfs_block_group_cache *cache;
2394 struct rb_node *n;
2395 int err = 0; 2516 int err = 0;
2396 int werr = 0;
2397 struct btrfs_path *path; 2517 struct btrfs_path *path;
2398 u64 last = 0; 2518 u64 last = 0;
2399 2519
@@ -2402,39 +2522,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2402 return -ENOMEM; 2522 return -ENOMEM;
2403 2523
2404 while (1) { 2524 while (1) {
2405 cache = NULL; 2525 if (last == 0) {
2406 spin_lock(&root->fs_info->block_group_cache_lock); 2526 err = btrfs_run_delayed_refs(trans, root,
2407 for (n = rb_first(&root->fs_info->block_group_cache_tree); 2527 (unsigned long)-1);
2408 n; n = rb_next(n)) { 2528 BUG_ON(err);
2409 entry = rb_entry(n, struct btrfs_block_group_cache,
2410 cache_node);
2411 if (entry->dirty) {
2412 cache = entry;
2413 break;
2414 }
2415 } 2529 }
2416 spin_unlock(&root->fs_info->block_group_cache_lock);
2417 2530
2418 if (!cache) 2531 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2419 break; 2532 while (cache) {
2533 if (cache->dirty)
2534 break;
2535 cache = next_block_group(root, cache);
2536 }
2537 if (!cache) {
2538 if (last == 0)
2539 break;
2540 last = 0;
2541 continue;
2542 }
2420 2543
2421 cache->dirty = 0; 2544 cache->dirty = 0;
2422 last += cache->key.offset; 2545 last = cache->key.objectid + cache->key.offset;
2423 2546
2424 err = write_one_cache_group(trans, root, 2547 err = write_one_cache_group(trans, root, path, cache);
2425 path, cache); 2548 BUG_ON(err);
2426 /* 2549 btrfs_put_block_group(cache);
2427 * if we fail to write the cache group, we want
2428 * to keep it marked dirty in hopes that a later
2429 * write will work
2430 */
2431 if (err) {
2432 werr = err;
2433 continue;
2434 }
2435 } 2550 }
2551
2436 btrfs_free_path(path); 2552 btrfs_free_path(path);
2437 return werr; 2553 return 0;
2438} 2554}
2439 2555
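btrfs_write_dirty_block_groups() above switches from re-walking the rbtree from the start on every iteration to a cursor (last) plus the refcounted next_block_group() helper, wrapping around once to catch groups dirtied mid-scan. A small userspace sketch of that rescan loop, with an array lookup as an invented stand-in for the rbtree walk:

#include <stdbool.h>
#include <stdio.h>

struct bg { unsigned long long start, len; bool dirty; };

static struct bg *first_dirty_from(struct bg *g, int n,
				   unsigned long long last)
{
	for (int i = 0; i < n; i++)
		if (g[i].start >= last && g[i].dirty)
			return &g[i];
	return NULL;
}

static void write_dirty(struct bg *g, int n)
{
	unsigned long long last = 0;

	while (1) {
		struct bg *c = first_dirty_from(g, n, last);
		if (!c) {
			if (last == 0)
				break;		/* clean full pass: done */
			last = 0;		/* wrap and rescan */
			continue;
		}
		c->dirty = false;
		last = c->start + c->len;	/* advance the cursor */
		printf("wrote group at %llu\n", c->start);
	}
}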
2440int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 2556int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2484,6 +2600,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2484 found->force_alloc = 0; 2600 found->force_alloc = 0;
2485 *space_info = found; 2601 *space_info = found;
2486 list_add_rcu(&found->list, &info->space_info); 2602 list_add_rcu(&found->list, &info->space_info);
2603 atomic_set(&found->caching_threads, 0);
2487 return 0; 2604 return 0;
2488} 2605}
2489 2606
@@ -2947,13 +3064,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2947 struct btrfs_block_group_cache *cache; 3064 struct btrfs_block_group_cache *cache;
2948 struct btrfs_fs_info *fs_info = root->fs_info; 3065 struct btrfs_fs_info *fs_info = root->fs_info;
2949 3066
2950 if (pin) { 3067 if (pin)
2951 set_extent_dirty(&fs_info->pinned_extents, 3068 set_extent_dirty(&fs_info->pinned_extents,
2952 bytenr, bytenr + num - 1, GFP_NOFS); 3069 bytenr, bytenr + num - 1, GFP_NOFS);
2953 } else {
2954 clear_extent_dirty(&fs_info->pinned_extents,
2955 bytenr, bytenr + num - 1, GFP_NOFS);
2956 }
2957 3070
2958 while (num > 0) { 3071 while (num > 0) {
2959 cache = btrfs_lookup_block_group(fs_info, bytenr); 3072 cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2969,14 +3082,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2969 spin_unlock(&cache->space_info->lock); 3082 spin_unlock(&cache->space_info->lock);
2970 fs_info->total_pinned += len; 3083 fs_info->total_pinned += len;
2971 } else { 3084 } else {
3085 int unpin = 0;
3086
3087 /*
3088 * in order to not race with the block group caching, we
3089 * only want to unpin the extent if we are cached. If
3090 * we aren't cached, we want to start async caching this
3091 * block group so we can free the extent the next time
3092 * around.
3093 */
2972 spin_lock(&cache->space_info->lock); 3094 spin_lock(&cache->space_info->lock);
2973 spin_lock(&cache->lock); 3095 spin_lock(&cache->lock);
2974 cache->pinned -= len; 3096 unpin = (cache->cached == BTRFS_CACHE_FINISHED);
2975 cache->space_info->bytes_pinned -= len; 3097 if (likely(unpin)) {
3098 cache->pinned -= len;
3099 cache->space_info->bytes_pinned -= len;
3100 fs_info->total_pinned -= len;
3101 }
2976 spin_unlock(&cache->lock); 3102 spin_unlock(&cache->lock);
2977 spin_unlock(&cache->space_info->lock); 3103 spin_unlock(&cache->space_info->lock);
2978 fs_info->total_pinned -= len; 3104
2979 if (cache->cached) 3105 if (likely(unpin))
3106 clear_extent_dirty(&fs_info->pinned_extents,
3107 bytenr, bytenr + len -1,
3108 GFP_NOFS);
3109 else
3110 cache_block_group(cache);
3111
3112 if (unpin)
2980 btrfs_add_free_space(cache, bytenr, len); 3113 btrfs_add_free_space(cache, bytenr, len);
2981 } 3114 }
2982 btrfs_put_block_group(cache); 3115 btrfs_put_block_group(cache);
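The unpin path above uses a standard pattern: sample the decision (is this group fully cached?) while both spinlocks are held, then do the heavier work, clearing the pinned range or kicking off a caching thread, only after dropping them. A userspace sketch of the same shape, with a pthread mutex and invented names:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static bool cache_finished;

static void maybe_unpin(void (*do_unpin)(void), void (*start_caching)(void))
{
	bool unpin;

	pthread_mutex_lock(&cache_lock);
	unpin = cache_finished;		/* decide under the lock */
	pthread_mutex_unlock(&cache_lock);

	if (unpin)
		do_unpin();		/* act after unlocking */
	else
		start_caching();	/* free it next time around */
}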
@@ -3030,6 +3163,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
3030 &start, &end, EXTENT_DIRTY); 3163 &start, &end, EXTENT_DIRTY);
3031 if (ret) 3164 if (ret)
3032 break; 3165 break;
3166
3033 set_extent_dirty(copy, start, end, GFP_NOFS); 3167 set_extent_dirty(copy, start, end, GFP_NOFS);
3034 last = end + 1; 3168 last = end + 1;
3035 } 3169 }
@@ -3058,6 +3192,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3058 3192
3059 cond_resched(); 3193 cond_resched();
3060 } 3194 }
3195
3061 return ret; 3196 return ret;
3062} 3197}
3063 3198
@@ -3436,6 +3571,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
3436} 3571}
3437 3572
3438/* 3573/*
3574 * when we wait for progress in the block group caching, it's because
3575 * our allocation attempt failed at least once. So, we must sleep
3576 * and let some progress happen before we try again.
3577 *
3578 * This function will sleep at least once waiting for new free space to
3579 * show up, and then it will check the block group free space numbers
3580 * for our min num_bytes. Another option is to have it go ahead
3581 * and look in the rbtree for a free extent of a given size, but this
3582 * is a good start.
3583 */
3584static noinline int
3585wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3586 u64 num_bytes)
3587{
3588 DEFINE_WAIT(wait);
3589
3590 prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
3591
3592 if (block_group_cache_done(cache)) {
3593 finish_wait(&cache->caching_q, &wait);
3594 return 0;
3595 }
3596 schedule();
3597 finish_wait(&cache->caching_q, &wait);
3598
3599 wait_event(cache->caching_q, block_group_cache_done(cache) ||
3600 (cache->free_space >= num_bytes));
3601 return 0;
3602}
3603
3604enum btrfs_loop_type {
3605 LOOP_CACHED_ONLY = 0,
3606 LOOP_CACHING_NOWAIT = 1,
3607 LOOP_CACHING_WAIT = 2,
3608 LOOP_ALLOC_CHUNK = 3,
3609 LOOP_NO_EMPTY_SIZE = 4,
3610};
3611
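The enum above replaces the bare loop values 0-2 that the old code compared against. A compact sketch of how the allocator escalates through these states: each failed pass relaxes one constraint, and a pass that skipped uncached groups retries one level up before escalating further. find_in_groups() is an invented stand-in for a full pass over the block groups (the real pass also handles clustering and chunk allocation, which this omits):

#include <stdbool.h>

enum loop_type {
	LOOP_CACHED_ONLY,
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

/* invented stand-in: one pass over the groups, always failing here */
static bool find_in_groups(enum loop_type loop, bool *saw_uncached)
{
	(void)loop;
	*saw_uncached = true;
	return false;
}

static bool allocate(void)
{
	enum loop_type loop = LOOP_CACHED_ONLY;
	bool saw_uncached = false;

	while (1) {
		if (find_in_groups(loop, &saw_uncached))
			return true;		/* got an extent */
		if (saw_uncached && loop < LOOP_CACHING_WAIT) {
			saw_uncached = false;	/* caching may add space: */
			loop++;			/* retry a level up */
			continue;
		}
		if (loop < LOOP_NO_EMPTY_SIZE) {
			loop++;			/* relax one constraint */
			continue;
		}
		return false;			/* genuinely out of space */
	}
}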
3612/*
3439 * walks the btree of allocated extents and find a hole of a given size. 3613 * walks the btree of allocated extents and find a hole of a given size.
3440 * The key ins is changed to record the hole: 3614 * The key ins is changed to record the hole:
3441 * ins->objectid == block start 3615 * ins->objectid == block start
@@ -3460,6 +3634,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3460 struct btrfs_space_info *space_info; 3634 struct btrfs_space_info *space_info;
3461 int last_ptr_loop = 0; 3635 int last_ptr_loop = 0;
3462 int loop = 0; 3636 int loop = 0;
3637 bool found_uncached_bg = false;
3463 3638
3464 WARN_ON(num_bytes < root->sectorsize); 3639 WARN_ON(num_bytes < root->sectorsize);
3465 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 3640 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3491,15 +3666,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3491 search_start = max(search_start, first_logical_byte(root, 0)); 3666 search_start = max(search_start, first_logical_byte(root, 0));
3492 search_start = max(search_start, hint_byte); 3667 search_start = max(search_start, hint_byte);
3493 3668
3494 if (!last_ptr) { 3669 if (!last_ptr)
3495 empty_cluster = 0; 3670 empty_cluster = 0;
3496 loop = 1;
3497 }
3498 3671
3499 if (search_start == hint_byte) { 3672 if (search_start == hint_byte) {
3500 block_group = btrfs_lookup_block_group(root->fs_info, 3673 block_group = btrfs_lookup_block_group(root->fs_info,
3501 search_start); 3674 search_start);
3502 if (block_group && block_group_bits(block_group, data)) { 3675 /*
3676 * we don't want to use the block group if it doesn't match our
3677 * allocation bits, or if it's not cached.
3678 */
3679 if (block_group && block_group_bits(block_group, data) &&
3680 block_group_cache_done(block_group)) {
3503 down_read(&space_info->groups_sem); 3681 down_read(&space_info->groups_sem);
3504 if (list_empty(&block_group->list) || 3682 if (list_empty(&block_group->list) ||
3505 block_group->ro) { 3683 block_group->ro) {
@@ -3522,21 +3700,35 @@ search:
3522 down_read(&space_info->groups_sem); 3700 down_read(&space_info->groups_sem);
3523 list_for_each_entry(block_group, &space_info->block_groups, list) { 3701 list_for_each_entry(block_group, &space_info->block_groups, list) {
3524 u64 offset; 3702 u64 offset;
3703 int cached;
3525 3704
3526 atomic_inc(&block_group->count); 3705 atomic_inc(&block_group->count);
3527 search_start = block_group->key.objectid; 3706 search_start = block_group->key.objectid;
3528 3707
3529have_block_group: 3708have_block_group:
3530 if (unlikely(!block_group->cached)) { 3709 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
3531 mutex_lock(&block_group->cache_mutex); 3710 /*
3532 ret = cache_block_group(root, block_group); 3711 * we want to start caching kthreads, but not too many
3533 mutex_unlock(&block_group->cache_mutex); 3712 * right off the bat so we don't overwhelm the system,
3534 if (ret) { 3713 * so only start them if there are less than 2 and we're
3535 btrfs_put_block_group(block_group); 3714 * so only start them if there are fewer than 2 and we're
3536 break; 3715 */
3716 if (loop > LOOP_CACHING_NOWAIT ||
3717 atomic_read(&space_info->caching_threads) < 2) {
3718 ret = cache_block_group(block_group);
3719 BUG_ON(ret);
3537 } 3720 }
3538 } 3721 }
3539 3722
3723 cached = block_group_cache_done(block_group);
3724 if (unlikely(!cached)) {
3725 found_uncached_bg = true;
3726
3727 /* if we only want cached bgs, loop */
3728 if (loop == LOOP_CACHED_ONLY)
3729 goto loop;
3730 }
3731
3540 if (unlikely(block_group->ro)) 3732 if (unlikely(block_group->ro))
3541 goto loop; 3733 goto loop;
3542 3734
@@ -3615,14 +3807,21 @@ refill_cluster:
3615 spin_unlock(&last_ptr->refill_lock); 3807 spin_unlock(&last_ptr->refill_lock);
3616 goto checks; 3808 goto checks;
3617 } 3809 }
3810 } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
3811 spin_unlock(&last_ptr->refill_lock);
3812
3813 wait_block_group_cache_progress(block_group,
3814 num_bytes + empty_cluster + empty_size);
3815 goto have_block_group;
3618 } 3816 }
3817
3619 /* 3818 /*
3620 * at this point we either didn't find a cluster 3819 * at this point we either didn't find a cluster
3621 * or we weren't able to allocate a block from our 3820 * or we weren't able to allocate a block from our
3622 * cluster. Free the cluster we've been trying 3821 * cluster. Free the cluster we've been trying
3623 * to use, and go to the next block group 3822 * to use, and go to the next block group
3624 */ 3823 */
3625 if (loop < 2) { 3824 if (loop < LOOP_NO_EMPTY_SIZE) {
3626 btrfs_return_cluster_to_free_space(NULL, 3825 btrfs_return_cluster_to_free_space(NULL,
3627 last_ptr); 3826 last_ptr);
3628 spin_unlock(&last_ptr->refill_lock); 3827 spin_unlock(&last_ptr->refill_lock);
@@ -3633,11 +3832,17 @@ refill_cluster:
3633 3832
3634 offset = btrfs_find_space_for_alloc(block_group, search_start, 3833 offset = btrfs_find_space_for_alloc(block_group, search_start,
3635 num_bytes, empty_size); 3834 num_bytes, empty_size);
3636 if (!offset) 3835 if (!offset && (cached || (!cached &&
3836 loop == LOOP_CACHING_NOWAIT))) {
3637 goto loop; 3837 goto loop;
3838 } else if (!offset && (!cached &&
3839 loop > LOOP_CACHING_NOWAIT)) {
3840 wait_block_group_cache_progress(block_group,
3841 num_bytes + empty_size);
3842 goto have_block_group;
3843 }
3638checks: 3844checks:
3639 search_start = stripe_align(root, offset); 3845 search_start = stripe_align(root, offset);
3640
3641 /* move on to the next group */ 3846 /* move on to the next group */
3642 if (search_start + num_bytes >= search_end) { 3847 if (search_start + num_bytes >= search_end) {
3643 btrfs_add_free_space(block_group, offset, num_bytes); 3848 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -3683,13 +3888,26 @@ loop:
3683 } 3888 }
3684 up_read(&space_info->groups_sem); 3889 up_read(&space_info->groups_sem);
3685 3890
3686 /* loop == 0, try to find a clustered alloc in every block group 3891 /* LOOP_CACHED_ONLY, only search fully cached block groups
3687 * loop == 1, try again after forcing a chunk allocation 3892 * LOOP_CACHING_NOWAIT, search partially cached block groups, but
3688 * loop == 2, set empty_size and empty_cluster to 0 and try again 3893 * don't wait for them to finish caching
3894 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3895 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3896 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3897 * again
3689 */ 3898 */
3690 if (!ins->objectid && loop < 3 && 3899 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
3691 (empty_size || empty_cluster || allowed_chunk_alloc)) { 3900 (found_uncached_bg || empty_size || empty_cluster ||
3692 if (loop >= 2) { 3901 allowed_chunk_alloc)) {
3902 if (found_uncached_bg) {
3903 found_uncached_bg = false;
3904 if (loop < LOOP_CACHING_WAIT) {
3905 loop++;
3906 goto search;
3907 }
3908 }
3909
3910 if (loop == LOOP_ALLOC_CHUNK) {
3693 empty_size = 0; 3911 empty_size = 0;
3694 empty_cluster = 0; 3912 empty_cluster = 0;
3695 } 3913 }
@@ -3702,7 +3920,7 @@ loop:
3702 space_info->force_alloc = 1; 3920 space_info->force_alloc = 1;
3703 } 3921 }
3704 3922
3705 if (loop < 3) { 3923 if (loop < LOOP_NO_EMPTY_SIZE) {
3706 loop++; 3924 loop++;
3707 goto search; 3925 goto search;
3708 } 3926 }
@@ -3798,7 +4016,7 @@ again:
3798 num_bytes, data, 1); 4016 num_bytes, data, 1);
3799 goto again; 4017 goto again;
3800 } 4018 }
3801 if (ret) { 4019 if (ret == -ENOSPC) {
3802 struct btrfs_space_info *sinfo; 4020 struct btrfs_space_info *sinfo;
3803 4021
3804 sinfo = __find_space_info(root->fs_info, data); 4022 sinfo = __find_space_info(root->fs_info, data);
@@ -3806,7 +4024,6 @@ again:
3806 "wanted %llu\n", (unsigned long long)data, 4024 "wanted %llu\n", (unsigned long long)data,
3807 (unsigned long long)num_bytes); 4025 (unsigned long long)num_bytes);
3808 dump_space_info(sinfo, num_bytes); 4026 dump_space_info(sinfo, num_bytes);
3809 BUG();
3810 } 4027 }
3811 4028
3812 return ret; 4029 return ret;
@@ -3844,7 +4061,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3844 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, 4061 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3845 empty_size, hint_byte, search_end, ins, 4062 empty_size, hint_byte, search_end, ins,
3846 data); 4063 data);
3847 update_reserved_extents(root, ins->objectid, ins->offset, 1); 4064 if (!ret)
4065 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4066
3848 return ret; 4067 return ret;
3849} 4068}
3850 4069
@@ -4006,9 +4225,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4006 struct btrfs_block_group_cache *block_group; 4225 struct btrfs_block_group_cache *block_group;
4007 4226
4008 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4227 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4009 mutex_lock(&block_group->cache_mutex); 4228 cache_block_group(block_group);
4010 cache_block_group(root, block_group); 4229 wait_event(block_group->caching_q,
4011 mutex_unlock(&block_group->cache_mutex); 4230 block_group_cache_done(block_group));
4012 4231
4013 ret = btrfs_remove_free_space(block_group, ins->objectid, 4232 ret = btrfs_remove_free_space(block_group, ins->objectid,
4014 ins->offset); 4233 ins->offset);
@@ -4039,7 +4258,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4039 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, 4258 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4040 empty_size, hint_byte, search_end, 4259 empty_size, hint_byte, search_end,
4041 ins, 0); 4260 ins, 0);
4042 BUG_ON(ret); 4261 if (ret)
4262 return ret;
4043 4263
4044 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 4264 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4045 if (parent == 0) 4265 if (parent == 0)
@@ -6955,11 +7175,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6955 &info->block_group_cache_tree); 7175 &info->block_group_cache_tree);
6956 spin_unlock(&info->block_group_cache_lock); 7176 spin_unlock(&info->block_group_cache_lock);
6957 7177
6958 btrfs_remove_free_space_cache(block_group);
6959 down_write(&block_group->space_info->groups_sem); 7178 down_write(&block_group->space_info->groups_sem);
6960 list_del(&block_group->list); 7179 list_del(&block_group->list);
6961 up_write(&block_group->space_info->groups_sem); 7180 up_write(&block_group->space_info->groups_sem);
6962 7181
7182 if (block_group->cached == BTRFS_CACHE_STARTED)
7183 wait_event(block_group->caching_q,
7184 block_group_cache_done(block_group));
7185
7186 btrfs_remove_free_space_cache(block_group);
7187
6963 WARN_ON(atomic_read(&block_group->count) != 1); 7188 WARN_ON(atomic_read(&block_group->count) != 1);
6964 kfree(block_group); 7189 kfree(block_group);
6965 7190
@@ -7025,9 +7250,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7025 atomic_set(&cache->count, 1); 7250 atomic_set(&cache->count, 1);
7026 spin_lock_init(&cache->lock); 7251 spin_lock_init(&cache->lock);
7027 spin_lock_init(&cache->tree_lock); 7252 spin_lock_init(&cache->tree_lock);
7028 mutex_init(&cache->cache_mutex); 7253 cache->fs_info = info;
7254 init_waitqueue_head(&cache->caching_q);
7029 INIT_LIST_HEAD(&cache->list); 7255 INIT_LIST_HEAD(&cache->list);
7030 INIT_LIST_HEAD(&cache->cluster_list); 7256 INIT_LIST_HEAD(&cache->cluster_list);
7257
7258 /*
7259 * we only want to have 32k of ram per block group for keeping
7260 * track of free space, and if we pass 1/2 of that we want to
7261 * start converting things over to using bitmaps
7262 */
7263 cache->extents_thresh = ((1024 * 32) / 2) /
7264 sizeof(struct btrfs_free_space);
7265
7031 read_extent_buffer(leaf, &cache->item, 7266 read_extent_buffer(leaf, &cache->item,
7032 btrfs_item_ptr_offset(leaf, path->slots[0]), 7267 btrfs_item_ptr_offset(leaf, path->slots[0]),
7033 sizeof(cache->item)); 7268 sizeof(cache->item));
@@ -7036,6 +7271,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7036 key.objectid = found_key.objectid + found_key.offset; 7271 key.objectid = found_key.objectid + found_key.offset;
7037 btrfs_release_path(root, path); 7272 btrfs_release_path(root, path);
7038 cache->flags = btrfs_block_group_flags(&cache->item); 7273 cache->flags = btrfs_block_group_flags(&cache->item);
7274 cache->sectorsize = root->sectorsize;
7275
7276 remove_sb_from_cache(root, cache);
7277
7278 /*
7279 * check for two cases, either we are full, and therefore
7280 * don't need to bother with the caching work since we won't
7281 * find any space, or we are empty, and we can just add all
7282 * the space in and be done with it. This saves us a lot of
7283 * time, particularly in the full case.
7284 */
7285 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7286 cache->cached = BTRFS_CACHE_FINISHED;
7287 } else if (btrfs_block_group_used(&cache->item) == 0) {
7288 cache->cached = BTRFS_CACHE_FINISHED;
7289 add_new_free_space(cache, root->fs_info,
7290 found_key.objectid,
7291 found_key.objectid +
7292 found_key.offset);
7293 }
7039 7294
7040 ret = update_space_info(info, cache->flags, found_key.offset, 7295 ret = update_space_info(info, cache->flags, found_key.offset,
7041 btrfs_block_group_used(&cache->item), 7296 btrfs_block_group_used(&cache->item),
@@ -7079,10 +7334,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7079 cache->key.objectid = chunk_offset; 7334 cache->key.objectid = chunk_offset;
7080 cache->key.offset = size; 7335 cache->key.offset = size;
7081 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7336 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7337 cache->sectorsize = root->sectorsize;
7338
7339 /*
7340 * we only want to have 32k of ram per block group for keeping track
7341 * of free space, and if we pass 1/2 of that we want to start
7342 * converting things over to using bitmaps
7343 */
7344 cache->extents_thresh = ((1024 * 32) / 2) /
7345 sizeof(struct btrfs_free_space);
7082 atomic_set(&cache->count, 1); 7346 atomic_set(&cache->count, 1);
7083 spin_lock_init(&cache->lock); 7347 spin_lock_init(&cache->lock);
7084 spin_lock_init(&cache->tree_lock); 7348 spin_lock_init(&cache->tree_lock);
7085 mutex_init(&cache->cache_mutex); 7349 init_waitqueue_head(&cache->caching_q);
7086 INIT_LIST_HEAD(&cache->list); 7350 INIT_LIST_HEAD(&cache->list);
7087 INIT_LIST_HEAD(&cache->cluster_list); 7351 INIT_LIST_HEAD(&cache->cluster_list);
7088 7352
@@ -7091,6 +7355,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7091 cache->flags = type; 7355 cache->flags = type;
7092 btrfs_set_block_group_flags(&cache->item, type); 7356 btrfs_set_block_group_flags(&cache->item, type);
7093 7357
7358 cache->cached = BTRFS_CACHE_FINISHED;
7359 remove_sb_from_cache(root, cache);
7360
7361 add_new_free_space(cache, root->fs_info, chunk_offset,
7362 chunk_offset + size);
7363
7094 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7364 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7095 &cache->space_info); 7365 &cache->space_info);
7096 BUG_ON(ret); 7366 BUG_ON(ret);
@@ -7149,7 +7419,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7149 rb_erase(&block_group->cache_node, 7419 rb_erase(&block_group->cache_node,
7150 &root->fs_info->block_group_cache_tree); 7420 &root->fs_info->block_group_cache_tree);
7151 spin_unlock(&root->fs_info->block_group_cache_lock); 7421 spin_unlock(&root->fs_info->block_group_cache_lock);
7152 btrfs_remove_free_space_cache(block_group); 7422
7153 down_write(&block_group->space_info->groups_sem); 7423 down_write(&block_group->space_info->groups_sem);
7154 /* 7424 /*
7155 * we must use list_del_init so people can check to see if they 7425 * we must use list_del_init so people can check to see if they
@@ -7158,11 +7428,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7158 list_del_init(&block_group->list); 7428 list_del_init(&block_group->list);
7159 up_write(&block_group->space_info->groups_sem); 7429 up_write(&block_group->space_info->groups_sem);
7160 7430
7431 if (block_group->cached == BTRFS_CACHE_STARTED)
7432 wait_event(block_group->caching_q,
7433 block_group_cache_done(block_group));
7434
7435 btrfs_remove_free_space_cache(block_group);
7436
7161 spin_lock(&block_group->space_info->lock); 7437 spin_lock(&block_group->space_info->lock);
7162 block_group->space_info->total_bytes -= block_group->key.offset; 7438 block_group->space_info->total_bytes -= block_group->key.offset;
7163 block_group->space_info->bytes_readonly -= block_group->key.offset; 7439 block_group->space_info->bytes_readonly -= block_group->key.offset;
7164 spin_unlock(&block_group->space_info->lock); 7440 spin_unlock(&block_group->space_info->lock);
7165 block_group->space_info->full = 0; 7441
7442 btrfs_clear_space_info_full(root->fs_info);
7166 7443
7167 btrfs_put_block_group(block_group); 7444 btrfs_put_block_group(block_group);
7168 btrfs_put_block_group(block_group); 7445 btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a5..5edcee3a617f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/pagemap.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/math64.h>
20#include "ctree.h" 22#include "ctree.h"
21#include "free-space-cache.h" 23#include "free-space-cache.h"
22#include "transaction.h" 24#include "transaction.h"
23 25
24struct btrfs_free_space { 26#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
25 struct rb_node bytes_index; 27#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
30 28
31static int tree_insert_offset(struct rb_root *root, u64 offset, 29static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
32 struct rb_node *node) 30 u64 offset)
33{ 31{
34 struct rb_node **p = &root->rb_node; 32 BUG_ON(offset < bitmap_start);
35 struct rb_node *parent = NULL; 33 offset -= bitmap_start;
36 struct btrfs_free_space *info; 34 return (unsigned long)(div64_u64(offset, sectorsize));
35}
37 36
38 while (*p) { 37static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
39 parent = *p; 38{
40 info = rb_entry(parent, struct btrfs_free_space, offset_index); 39 return (unsigned long)(div64_u64(bytes, sectorsize));
40}
41 41
42 if (offset < info->offset) 42static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43 p = &(*p)->rb_left; 43 u64 offset)
44 else if (offset > info->offset) 44{
45 p = &(*p)->rb_right; 45 u64 bitmap_start;
46 else 46 u64 bytes_per_bitmap;
47 return -EEXIST;
48 }
49 47
50 rb_link_node(node, parent, p); 48 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
51 rb_insert_color(node, root); 49 bitmap_start = offset - block_group->key.objectid;
50 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51 bitmap_start *= bytes_per_bitmap;
52 bitmap_start += block_group->key.objectid;
52 53
53 return 0; 54 return bitmap_start;
54} 55}
55 56
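offset_to_bitmap() above rounds an arbitrary byte offset down to the start of the bitmap window that covers it. A worked standalone example, assuming 4KiB sectors and a 4KiB bitmap page (so one bitmap spans 32768 * 4096 bytes = 128MiB of the group):

#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE	4096ULL
#define BITS_PER_BMP	(4096 * 8)	/* one 4KiB page of bits */

static uint64_t to_bitmap_start(uint64_t bg_start, uint64_t offset)
{
	uint64_t per_bitmap = BITS_PER_BMP * SECTORSIZE;	/* 128MiB */
	uint64_t rel = offset - bg_start;

	return bg_start + (rel / per_bitmap) * per_bitmap;
}

int main(void)
{
	/* 200MiB into a group starting at 0 falls in the second bitmap */
	printf("%llu\n", (unsigned long long)
	       to_bitmap_start(0, 200ULL << 20));	/* 134217728 */
	return 0;
}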
56static int tree_insert_bytes(struct rb_root *root, u64 bytes, 57static int tree_insert_offset(struct rb_root *root, u64 offset,
57 struct rb_node *node) 58 struct rb_node *node, int bitmap)
58{ 59{
59 struct rb_node **p = &root->rb_node; 60 struct rb_node **p = &root->rb_node;
60 struct rb_node *parent = NULL; 61 struct rb_node *parent = NULL;
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
62 63
63 while (*p) { 64 while (*p) {
64 parent = *p; 65 parent = *p;
65 info = rb_entry(parent, struct btrfs_free_space, bytes_index); 66 info = rb_entry(parent, struct btrfs_free_space, offset_index);
66 67
67 if (bytes < info->bytes) 68 if (offset < info->offset) {
68 p = &(*p)->rb_left; 69 p = &(*p)->rb_left;
69 else 70 } else if (offset > info->offset) {
70 p = &(*p)->rb_right; 71 p = &(*p)->rb_right;
72 } else {
73 /*
74 * we could have a bitmap entry and an extent entry
75 * share the same offset. If this is the case, we want
76 * the extent entry to always be found first if we do a
77 * linear search through the tree, since we want to have
78 * the quickest allocation time, and allocating from an
79 * extent is faster than allocating from a bitmap. So
80 * if we're inserting a bitmap and we find an entry at
81 * this offset, we want to go right, or after this entry
82 * logically. If we are inserting an extent and we've
83 * found a bitmap, we want to go left, or before
84 * logically.
85 */
86 if (bitmap) {
87 WARN_ON(info->bitmap);
88 p = &(*p)->rb_right;
89 } else {
90 WARN_ON(!info->bitmap);
91 p = &(*p)->rb_left;
92 }
93 }
71 } 94 }
72 95
73 rb_link_node(node, parent, p); 96 rb_link_node(node, parent, p);
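The tie-break documented in the comment above can be read as a two-key sort: primary key the offset, secondary key "extent before bitmap", so a forward walk always reaches the cheaper extent allocation first. Expressed as a standalone comparator over an invented entry type:

#include <stdbool.h>

struct fs_entry {
	unsigned long long offset;
	bool bitmap;
};

/* <0: a sorts before b; >0: a sorts after b; matches the tree order */
static int cmp_entries(const struct fs_entry *a, const struct fs_entry *b)
{
	if (a->offset < b->offset)
		return -1;
	if (a->offset > b->offset)
		return 1;
	/* equal offsets: the extent (non-bitmap) entry comes first */
	return (int)a->bitmap - (int)b->bitmap;
}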
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
79/* 102/*
80 * searches the tree for the given offset. 103 * searches the tree for the given offset.
81 * 104 *
82 * fuzzy == 1: this is used for allocations where we are given a hint of where 105 * fuzzy - If this is set, then we are trying to make an allocation, and we just
83 * to look for free space. Because the hint may not be completely on an offset 106 * want a section that has at least bytes size and comes at or after the given
84 * mark, or the hint may no longer point to free space we need to fudge our 107 * offset.
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes size, and if its not there
93 * return NULL.
94 */ 108 */
95static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 109static struct btrfs_free_space *
96 u64 offset, u64 bytes, 110tree_search_offset(struct btrfs_block_group_cache *block_group,
97 int fuzzy) 111 u64 offset, int bitmap_only, int fuzzy)
98{ 112{
99 struct rb_node *n = root->rb_node; 113 struct rb_node *n = block_group->free_space_offset.rb_node;
100 struct btrfs_free_space *entry, *ret = NULL; 114 struct btrfs_free_space *entry, *prev = NULL;
115
116 /* find entry that is closest to the 'offset' */
117 while (1) {
118 if (!n) {
119 entry = NULL;
120 break;
121 }
101 122
102 while (n) {
103 entry = rb_entry(n, struct btrfs_free_space, offset_index); 123 entry = rb_entry(n, struct btrfs_free_space, offset_index);
124 prev = entry;
104 125
105 if (offset < entry->offset) { 126 if (offset < entry->offset)
106 if (fuzzy &&
107 (!ret || entry->offset < ret->offset) &&
108 (bytes <= entry->bytes))
109 ret = entry;
110 n = n->rb_left; 127 n = n->rb_left;
111 } else if (offset > entry->offset) { 128 else if (offset > entry->offset)
112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
114 bytes <= entry->bytes) {
115 ret = entry;
116 break;
117 }
118 n = n->rb_right; 129 n = n->rb_right;
119 } else { 130 else
120 if (bytes > entry->bytes) {
121 n = n->rb_right;
122 continue;
123 }
124 ret = entry;
125 break; 131 break;
126 }
127 } 132 }
128 133
129 return ret; 134 if (bitmap_only) {
130} 135 if (!entry)
136 return NULL;
137 if (entry->bitmap)
138 return entry;
131 139
132/* 140 /*
133 * return a chunk at least bytes size, as close to offset that we can get. 141 * bitmap entry and extent entry may share same offset,
134 */ 142 * in that case, bitmap entry comes after extent entry.
135static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, 143 */
136 u64 offset, u64 bytes) 144 n = rb_next(n);
137{ 145 if (!n)
138 struct rb_node *n = root->rb_node; 146 return NULL;
139 struct btrfs_free_space *entry, *ret = NULL; 147 entry = rb_entry(n, struct btrfs_free_space, offset_index);
140 148 if (entry->offset != offset)
141 while (n) { 149 return NULL;
142 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
143 150
144 if (bytes < entry->bytes) { 151 WARN_ON(!entry->bitmap);
152 return entry;
153 } else if (entry) {
154 if (entry->bitmap) {
145 /* 155 /*
146 * We prefer to get a hole size as close to the size we 156 * if previous extent entry covers the offset,
147 * are asking for so we don't take small slivers out of 157 * we should return it instead of the bitmap entry
148 * huge holes, but we also want to get as close to the
149 * offset as possible so we don't have a whole lot of
150 * fragmentation.
151 */ 158 */
152 if (offset <= entry->offset) { 159 n = &entry->offset_index;
153 if (!ret) 160 while (1) {
154 ret = entry; 161 n = rb_prev(n);
155 else if (entry->bytes < ret->bytes) 162 if (!n)
156 ret = entry; 163 break;
157 else if (entry->offset < ret->offset) 164 prev = rb_entry(n, struct btrfs_free_space,
158 ret = entry; 165 offset_index);
166 if (!prev->bitmap) {
167 if (prev->offset + prev->bytes > offset)
168 entry = prev;
169 break;
170 }
159 } 171 }
160 n = n->rb_left; 172 }
161 } else if (bytes > entry->bytes) { 173 return entry;
162 n = n->rb_right; 174 }
175
176 if (!prev)
177 return NULL;
178
179 /* find last entry before the 'offset' */
180 entry = prev;
181 if (entry->offset > offset) {
182 n = rb_prev(&entry->offset_index);
183 if (n) {
184 entry = rb_entry(n, struct btrfs_free_space,
185 offset_index);
186 BUG_ON(entry->offset > offset);
163 } else { 187 } else {
164 /* 188 if (fuzzy)
165 * Ok we may have multiple chunks of the wanted size, 189 return entry;
166 * so we don't want to take the first one we find, we 190 else
167 * want to take the one closest to our given offset, so 191 return NULL;
168 * keep searching just in case theres a better match.
169 */
170 n = n->rb_right;
171 if (offset > entry->offset)
172 continue;
173 else if (!ret || entry->offset < ret->offset)
174 ret = entry;
175 } 192 }
176 } 193 }
177 194
178 return ret; 195 if (entry->bitmap) {
196 n = &entry->offset_index;
197 while (1) {
198 n = rb_prev(n);
199 if (!n)
200 break;
201 prev = rb_entry(n, struct btrfs_free_space,
202 offset_index);
203 if (!prev->bitmap) {
204 if (prev->offset + prev->bytes > offset)
205 return prev;
206 break;
207 }
208 }
209 if (entry->offset + BITS_PER_BITMAP *
210 block_group->sectorsize > offset)
211 return entry;
212 } else if (entry->offset + entry->bytes > offset)
213 return entry;
214
215 if (!fuzzy)
216 return NULL;
217
218 while (1) {
219 if (entry->bitmap) {
220 if (entry->offset + BITS_PER_BITMAP *
221 block_group->sectorsize > offset)
222 break;
223 } else {
224 if (entry->offset + entry->bytes > offset)
225 break;
226 }
227
228 n = rb_next(&entry->offset_index);
229 if (!n)
230 return NULL;
231 entry = rb_entry(n, struct btrfs_free_space, offset_index);
232 }
233 return entry;
179} 234}
180 235
181static void unlink_free_space(struct btrfs_block_group_cache *block_group, 236static void unlink_free_space(struct btrfs_block_group_cache *block_group,
182 struct btrfs_free_space *info) 237 struct btrfs_free_space *info)
183{ 238{
184 rb_erase(&info->offset_index, &block_group->free_space_offset); 239 rb_erase(&info->offset_index, &block_group->free_space_offset);
185 rb_erase(&info->bytes_index, &block_group->free_space_bytes); 240 block_group->free_extents--;
241 block_group->free_space -= info->bytes;
186} 242}
187 243
188static int link_free_space(struct btrfs_block_group_cache *block_group, 244static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,353 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
190{ 246{
191 int ret = 0; 247 int ret = 0;
192 248
193 249 BUG_ON(!info->bitmap && !info->bytes);
194 BUG_ON(!info->bytes);
195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 250 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
196 &info->offset_index); 251 &info->offset_index, (info->bitmap != NULL));
197 if (ret) 252 if (ret)
198 return ret; 253 return ret;
199 254
200 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, 255 block_group->free_space += info->bytes;
201 &info->bytes_index); 256 block_group->free_extents++;
202 if (ret) 257 return ret;
203 return ret; 258}
259
260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261{
262 u64 max_bytes, possible_bytes;
263
264 /*
265 * The goal is to keep the total amount of memory used per 1gb of space
266 * at or below 32k, so we need to adjust how much memory we allow to be
267 * used by extent based free space tracking
268 */
269 max_bytes = MAX_CACHE_BYTES_PER_GIG *
270 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
271
272 possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
273 (sizeof(struct btrfs_free_space) *
274 block_group->extents_thresh);
275
276 if (possible_bytes > max_bytes) {
277 int extent_bytes = max_bytes -
278 (block_group->total_bitmaps * PAGE_CACHE_SIZE);
279
280 if (extent_bytes <= 0) {
281 block_group->extents_thresh = 0;
282 return;
283 }
284
285 block_group->extents_thresh = extent_bytes /
286 (sizeof(struct btrfs_free_space));
287 }
288}
289
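recalculate_thresholds() above enforces the 32k-per-GiB memory budget by shrinking the extent-entry allowance as bitmaps accumulate. Worked numbers, assuming a 1GiB group, 4KiB bitmap pages, and an illustrative 32-byte free-space entry: max_bytes = 32768; four bitmaps consume 4 * 4096 = 16384; the remaining 16384 bytes allow 16384 / 32 = 512 extent entries. The same computation as a standalone sketch:

#include <stdint.h>

static unsigned int extents_thresh(uint64_t bg_bytes, unsigned int bitmaps)
{
	uint64_t max_bytes = 32768ULL * (bg_bytes >> 30);  /* 32k per GiB */
	uint64_t bitmap_bytes = (uint64_t)bitmaps * 4096;  /* pages used */

	if (bitmap_bytes >= max_bytes)
		return 0;			/* bitmaps ate the budget */
	return (max_bytes - bitmap_bytes) / 32;	/* /sizeof(entry), assumed */
}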
290static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
291 struct btrfs_free_space *info, u64 offset,
292 u64 bytes)
293{
294 unsigned long start, end;
295 unsigned long i;
296
297 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
298 end = start + bytes_to_bits(bytes, block_group->sectorsize);
299 BUG_ON(end > BITS_PER_BITMAP);
300
301 for (i = start; i < end; i++)
302 clear_bit(i, info->bitmap);
303
304 info->bytes -= bytes;
305 block_group->free_space -= bytes;
306}
307
308static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
309 struct btrfs_free_space *info, u64 offset,
310 u64 bytes)
311{
312 unsigned long start, end;
313 unsigned long i;
314
315 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
316 end = start + bytes_to_bits(bytes, block_group->sectorsize);
317 BUG_ON(end > BITS_PER_BITMAP);
318
319 for (i = start; i < end; i++)
320 set_bit(i, info->bitmap);
321
322 info->bytes += bytes;
323 block_group->free_space += bytes;
324}
325
326static int search_bitmap(struct btrfs_block_group_cache *block_group,
327 struct btrfs_free_space *bitmap_info, u64 *offset,
328 u64 *bytes)
329{
330 unsigned long found_bits = 0;
331 unsigned long bits, i;
332 unsigned long next_zero;
333
334 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
335 max_t(u64, *offset, bitmap_info->offset));
336 bits = bytes_to_bits(*bytes, block_group->sectorsize);
337
338 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
339 i < BITS_PER_BITMAP;
340 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
341 next_zero = find_next_zero_bit(bitmap_info->bitmap,
342 BITS_PER_BITMAP, i);
343 if ((next_zero - i) >= bits) {
344 found_bits = next_zero - i;
345 break;
346 }
347 i = next_zero;
348 }
349
350 if (found_bits) {
351 *offset = (u64)(i * block_group->sectorsize) +
352 bitmap_info->offset;
353 *bytes = (u64)(found_bits) * block_group->sectorsize;
354 return 0;
355 }
356
357 return -1;
358}
359
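search_bitmap() above looks for a run of at least `bits` consecutive set bits by hopping between find_next_bit() and find_next_zero_bit(). The same scan written as a plain userspace loop over a byte array, names invented:

#include <stddef.h>

/* return the start of the first run of `want` set bits at or after
 * `start`, or -1 if the bitmap has no such run */
static long find_run(const unsigned char *bits, size_t nbits,
		     size_t start, size_t want)
{
	size_t run = 0, run_start = 0;

	for (size_t i = start; i < nbits; i++) {
		if (bits[i / 8] & (1u << (i % 8))) {
			if (run++ == 0)
				run_start = i;	/* run begins here */
			if (run >= want)
				return (long)run_start;
		} else {
			run = 0;		/* hole: restart the run */
		}
	}
	return -1;
}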
360static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
361 *block_group, u64 *offset,
362 u64 *bytes, int debug)
363{
364 struct btrfs_free_space *entry;
365 struct rb_node *node;
366 int ret;
367
368 if (!block_group->free_space_offset.rb_node)
369 return NULL;
370
371 entry = tree_search_offset(block_group,
372 offset_to_bitmap(block_group, *offset),
373 0, 1);
374 if (!entry)
375 return NULL;
376
377 for (node = &entry->offset_index; node; node = rb_next(node)) {
378 entry = rb_entry(node, struct btrfs_free_space, offset_index);
379 if (entry->bytes < *bytes)
380 continue;
381
382 if (entry->bitmap) {
383 ret = search_bitmap(block_group, entry, offset, bytes);
384 if (!ret)
385 return entry;
386 continue;
387 }
388
389 *offset = entry->offset;
390 *bytes = entry->bytes;
391 return entry;
392 }
393
394 return NULL;
395}
396
397static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
398 struct btrfs_free_space *info, u64 offset)
399{
400 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
401 int max_bitmaps = (int)div64_u64(block_group->key.offset +
402 bytes_per_bg - 1, bytes_per_bg);
403 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
404
405 info->offset = offset_to_bitmap(block_group, offset);
406 link_free_space(block_group, info);
407 block_group->total_bitmaps++;
408
409 recalculate_thresholds(block_group);
410}
411
412static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
413 struct btrfs_free_space *bitmap_info,
414 u64 *offset, u64 *bytes)
415{
416 u64 end;
417 u64 search_start, search_bytes;
418 int ret;
419
420again:
421 end = bitmap_info->offset +
422 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
423
424 /*
425 * XXX - this can go away after a few releases.
426 *
427 * since the only user of btrfs_remove_free_space is the tree logging
428 * stuff, and the only way to test that is under crash conditions, we
429 * want to have this debug stuff here just in case something's not
430 * working. Search the bitmap for the space we are trying to use to
431 * make sure it's actually there. If it's not there then we need to stop
432 * because something has gone wrong.
433 */
434 search_start = *offset;
435 search_bytes = *bytes;
436 ret = search_bitmap(block_group, bitmap_info, &search_start,
437 &search_bytes);
438 BUG_ON(ret < 0 || search_start != *offset);
439
440 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
441 bitmap_clear_bits(block_group, bitmap_info, *offset,
442 end - *offset + 1);
443 *bytes -= end - *offset + 1;
444 *offset = end + 1;
445 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
446 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
447 *bytes = 0;
448 }
449
450 if (*bytes) {
451 struct rb_node *next = rb_next(&bitmap_info->offset_index);
452 if (!bitmap_info->bytes) {
453 unlink_free_space(block_group, bitmap_info);
454 kfree(bitmap_info->bitmap);
455 kfree(bitmap_info);
456 block_group->total_bitmaps--;
457 recalculate_thresholds(block_group);
458 }
459
460 /*
461 * no entry after this bitmap, but we still have bytes to
462 * remove, so something has gone wrong.
463 */
464 if (!next)
465 return -EINVAL;
466
467 bitmap_info = rb_entry(next, struct btrfs_free_space,
468 offset_index);
469
470 /*
471 * if the next entry isn't a bitmap we need to return to let the
472 * extent stuff do its work.
473 */
474 if (!bitmap_info->bitmap)
475 return -EAGAIN;
476
477 /*
478 * Ok the next item is a bitmap, but it may not actually hold
479 * the information for the rest of this free space stuff, so
480 * look for it, and if we don't find it return so we can try
481 * everything over again.
482 */
483 search_start = *offset;
484 search_bytes = *bytes;
485 ret = search_bitmap(block_group, bitmap_info, &search_start,
486 &search_bytes);
487 if (ret < 0 || search_start != *offset)
488 return -EAGAIN;
489
490 goto again;
491 } else if (!bitmap_info->bytes) {
492 unlink_free_space(block_group, bitmap_info);
493 kfree(bitmap_info->bitmap);
494 kfree(bitmap_info);
495 block_group->total_bitmaps--;
496 recalculate_thresholds(block_group);
497 }
498
499 return 0;
500}
501
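remove_from_bitmap() above handles removals that straddle bitmap boundaries by clearing up to the end of the current bitmap, advancing *offset/*bytes, and either looping into the next bitmap or returning -EAGAIN so the caller re-resolves the next entry. The chunked-removal shape, sketched standalone with one bitmap assumed to cover 128MiB:

#include <stdint.h>
#include <stdio.h>

#define CHUNK	(128ULL << 20)	/* assumed coverage of one bitmap */

/* invented stand-in for clearing bits within a single bitmap */
static void clear_in_chunk(uint64_t off, uint64_t len)
{
	printf("clear [%llu, %llu)\n", (unsigned long long)off,
	       (unsigned long long)(off + len));
}

static void remove_span(uint64_t offset, uint64_t bytes)
{
	while (bytes) {
		uint64_t chunk_end = (offset / CHUNK + 1) * CHUNK;
		uint64_t len = bytes < chunk_end - offset ?
			       bytes : chunk_end - offset;

		clear_in_chunk(offset, len);	/* one bitmap's worth */
		offset += len;			/* then step into the next, */
		bytes -= len;			/* like the -EAGAIN retry */
	}
}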
502static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
503 struct btrfs_free_space *info)
504{
505 struct btrfs_free_space *bitmap_info;
506 int added = 0;
507 u64 bytes, offset, end;
508 int ret;
509
510 /*
511 * If we are below the extents threshold then we can add this as an
512 * extent, and don't have to deal with the bitmap
513 */
514 if (block_group->free_extents < block_group->extents_thresh &&
515 info->bytes > block_group->sectorsize * 4)
516 return 0;
517
518 /*
519 * some block groups are so tiny they can't be enveloped by a bitmap, so
520 * don't even bother to create a bitmap for this
521 */
522 if (BITS_PER_BITMAP * block_group->sectorsize >
523 block_group->key.offset)
524 return 0;
525
526 bytes = info->bytes;
527 offset = info->offset;
528
529again:
530 bitmap_info = tree_search_offset(block_group,
531 offset_to_bitmap(block_group, offset),
532 1, 0);
533 if (!bitmap_info) {
534 BUG_ON(added);
535 goto new_bitmap;
536 }
537
538 end = bitmap_info->offset +
539 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
540
541 if (offset >= bitmap_info->offset && offset + bytes > end) {
542 bitmap_set_bits(block_group, bitmap_info, offset,
543 end - offset);
544 bytes -= end - offset;
545 offset = end;
546 added = 0;
547 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
548 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
549 bytes = 0;
550 } else {
551 BUG();
552 }
553
554 if (!bytes) {
555 ret = 1;
556 goto out;
557 } else
558 goto again;
559
560new_bitmap:
561 if (info && info->bitmap) {
562 add_new_bitmap(block_group, info, offset);
563 added = 1;
564 info = NULL;
565 goto again;
566 } else {
567 spin_unlock(&block_group->tree_lock);
568
569 /* no pre-allocated info, allocate a new one */
570 if (!info) {
571 info = kzalloc(sizeof(struct btrfs_free_space),
572 GFP_NOFS);
573 if (!info) {
574 spin_lock(&block_group->tree_lock);
575 ret = -ENOMEM;
576 goto out;
577 }
578 }
579
580 /* allocate the bitmap */
581 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
582 spin_lock(&block_group->tree_lock);
583 if (!info->bitmap) {
584 ret = -ENOMEM;
585 goto out;
586 }
587 goto again;
588 }
589
590out:
591 if (info) {
592 if (info->bitmap)
593 kfree(info->bitmap);
594 kfree(info);
595 }
204 596
205 return ret; 597 return ret;
206} 598}
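insert_into_bitmap() above has to allocate with GFP_NOFS while tree_lock, a spinlock, is held, so it drops the lock around each allocation and jumps back to `again` because the tree may have changed in between. The core of that dance, sketched in userspace with a pthread mutex and invented names:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* caller holds tree_lock; returns holding it again, but must re-check
 * any state derived from the tree before trusting it */
static void *alloc_dropping_lock(size_t size)
{
	void *p;

	pthread_mutex_unlock(&tree_lock);	/* can't block while "spinning" */
	p = calloc(1, size);
	pthread_mutex_lock(&tree_lock);
	return p;
}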
@@ -208,8 +600,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 600int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
209 u64 offset, u64 bytes) 601 u64 offset, u64 bytes)
210{ 602{
211 struct btrfs_free_space *right_info; 603 struct btrfs_free_space *right_info = NULL;
212 struct btrfs_free_space *left_info; 604 struct btrfs_free_space *left_info = NULL;
213 struct btrfs_free_space *info = NULL; 605 struct btrfs_free_space *info = NULL;
214 int ret = 0; 606 int ret = 0;
215 607
@@ -227,18 +619,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
227 * are adding, if there is remove that struct and add a new one to 619 * are adding, if there is remove that struct and add a new one to
228 * cover the entire range 620 * cover the entire range
229 */ 621 */
230 right_info = tree_search_offset(&block_group->free_space_offset, 622 right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
231 offset+bytes, 0, 0); 623 if (right_info && rb_prev(&right_info->offset_index))
232 left_info = tree_search_offset(&block_group->free_space_offset, 624 left_info = rb_entry(rb_prev(&right_info->offset_index),
233 offset-1, 0, 1); 625 struct btrfs_free_space, offset_index);
626 else
627 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
628
629 /*
630 * If there was no extent directly to the left or right of this new
631 * extent then we know we're going to have to allocate a new extent, so
632 * before we do that see if we need to drop this into a bitmap
633 */
634 if ((!left_info || left_info->bitmap) &&
635 (!right_info || right_info->bitmap)) {
636 ret = insert_into_bitmap(block_group, info);
637
638 if (ret < 0) {
639 goto out;
640 } else if (ret) {
641 ret = 0;
642 goto out;
643 }
644 }
234 645
235 if (right_info) { 646 if (right_info && !right_info->bitmap) {
236 unlink_free_space(block_group, right_info); 647 unlink_free_space(block_group, right_info);
237 info->bytes += right_info->bytes; 648 info->bytes += right_info->bytes;
238 kfree(right_info); 649 kfree(right_info);
239 } 650 }
240 651
241 if (left_info && left_info->offset + left_info->bytes == offset) { 652 if (left_info && !left_info->bitmap &&
653 left_info->offset + left_info->bytes == offset) {
242 unlink_free_space(block_group, left_info); 654 unlink_free_space(block_group, left_info);
243 info->offset = left_info->offset; 655 info->offset = left_info->offset;
244 info->bytes += left_info->bytes; 656 info->bytes += left_info->bytes;
@@ -248,11 +660,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
248 ret = link_free_space(block_group, info); 660 ret = link_free_space(block_group, info);
249 if (ret) 661 if (ret)
250 kfree(info); 662 kfree(info);
251 663out:
252 spin_unlock(&block_group->tree_lock); 664 spin_unlock(&block_group->tree_lock);
253 665
254 if (ret) { 666 if (ret) {
255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 667 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
256 BUG_ON(ret == -EEXIST); 668 BUG_ON(ret == -EEXIST);
257 } 669 }
258 670
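btrfs_add_free_space() above now merges the incoming range only with extent neighbours; bitmap neighbours are handled bit-by-bit through insert_into_bitmap() instead. The coalescing step in isolation, over invented types:

#include <stdbool.h>
#include <stdint.h>

struct fentry { uint64_t offset, bytes; bool bitmap; };

/* grow `info` by absorbing adjacent extent (never bitmap) neighbours */
static void coalesce(struct fentry *info, struct fentry *left,
		     struct fentry *right)
{
	if (right && !right->bitmap &&
	    info->offset + info->bytes == right->offset)
		info->bytes += right->bytes;	/* absorb right neighbour */

	if (left && !left->bitmap &&
	    left->offset + left->bytes == info->offset) {
		info->offset = left->offset;	/* absorb left neighbour */
		info->bytes += left->bytes;
	}
}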
@@ -263,40 +675,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
263 u64 offset, u64 bytes) 675 u64 offset, u64 bytes)
264{ 676{
265 struct btrfs_free_space *info; 677 struct btrfs_free_space *info;
678 struct btrfs_free_space *next_info = NULL;
266 int ret = 0; 679 int ret = 0;
267 680
268 spin_lock(&block_group->tree_lock); 681 spin_lock(&block_group->tree_lock);
269 682
270 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 683again:
271 1); 684 info = tree_search_offset(block_group, offset, 0, 0);
272 if (info && info->offset == offset) { 685 if (!info) {
273 if (info->bytes < bytes) { 686 /*
274 printk(KERN_ERR "Found free space at %llu, size %llu," 687 * oops didn't find an extent that matched the space we wanted
275 "trying to use %llu\n", 688 * to remove, look for a bitmap instead
276 (unsigned long long)info->offset, 689 */
277 (unsigned long long)info->bytes, 690 info = tree_search_offset(block_group,
278 (unsigned long long)bytes); 691 offset_to_bitmap(block_group, offset),
692 1, 0);
693 if (!info) {
694 WARN_ON(1);
695 goto out_lock;
696 }
697 }
698
699 if (info->bytes < bytes && rb_next(&info->offset_index)) {
700 u64 end;
701 next_info = rb_entry(rb_next(&info->offset_index),
702 struct btrfs_free_space,
703 offset_index);
704
705 if (next_info->bitmap)
706 end = next_info->offset + BITS_PER_BITMAP *
707 block_group->sectorsize - 1;
708 else
709 end = next_info->offset + next_info->bytes;
710
711 if (next_info->bytes < bytes ||
712 next_info->offset > offset || offset > end) {
713 printk(KERN_CRIT "Found free space at %llu, size %llu,"
714 " trying to use %llu\n",
715 (unsigned long long)info->offset,
716 (unsigned long long)info->bytes,
717 (unsigned long long)bytes);
279 WARN_ON(1); 718 WARN_ON(1);
280 ret = -EINVAL; 719 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock); 720 goto out_lock;
282 goto out;
283 } 721 }
284 unlink_free_space(block_group, info);
285 722
286 if (info->bytes == bytes) { 723 info = next_info;
287 kfree(info); 724 }
288 spin_unlock(&block_group->tree_lock); 725
289 goto out; 726 if (info->bytes == bytes) {
727 unlink_free_space(block_group, info);
728 if (info->bitmap) {
729 kfree(info->bitmap);
730 block_group->total_bitmaps--;
290 } 731 }
732 kfree(info);
733 goto out_lock;
734 }
291 735
736 if (!info->bitmap && info->offset == offset) {
737 unlink_free_space(block_group, info);
292 info->offset += bytes; 738 info->offset += bytes;
293 info->bytes -= bytes; 739 info->bytes -= bytes;
740 link_free_space(block_group, info);
741 goto out_lock;
742 }
294 743
295 ret = link_free_space(block_group, info); 744 if (!info->bitmap && info->offset <= offset &&
296 spin_unlock(&block_group->tree_lock); 745 info->offset + info->bytes >= offset + bytes) {
297 BUG_ON(ret);
298 } else if (info && info->offset < offset &&
299 info->offset + info->bytes >= offset + bytes) {
300 u64 old_start = info->offset; 746 u64 old_start = info->offset;
301 /* 747 /*
302 * we're freeing space in the middle of the info, 748 * we're freeing space in the middle of the info,
@@ -312,7 +758,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
312 info->offset = offset + bytes; 758 info->offset = offset + bytes;
313 info->bytes = old_end - info->offset; 759 info->bytes = old_end - info->offset;
314 ret = link_free_space(block_group, info); 760 ret = link_free_space(block_group, info);
315 BUG_ON(ret); 761 WARN_ON(ret);
762 if (ret)
763 goto out_lock;
316 } else { 764 } else {
317 /* the hole we're creating ends at the end 765 /* the hole we're creating ends at the end
318 * of the info struct, just free the info 766 * of the info struct, just free the info
@@ -320,32 +768,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
320 kfree(info); 768 kfree(info);
321 } 769 }
322 spin_unlock(&block_group->tree_lock); 770 spin_unlock(&block_group->tree_lock);
323 /* step two, insert a new info struct to cover anything 771
324 * before the hole 772 /* step two, insert a new info struct to cover
773 * anything before the hole
325 */ 774 */
326 ret = btrfs_add_free_space(block_group, old_start, 775 ret = btrfs_add_free_space(block_group, old_start,
327 offset - old_start); 776 offset - old_start);
328 BUG_ON(ret); 777 WARN_ON(ret);
329 } else { 778 goto out;
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached,
336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
338 btrfs_dump_free_space(block_group, bytes);
339 } else if (info) {
340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
341 "but wanted offset=%llu bytes=%llu\n",
342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
346 }
347 WARN_ON(1);
348 } 779 }
780
781 ret = remove_from_bitmap(block_group, info, &offset, &bytes);
782 if (ret == -EAGAIN)
783 goto again;
784 BUG_ON(ret);
785out_lock:
786 spin_unlock(&block_group->tree_lock);
349out: 787out:
350 return ret; 788 return ret;
351} 789}
@@ -361,10 +799,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
361 info = rb_entry(n, struct btrfs_free_space, offset_index); 799 info = rb_entry(n, struct btrfs_free_space, offset_index);
362 if (info->bytes >= bytes) 800 if (info->bytes >= bytes)
363 count++; 801 count++;
364 printk(KERN_ERR "entry offset %llu, bytes %llu\n", 802 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
365 (unsigned long long)info->offset, 803 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes); 804 (unsigned long long)info->bytes,
805 (info->bitmap) ? "yes" : "no");
367 } 806 }
807 printk(KERN_INFO "block group has cluster?: %s\n",
808 list_empty(&block_group->cluster_list) ? "no" : "yes");
368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 809 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
369 "\n", count); 810 "\n", count);
370} 811}
@@ -397,26 +838,35 @@ __btrfs_return_cluster_to_free_space(
397{ 838{
398 struct btrfs_free_space *entry; 839 struct btrfs_free_space *entry;
399 struct rb_node *node; 840 struct rb_node *node;
841 bool bitmap;
400 842
401 spin_lock(&cluster->lock); 843 spin_lock(&cluster->lock);
402 if (cluster->block_group != block_group) 844 if (cluster->block_group != block_group)
403 goto out; 845 goto out;
404 846
847 bitmap = cluster->points_to_bitmap;
848 cluster->block_group = NULL;
405 cluster->window_start = 0; 849 cluster->window_start = 0;
850 list_del_init(&cluster->block_group_list);
851 cluster->points_to_bitmap = false;
852
853 if (bitmap)
854 goto out;
855
406 node = rb_first(&cluster->root); 856 node = rb_first(&cluster->root);
407 while(node) { 857 while (node) {
408 entry = rb_entry(node, struct btrfs_free_space, offset_index); 858 entry = rb_entry(node, struct btrfs_free_space, offset_index);
409 node = rb_next(&entry->offset_index); 859 node = rb_next(&entry->offset_index);
410 rb_erase(&entry->offset_index, &cluster->root); 860 rb_erase(&entry->offset_index, &cluster->root);
411 link_free_space(block_group, entry); 861 BUG_ON(entry->bitmap);
862 tree_insert_offset(&block_group->free_space_offset,
863 entry->offset, &entry->offset_index, 0);
412 } 864 }
413 list_del_init(&cluster->block_group_list);
414
415 btrfs_put_block_group(cluster->block_group);
416 cluster->block_group = NULL;
417 cluster->root.rb_node = NULL; 865 cluster->root.rb_node = NULL;
866
418out: 867out:
419 spin_unlock(&cluster->lock); 868 spin_unlock(&cluster->lock);
869 btrfs_put_block_group(block_group);
420 return 0; 870 return 0;
421} 871}
422 872
@@ -425,20 +875,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
425 struct btrfs_free_space *info; 875 struct btrfs_free_space *info;
426 struct rb_node *node; 876 struct rb_node *node;
427 struct btrfs_free_cluster *cluster; 877 struct btrfs_free_cluster *cluster;
428 struct btrfs_free_cluster *safe; 878 struct list_head *head;
429 879
430 spin_lock(&block_group->tree_lock); 880 spin_lock(&block_group->tree_lock);
431 881 while ((head = block_group->cluster_list.next) !=
432 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, 882 &block_group->cluster_list) {
433 block_group_list) { 883 cluster = list_entry(head, struct btrfs_free_cluster,
884 block_group_list);
434 885
435 WARN_ON(cluster->block_group != block_group); 886 WARN_ON(cluster->block_group != block_group);
436 __btrfs_return_cluster_to_free_space(block_group, cluster); 887 __btrfs_return_cluster_to_free_space(block_group, cluster);
888 if (need_resched()) {
889 spin_unlock(&block_group->tree_lock);
890 cond_resched();
891 spin_lock(&block_group->tree_lock);
892 }
437 } 893 }
438 894
439 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 895 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
440 info = rb_entry(node, struct btrfs_free_space, bytes_index); 896 info = rb_entry(node, struct btrfs_free_space, offset_index);
441 unlink_free_space(block_group, info); 897 unlink_free_space(block_group, info);
898 if (info->bitmap)
899 kfree(info->bitmap);
442 kfree(info); 900 kfree(info);
443 if (need_resched()) { 901 if (need_resched()) {
444 spin_unlock(&block_group->tree_lock); 902 spin_unlock(&block_group->tree_lock);
@@ -446,6 +904,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
446 spin_lock(&block_group->tree_lock); 904 spin_lock(&block_group->tree_lock);
447 } 905 }
448 } 906 }
907
449 spin_unlock(&block_group->tree_lock); 908 spin_unlock(&block_group->tree_lock);
450} 909}
451 910
@@ -453,25 +912,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
453 u64 offset, u64 bytes, u64 empty_size) 912 u64 offset, u64 bytes, u64 empty_size)
454{ 913{
455 struct btrfs_free_space *entry = NULL; 914 struct btrfs_free_space *entry = NULL;
915 u64 bytes_search = bytes + empty_size;
456 u64 ret = 0; 916 u64 ret = 0;
457 917
458 spin_lock(&block_group->tree_lock); 918 spin_lock(&block_group->tree_lock);
459 entry = tree_search_offset(&block_group->free_space_offset, offset, 919 entry = find_free_space(block_group, &offset, &bytes_search, 0);
460 bytes + empty_size, 1);
461 if (!entry) 920 if (!entry)
462 entry = tree_search_bytes(&block_group->free_space_bytes, 921 goto out;
463 offset, bytes + empty_size); 922
464 if (entry) { 923 ret = offset;
924 if (entry->bitmap) {
925 bitmap_clear_bits(block_group, entry, offset, bytes);
926 if (!entry->bytes) {
927 unlink_free_space(block_group, entry);
928 kfree(entry->bitmap);
929 kfree(entry);
930 block_group->total_bitmaps--;
931 recalculate_thresholds(block_group);
932 }
933 } else {
465 unlink_free_space(block_group, entry); 934 unlink_free_space(block_group, entry);
466 ret = entry->offset;
467 entry->offset += bytes; 935 entry->offset += bytes;
468 entry->bytes -= bytes; 936 entry->bytes -= bytes;
469
470 if (!entry->bytes) 937 if (!entry->bytes)
471 kfree(entry); 938 kfree(entry);
472 else 939 else
473 link_free_space(block_group, entry); 940 link_free_space(block_group, entry);
474 } 941 }
942
943out:
475 spin_unlock(&block_group->tree_lock); 944 spin_unlock(&block_group->tree_lock);
476 945
477 return ret; 946 return ret;
@@ -517,6 +986,54 @@ int btrfs_return_cluster_to_free_space(
517 return ret; 986 return ret;
518} 987}
519 988
989static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
990 struct btrfs_free_cluster *cluster,
991 u64 bytes, u64 min_start)
992{
993 struct btrfs_free_space *entry;
994 int err;
995 u64 search_start = cluster->window_start;
996 u64 search_bytes = bytes;
997 u64 ret = 0;
998
999 spin_lock(&block_group->tree_lock);
1000 spin_lock(&cluster->lock);
1001
1002 if (!cluster->points_to_bitmap)
1003 goto out;
1004
1005 if (cluster->block_group != block_group)
1006 goto out;
1007
1008 /*
1009 * search_start is the beginning of the bitmap, but at some point it may
1010 * be a good idea to point to the actual start of the free area in the
1011 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1012 * to 1 to make sure we get the bitmap entry
1013 */
1014 entry = tree_search_offset(block_group,
1015 offset_to_bitmap(block_group, search_start),
1016 1, 0);
1017 if (!entry || !entry->bitmap)
1018 goto out;
1019
1020 search_start = min_start;
1021 search_bytes = bytes;
1022
1023 err = search_bitmap(block_group, entry, &search_start,
1024 &search_bytes);
1025 if (err)
1026 goto out;
1027
1028 ret = search_start;
1029 bitmap_clear_bits(block_group, entry, ret, bytes);
1030out:
1031 spin_unlock(&cluster->lock);
1032 spin_unlock(&block_group->tree_lock);
1033
1034 return ret;
1035}
1036
520/* 1037/*
521 * given a cluster, try to allocate 'bytes' from it, returns 0 1038 * given a cluster, try to allocate 'bytes' from it, returns 0
522 * if it couldn't find anything suitably large, or a logical disk offset 1039 * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +1047,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
530 struct rb_node *node; 1047 struct rb_node *node;
531 u64 ret = 0; 1048 u64 ret = 0;
532 1049
1050 if (cluster->points_to_bitmap)
1051 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1052 min_start);
1053
533 spin_lock(&cluster->lock); 1054 spin_lock(&cluster->lock);
534 if (bytes > cluster->max_size) 1055 if (bytes > cluster->max_size)
535 goto out; 1056 goto out;
@@ -567,9 +1088,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
567 } 1088 }
568out: 1089out:
569 spin_unlock(&cluster->lock); 1090 spin_unlock(&cluster->lock);
1091
570 return ret; 1092 return ret;
571} 1093}
572 1094
1095static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1096 struct btrfs_free_space *entry,
1097 struct btrfs_free_cluster *cluster,
1098 u64 offset, u64 bytes, u64 min_bytes)
1099{
1100 unsigned long next_zero;
1101 unsigned long i;
1102 unsigned long search_bits;
1103 unsigned long total_bits;
1104 unsigned long found_bits;
1105 unsigned long start = 0;
1106 unsigned long total_found = 0;
1107 bool found = false;
1108
1109 i = offset_to_bit(entry->offset, block_group->sectorsize,
1110 max_t(u64, offset, entry->offset));
1111 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1112 total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1113
1114again:
1115 found_bits = 0;
1116 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1117 i < BITS_PER_BITMAP;
1118 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1119 next_zero = find_next_zero_bit(entry->bitmap,
1120 BITS_PER_BITMAP, i);
1121 if (next_zero - i >= search_bits) {
1122 found_bits = next_zero - i;
1123 break;
1124 }
1125 i = next_zero;
1126 }
1127
1128 if (!found_bits)
1129 return -1;
1130
1131 if (!found) {
1132 start = i;
1133 found = true;
1134 }
1135
1136 total_found += found_bits;
1137
1138 if (cluster->max_size < found_bits * block_group->sectorsize)
1139 cluster->max_size = found_bits * block_group->sectorsize;
1140
1141 if (total_found < total_bits) {
1142 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1143 if (i - start > total_bits * 2) {
1144 total_found = 0;
1145 cluster->max_size = 0;
1146 found = false;
1147 }
1148 goto again;
1149 }
1150
1151 cluster->window_start = start * block_group->sectorsize +
1152 entry->offset;
1153 cluster->points_to_bitmap = true;
1154
1155 return 0;
1156}
1157
573/* 1158/*
574 * here we try to find a cluster of blocks in a block group. The goal 1159 * here we try to find a cluster of blocks in a block group. The goal
575 * is to find at least bytes free and up to empty_size + bytes free. 1160 * is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1172,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
587 struct btrfs_free_space *entry = NULL; 1172 struct btrfs_free_space *entry = NULL;
588 struct rb_node *node; 1173 struct rb_node *node;
589 struct btrfs_free_space *next; 1174 struct btrfs_free_space *next;
590 struct btrfs_free_space *last; 1175 struct btrfs_free_space *last = NULL;
591 u64 min_bytes; 1176 u64 min_bytes;
592 u64 window_start; 1177 u64 window_start;
593 u64 window_free; 1178 u64 window_free;
594 u64 max_extent = 0; 1179 u64 max_extent = 0;
595 int total_retries = 0; 1180 bool found_bitmap = false;
596 int ret; 1181 int ret;
597 1182
598 /* for metadata, allow allocates with more holes */ 1183 /* for metadata, allow allocates with more holes */
@@ -620,31 +1205,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
620 goto out; 1205 goto out;
621 } 1206 }
622again: 1207again:
623 min_bytes = min(min_bytes, bytes + empty_size); 1208 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
624 entry = tree_search_bytes(&block_group->free_space_bytes,
625 offset, min_bytes);
626 if (!entry) { 1209 if (!entry) {
627 ret = -ENOSPC; 1210 ret = -ENOSPC;
628 goto out; 1211 goto out;
629 } 1212 }
1213
1214 /*
1215 * If found_bitmap is true, we exhausted our search for extent entries,
1216 * and we just want to search all of the bitmaps that we can find, and
1217 * ignore any extent entries we find.
1218 */
1219 while (entry->bitmap || found_bitmap ||
1220 (!entry->bitmap && entry->bytes < min_bytes)) {
1221 struct rb_node *node = rb_next(&entry->offset_index);
1222
1223 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1224 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1225 offset, bytes + empty_size,
1226 min_bytes);
1227 if (!ret)
1228 goto got_it;
1229 }
1230
1231 if (!node) {
1232 ret = -ENOSPC;
1233 goto out;
1234 }
1235 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1236 }
1237
1238 /*
1239 * We already searched all the extent entries from the passed in offset
1240 * to the end and didn't find enough space for the cluster, and we also
 1241 * didn't find any bitmaps that met our criteria, so just go ahead and exit
1242 */
1243 if (found_bitmap) {
1244 ret = -ENOSPC;
1245 goto out;
1246 }
1247
1248 cluster->points_to_bitmap = false;
630 window_start = entry->offset; 1249 window_start = entry->offset;
631 window_free = entry->bytes; 1250 window_free = entry->bytes;
632 last = entry; 1251 last = entry;
633 max_extent = entry->bytes; 1252 max_extent = entry->bytes;
634 1253
635 while(1) { 1254 while (1) {
636 /* our window is just right, let's fill it */ 1255 /* our window is just right, let's fill it */
637 if (window_free >= bytes + empty_size) 1256 if (window_free >= bytes + empty_size)
638 break; 1257 break;
639 1258
640 node = rb_next(&last->offset_index); 1259 node = rb_next(&last->offset_index);
641 if (!node) { 1260 if (!node) {
1261 if (found_bitmap)
1262 goto again;
642 ret = -ENOSPC; 1263 ret = -ENOSPC;
643 goto out; 1264 goto out;
644 } 1265 }
645 next = rb_entry(node, struct btrfs_free_space, offset_index); 1266 next = rb_entry(node, struct btrfs_free_space, offset_index);
646 1267
647 /* 1268 /*
1269 * we found a bitmap, so if this search doesn't result in a
1270 * cluster, we know to go and search again for the bitmaps and
1271 * start looking for space there
1272 */
1273 if (next->bitmap) {
1274 if (!found_bitmap)
1275 offset = next->offset;
1276 found_bitmap = true;
1277 last = next;
1278 continue;
1279 }
1280
1281 /*
648 * we haven't filled the empty size and the window is 1282 * we haven't filled the empty size and the window is
649 * very large. reset and try again 1283 * very large. reset and try again
650 */ 1284 */
@@ -655,19 +1289,6 @@ again:
655 window_free = entry->bytes; 1289 window_free = entry->bytes;
656 last = entry; 1290 last = entry;
657 max_extent = 0; 1291 max_extent = 0;
658 total_retries++;
659 if (total_retries % 64 == 0) {
660 if (min_bytes >= (bytes + empty_size)) {
661 ret = -ENOSPC;
662 goto out;
663 }
664 /*
665 * grow our allocation a bit, we're not having
666 * much luck
667 */
668 min_bytes *= 2;
669 goto again;
670 }
671 } else { 1292 } else {
672 last = next; 1293 last = next;
673 window_free += next->bytes; 1294 window_free += next->bytes;
@@ -685,11 +1306,19 @@ again:
685 * The cluster includes an rbtree, but only uses the offset index 1306 * The cluster includes an rbtree, but only uses the offset index
686 * of each free space cache entry. 1307 * of each free space cache entry.
687 */ 1308 */
688 while(1) { 1309 while (1) {
689 node = rb_next(&entry->offset_index); 1310 node = rb_next(&entry->offset_index);
690 unlink_free_space(block_group, entry); 1311 if (entry->bitmap && node) {
1312 entry = rb_entry(node, struct btrfs_free_space,
1313 offset_index);
1314 continue;
1315 } else if (entry->bitmap && !node) {
1316 break;
1317 }
1318
1319 rb_erase(&entry->offset_index, &block_group->free_space_offset);
691 ret = tree_insert_offset(&cluster->root, entry->offset, 1320 ret = tree_insert_offset(&cluster->root, entry->offset,
692 &entry->offset_index); 1321 &entry->offset_index, 0);
693 BUG_ON(ret); 1322 BUG_ON(ret);
694 1323
695 if (!node || entry == last) 1324 if (!node || entry == last)
@@ -697,8 +1326,10 @@ again:
697 1326
698 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1327 entry = rb_entry(node, struct btrfs_free_space, offset_index);
699 } 1328 }
700 ret = 0; 1329
701 cluster->max_size = max_extent; 1330 cluster->max_size = max_extent;
1331got_it:
1332 ret = 0;
702 atomic_inc(&block_group->count); 1333 atomic_inc(&block_group->count);
703 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 1334 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
704 cluster->block_group = block_group; 1335 cluster->block_group = block_group;
@@ -718,6 +1349,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
718 spin_lock_init(&cluster->refill_lock); 1349 spin_lock_init(&cluster->refill_lock);
719 cluster->root.rb_node = NULL; 1350 cluster->root.rb_node = NULL;
720 cluster->max_size = 0; 1351 cluster->max_size = 0;
1352 cluster->points_to_bitmap = false;
721 INIT_LIST_HEAD(&cluster->block_group_list); 1353 INIT_LIST_HEAD(&cluster->block_group_list);
722 cluster->block_group = NULL; 1354 cluster->block_group = NULL;
723} 1355}
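
The free-space-cache changes above hinge on translating disk offsets into bit positions inside a per-entry bitmap, one bit per sector. A minimal sketch of that arithmetic, assuming the helper names used in the diff (offset_to_bit, bytes_to_bits) but with illustrative bodies rather than the committed ones:

	#include <linux/kernel.h>
	#include <linux/math64.h>

	/* One bit per sector; callers guarantee offset >= bitmap_start. */
	static inline unsigned long offset_to_bit(u64 bitmap_start,
						  u32 sectorsize, u64 offset)
	{
		return (unsigned long)div64_u64(offset - bitmap_start,
						sectorsize);
	}

	static inline unsigned long bytes_to_bits(u64 bytes, u32 sectorsize)
	{
		return (unsigned long)div64_u64(bytes, sectorsize);
	}

With a 4096-byte sectorsize, a PAGE_SIZE bitmap carries 32768 bits and so covers 128 MiB of disk, which is why one bitmap entry can stand in for thousands of extent entries.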
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb8764054..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
19#ifndef __BTRFS_FREE_SPACE_CACHE 19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE 20#define __BTRFS_FREE_SPACE_CACHE
21 21
22struct btrfs_free_space {
23 struct rb_node offset_index;
24 u64 offset;
25 u64 bytes;
26 unsigned long *bitmap;
27 struct list_head list;
28};
29
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size); 31 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
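
With the struct now public, an entry is one of two flavours: a plain extent (bitmap == NULL, meaning [offset, offset + bytes) is free) or a bitmap covering a fixed window of sectors starting at offset. A hedged allocation sketch for the bitmap flavour; the GFP_NOFS flags and the PAGE_CACHE_SIZE-sized bitmap are era-consistent assumptions, not taken from the diff:

	#include <linux/list.h>
	#include <linux/pagemap.h>
	#include <linux/slab.h>

	static struct btrfs_free_space *alloc_bitmap_entry(u64 offset)
	{
		struct btrfs_free_space *info;

		info = kzalloc(sizeof(*info), GFP_NOFS);
		if (!info)
			return NULL;
		/* one bit per sector over the window starting at offset */
		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
		if (!info->bitmap) {
			kfree(info);
			return NULL;
		}
		info->offset = offset;
		info->bytes = 0;	/* grows as bits are set */
		INIT_LIST_HEAD(&info->list);
		return info;
	}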
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 791eab19e330..59cba180fe83 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2603 if (root->ref_cows) 2603 if (root->ref_cows)
2604 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2604 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2605 path = btrfs_alloc_path(); 2605 path = btrfs_alloc_path();
2606 path->reada = -1;
2607 BUG_ON(!path); 2606 BUG_ON(!path);
2607 path->reada = -1;
2608 2608
2609 /* FIXME, add redo link to tree so we don't leak on crash */ 2609 /* FIXME, add redo link to tree so we don't leak on crash */
2610 key.objectid = inode->i_ino; 2610 key.objectid = inode->i_ino;
@@ -3099,8 +3099,12 @@ static void inode_tree_add(struct inode *inode)
3099{ 3099{
3100 struct btrfs_root *root = BTRFS_I(inode)->root; 3100 struct btrfs_root *root = BTRFS_I(inode)->root;
3101 struct btrfs_inode *entry; 3101 struct btrfs_inode *entry;
3102 struct rb_node **p = &root->inode_tree.rb_node; 3102 struct rb_node **p;
3103 struct rb_node *parent = NULL; 3103 struct rb_node *parent;
3104
3105again:
3106 p = &root->inode_tree.rb_node;
3107 parent = NULL;
3104 3108
3105 spin_lock(&root->inode_lock); 3109 spin_lock(&root->inode_lock);
3106 while (*p) { 3110 while (*p) {
@@ -3108,13 +3112,16 @@ static void inode_tree_add(struct inode *inode)
3108 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3112 entry = rb_entry(parent, struct btrfs_inode, rb_node);
3109 3113
3110 if (inode->i_ino < entry->vfs_inode.i_ino) 3114 if (inode->i_ino < entry->vfs_inode.i_ino)
3111 p = &(*p)->rb_left; 3115 p = &parent->rb_left;
3112 else if (inode->i_ino > entry->vfs_inode.i_ino) 3116 else if (inode->i_ino > entry->vfs_inode.i_ino)
3113 p = &(*p)->rb_right; 3117 p = &parent->rb_right;
3114 else { 3118 else {
3115 WARN_ON(!(entry->vfs_inode.i_state & 3119 WARN_ON(!(entry->vfs_inode.i_state &
3116 (I_WILL_FREE | I_FREEING | I_CLEAR))); 3120 (I_WILL_FREE | I_FREEING | I_CLEAR)));
3117 break; 3121 rb_erase(parent, &root->inode_tree);
3122 RB_CLEAR_NODE(parent);
3123 spin_unlock(&root->inode_lock);
3124 goto again;
3118 } 3125 }
3119 } 3126 }
3120 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 3127 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
@@ -3126,12 +3133,12 @@ static void inode_tree_del(struct inode *inode)
3126{ 3133{
3127 struct btrfs_root *root = BTRFS_I(inode)->root; 3134 struct btrfs_root *root = BTRFS_I(inode)->root;
3128 3135
3136 spin_lock(&root->inode_lock);
3129 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3137 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3130 spin_lock(&root->inode_lock);
3131 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3138 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3132 spin_unlock(&root->inode_lock);
3133 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3139 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3134 } 3140 }
3141 spin_unlock(&root->inode_lock);
3135} 3142}
3136 3143
3137static noinline void init_btrfs_i(struct inode *inode) 3144static noinline void init_btrfs_i(struct inode *inode)
@@ -4785,8 +4792,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4785 * and the replacement file is large. Start IO on it now so 4792 * and the replacement file is large. Start IO on it now so
4786 * we don't add too much work to the end of the transaction 4793 * we don't add too much work to the end of the transaction
4787 */ 4794 */
4788 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && 4795 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
4789 new_inode->i_size &&
4790 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 4796 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4791 filemap_flush(old_inode->i_mapping); 4797 filemap_flush(old_inode->i_mapping);
4792 4798
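
The inode_tree_add() change above restarts its descent from the root after evicting a stale node, because rb_erase() can rebalance the tree and invalidate the cached parent and link pointers. A self-contained sketch of the idiom with hypothetical types ('struct item' stands in for btrfs_inode):

	#include <linux/kernel.h>
	#include <linux/rbtree.h>
	#include <linux/spinlock.h>
	#include <linux/types.h>

	struct item {
		struct rb_node node;
		u64 key;
		bool dying;	/* set while the item is being freed */
	};

	static void item_tree_add(struct rb_root *tree, spinlock_t *lock,
				  struct item *ins)
	{
		struct rb_node **p, *parent;

	again:
		p = &tree->rb_node;
		parent = NULL;

		spin_lock(lock);
		while (*p) {
			struct item *entry;

			parent = *p;
			entry = rb_entry(parent, struct item, node);

			if (ins->key < entry->key)
				p = &parent->rb_left;
			else if (ins->key > entry->key)
				p = &parent->rb_right;
			else {
				/* a dying duplicate is still linked: unlink
				 * it, drop the lock, and redo the walk --
				 * rb_erase() may have rebalanced the tree */
				WARN_ON(!entry->dying);
				rb_erase(parent, tree);
				RB_CLEAR_NODE(parent);
				spin_unlock(lock);
				goto again;
			}
		}
		rb_link_node(&ins->node, parent, p);
		rb_insert_color(&ins->node, tree);
		spin_unlock(lock);
	}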
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..7b2f401e604e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
740 .nr_to_write = mapping->nrpages * 2, 740 .nr_to_write = mapping->nrpages * 2,
741 .range_start = start, 741 .range_start = start,
742 .range_end = end, 742 .range_end = end,
743 .for_writepages = 1,
744 }; 743 };
745 return btrfs_writepages(mapping, &wbc); 744 return btrfs_writepages(mapping, &wbc);
746} 745}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6d6523da0a30..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
309 } 309 }
310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", 310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
311 (unsigned long long)btrfs_header_bytenr(c), 311 (unsigned long long)btrfs_header_bytenr(c),
312 btrfs_header_level(c), nr, 312 level, nr,
313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); 313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
314 for (i = 0; i < nr; i++) { 314 for (i = 0; i < nr; i++) {
315 btrfs_node_key_to_cpu(c, &key, i); 315 btrfs_node_key_to_cpu(c, &key, i);
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
326 btrfs_level_size(root, level - 1), 326 btrfs_level_size(root, level - 1),
327 btrfs_node_ptr_generation(c, i)); 327 btrfs_node_ptr_generation(c, i));
328 if (btrfs_is_leaf(next) && 328 if (btrfs_is_leaf(next) &&
329 btrfs_header_level(c) != 1) 329 level != 1)
330 BUG(); 330 BUG();
331 if (btrfs_header_level(next) != 331 if (btrfs_header_level(next) !=
332 btrfs_header_level(c) - 1) 332 level - 1)
333 BUG(); 333 BUG();
334 btrfs_print_tree(root, next); 334 btrfs_print_tree(root, next);
335 free_extent_buffer(next); 335 free_extent_buffer(next);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 008397934778..c04f7f212602 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -670,6 +670,8 @@ again:
670 err = ret; 670 err = ret;
671 goto out; 671 goto out;
672 } 672 }
673 if (ret > 0 && path2->slots[level] > 0)
674 path2->slots[level]--;
673 675
674 eb = path2->nodes[level]; 676 eb = path2->nodes[level];
675 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != 677 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1609 BUG_ON(level == 0); 1611 BUG_ON(level == 0);
1610 path->lowest_level = level; 1612 path->lowest_level = level;
1611 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 1613 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1614 path->lowest_level = 0;
1612 if (ret < 0) { 1615 if (ret < 0) {
1613 btrfs_free_path(path); 1616 btrfs_free_path(path);
1614 return ret; 1617 return ret;
@@ -2550,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
2550 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; 2553 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2551 2554
2552 /* make sure the dirty trick played by the caller works */ 2555 /* make sure the dirty trick played by the caller works */
2553 ret = invalidate_inode_pages2_range(inode->i_mapping, 2556 while (1) {
2554 first_index, last_index); 2557 ret = invalidate_inode_pages2_range(inode->i_mapping,
2558 first_index, last_index);
2559 if (ret != -EBUSY)
2560 break;
2561 schedule_timeout(HZ/10);
2562 }
2555 if (ret) 2563 if (ret)
2556 goto out_unlock; 2564 goto out_unlock;
2557 2565
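
One hedged caveat about the retry loop above: schedule_timeout() only sleeps when the task state has been set to TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE beforehand; called from TASK_RUNNING it degenerates to a yield. A sketch of the conventional spelling of a ~100 ms backoff, offered as an aside rather than a claim about the committed code's intent:

	#include <linux/delay.h>
	#include <linux/sched.h>

	static void relocate_backoff(void)
	{
		/* sets the task state internally, then sleeps ~100 ms */
		schedule_timeout_interruptible(HZ / 10);
		/* or simply: msleep(100); */
	}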
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2dbf1c1f56ee..cdbb5022da52 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
40 } 40 }
41} 41}
42 42
43static noinline void switch_commit_root(struct btrfs_root *root)
44{
45 free_extent_buffer(root->commit_root);
46 root->commit_root = btrfs_root_node(root);
47}
48
43/* 49/*
44 * either allocate a new transaction or hop into the existing one 50 * either allocate a new transaction or hop into the existing one
45 */ 51 */
@@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
444 450
445 btrfs_write_dirty_block_groups(trans, root); 451 btrfs_write_dirty_block_groups(trans, root);
446 452
447 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
448 BUG_ON(ret);
449
450 while (1) { 453 while (1) {
451 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 454 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
452 if (old_root_bytenr == root->node->start) 455 if (old_root_bytenr == root->node->start)
@@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
457 &root->root_key, 460 &root->root_key,
458 &root->root_item); 461 &root->root_item);
459 BUG_ON(ret); 462 BUG_ON(ret);
460 btrfs_write_dirty_block_groups(trans, root);
461 463
462 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 464 ret = btrfs_write_dirty_block_groups(trans, root);
463 BUG_ON(ret); 465 BUG_ON(ret);
464 } 466 }
465 free_extent_buffer(root->commit_root); 467
466 root->commit_root = btrfs_root_node(root); 468 if (root != root->fs_info->extent_root)
469 switch_commit_root(root);
470
467 return 0; 471 return 0;
468} 472}
469 473
@@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
495 root = list_entry(next, struct btrfs_root, dirty_list); 499 root = list_entry(next, struct btrfs_root, dirty_list);
496 500
497 update_cowonly_root(trans, root); 501 update_cowonly_root(trans, root);
498
499 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
500 BUG_ON(ret);
501 } 502 }
503
504 down_write(&fs_info->extent_commit_sem);
505 switch_commit_root(fs_info->extent_root);
506 up_write(&fs_info->extent_commit_sem);
507
502 return 0; 508 return 0;
503} 509}
504 510
@@ -544,8 +550,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
544 btrfs_update_reloc_root(trans, root); 550 btrfs_update_reloc_root(trans, root);
545 551
546 if (root->commit_root != root->node) { 552 if (root->commit_root != root->node) {
547 free_extent_buffer(root->commit_root); 553 switch_commit_root(root);
548 root->commit_root = btrfs_root_node(root);
549 btrfs_set_root_node(&root->root_item, 554 btrfs_set_root_node(&root->root_item,
550 root->node); 555 root->node);
551 } 556 }
@@ -852,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root)
852 super->root_level = root_item->level; 857 super->root_level = root_item->level;
853} 858}
854 859
860int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
861{
862 int ret = 0;
863 spin_lock(&info->new_trans_lock);
864 if (info->running_transaction)
865 ret = info->running_transaction->in_commit;
866 spin_unlock(&info->new_trans_lock);
867 return ret;
868}
869
855int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 870int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
856 struct btrfs_root *root) 871 struct btrfs_root *root)
857{ 872{
@@ -943,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
943 958
944 mutex_unlock(&root->fs_info->trans_mutex); 959 mutex_unlock(&root->fs_info->trans_mutex);
945 960
946 if (flush_on_commit || snap_pending) { 961 if (flush_on_commit) {
947 if (flush_on_commit) 962 btrfs_start_delalloc_inodes(root);
948 btrfs_start_delalloc_inodes(root); 963 ret = btrfs_wait_ordered_extents(root, 0);
964 BUG_ON(ret);
965 } else if (snap_pending) {
949 ret = btrfs_wait_ordered_extents(root, 1); 966 ret = btrfs_wait_ordered_extents(root, 1);
950 BUG_ON(ret); 967 BUG_ON(ret);
951 } 968 }
@@ -1009,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1009 1026
1010 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1027 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1011 root->fs_info->tree_root->node); 1028 root->fs_info->tree_root->node);
1012 free_extent_buffer(root->fs_info->tree_root->commit_root); 1029 switch_commit_root(root->fs_info->tree_root);
1013 root->fs_info->tree_root->commit_root =
1014 btrfs_root_node(root->fs_info->tree_root);
1015 1030
1016 btrfs_set_root_node(&root->fs_info->chunk_root->root_item, 1031 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1017 root->fs_info->chunk_root->node); 1032 root->fs_info->chunk_root->node);
1018 free_extent_buffer(root->fs_info->chunk_root->commit_root); 1033 switch_commit_root(root->fs_info->chunk_root);
1019 root->fs_info->chunk_root->commit_root =
1020 btrfs_root_node(root->fs_info->chunk_root);
1021 1034
1022 update_super_roots(root); 1035 update_super_roots(root);
1023 1036
@@ -1057,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1057 cur_trans->commit_done = 1; 1070 cur_trans->commit_done = 1;
1058 1071
1059 root->fs_info->last_trans_committed = cur_trans->transid; 1072 root->fs_info->last_trans_committed = cur_trans->transid;
1073
1060 wake_up(&cur_trans->commit_wait); 1074 wake_up(&cur_trans->commit_wait);
1061 1075
1062 put_transaction(cur_trans); 1076 put_transaction(cur_trans);
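
btrfs_transaction_in_commit() samples the running transaction under new_trans_lock, so the answer is only a hint: the in_commit flag can change the instant the lock is released. A hypothetical caller would therefore treat it as advisory, for example to throttle new work while a commit is in flight (sketch, not taken from this merge):

	#include <linux/sched.h>

	static void throttle_on_commit(struct btrfs_fs_info *info)
	{
		/* advisory poll: back off while a commit is in progress */
		while (btrfs_transaction_in_commit(info))
			schedule_timeout_uninterruptible(1);
	}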
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 961c3ee5a2e1..663c67404918 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root); 107 struct btrfs_root *root);
108int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 108int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
109 struct extent_io_tree *dirty_pages); 109 struct extent_io_tree *dirty_pages);
110int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
110#endif 111#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c13922206d1b..d91b0de7c502 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
797 return -ENOENT; 797 return -ENOENT;
798 798
799 inode = read_one_inode(root, key->objectid); 799 inode = read_one_inode(root, key->objectid);
800 BUG_ON(!dir); 800 BUG_ON(!inode);
801 801
802 ref_ptr = btrfs_item_ptr_offset(eb, slot); 802 ref_ptr = btrfs_item_ptr_offset(eb, slot);
803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab80e9cd767..5cf405b0828d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
260 num_run++; 260 num_run++;
261 batch_run++; 261 batch_run++;
262 262
263 if (bio_sync(cur)) 263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
264 num_sync_run++; 264 num_sync_run++;
265 265
266 if (need_resched()) { 266 if (need_resched()) {
@@ -721,7 +721,8 @@ error:
721 */ 721 */
722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, 722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
723 struct btrfs_device *device, 723 struct btrfs_device *device,
724 u64 num_bytes, u64 *start) 724 u64 num_bytes, u64 *start,
725 u64 *max_avail)
725{ 726{
726 struct btrfs_key key; 727 struct btrfs_key key;
727 struct btrfs_root *root = device->dev_root; 728 struct btrfs_root *root = device->dev_root;
@@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
758 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 759 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
759 if (ret < 0) 760 if (ret < 0)
760 goto error; 761 goto error;
761 ret = btrfs_previous_item(root, path, 0, key.type); 762 if (ret > 0) {
762 if (ret < 0) 763 ret = btrfs_previous_item(root, path, key.objectid, key.type);
763 goto error; 764 if (ret < 0)
765 goto error;
766 if (ret > 0)
767 start_found = 1;
768 }
764 l = path->nodes[0]; 769 l = path->nodes[0];
765 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 770 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
766 while (1) { 771 while (1) {
@@ -803,6 +808,10 @@ no_more_items:
803 if (last_byte < search_start) 808 if (last_byte < search_start)
804 last_byte = search_start; 809 last_byte = search_start;
805 hole_size = key.offset - last_byte; 810 hole_size = key.offset - last_byte;
811
812 if (hole_size > *max_avail)
813 *max_avail = hole_size;
814
806 if (key.offset > last_byte && 815 if (key.offset > last_byte &&
807 hole_size >= num_bytes) { 816 hole_size >= num_bytes) {
808 *start = last_byte; 817 *start = last_byte;
@@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1621 device->fs_devices->total_rw_bytes += diff; 1630 device->fs_devices->total_rw_bytes += diff;
1622 1631
1623 device->total_bytes = new_size; 1632 device->total_bytes = new_size;
1633 device->disk_total_bytes = new_size;
1624 btrfs_clear_space_info_full(device->dev_root->fs_info); 1634 btrfs_clear_space_info_full(device->dev_root->fs_info);
1625 1635
1626 return btrfs_update_device(trans, device); 1636 return btrfs_update_device(trans, device);
@@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2007 goto done; 2017 goto done;
2008 if (ret) { 2018 if (ret) {
2009 ret = 0; 2019 ret = 0;
2010 goto done; 2020 break;
2011 } 2021 }
2012 2022
2013 l = path->nodes[0]; 2023 l = path->nodes[0];
@@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2015 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2025 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2016 2026
2017 if (key.objectid != device->devid) 2027 if (key.objectid != device->devid)
2018 goto done; 2028 break;
2019 2029
2020 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2030 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2021 length = btrfs_dev_extent_length(l, dev_extent); 2031 length = btrfs_dev_extent_length(l, dev_extent);
@@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2171 max_chunk_size); 2181 max_chunk_size);
2172 2182
2173again: 2183again:
2184 max_avail = 0;
2174 if (!map || map->num_stripes != num_stripes) { 2185 if (!map || map->num_stripes != num_stripes) {
2175 kfree(map); 2186 kfree(map);
2176 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2187 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2219,7 +2230,8 @@ again:
2219 2230
2220 if (device->in_fs_metadata && avail >= min_free) { 2231 if (device->in_fs_metadata && avail >= min_free) {
2221 ret = find_free_dev_extent(trans, device, 2232 ret = find_free_dev_extent(trans, device,
2222 min_free, &dev_offset); 2233 min_free, &dev_offset,
2234 &max_avail);
2223 if (ret == 0) { 2235 if (ret == 0) {
2224 list_move_tail(&device->dev_alloc_list, 2236 list_move_tail(&device->dev_alloc_list,
2225 &private_devs); 2237 &private_devs);
@@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2795 } 2807 }
2796 } 2808 }
2797 2809
2798 for (i = 0; i > nr; i++) {
2799 struct btrfs_multi_bio *multi;
2800 struct btrfs_bio_stripe *stripe;
2801 int ret;
2802
2803 length = 1;
2804 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2805 &length, &multi, 0);
2806 BUG_ON(ret);
2807
2808 stripe = multi->stripes;
2809 for (j = 0; j < multi->num_stripes; j++) {
2810 if (stripe->physical >= physical &&
2811 physical < stripe->physical + length)
2812 break;
2813 }
2814 BUG_ON(j >= multi->num_stripes);
2815 kfree(multi);
2816 }
2817
2818 *logical = buf; 2810 *logical = buf;
2819 *naddrs = nr; 2811 *naddrs = nr;
2820 *stripe_len = map->stripe_len; 2812 *stripe_len = map->stripe_len;
@@ -2911,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2911 bio->bi_rw |= rw; 2903 bio->bi_rw |= rw;
2912 2904
2913 spin_lock(&device->io_lock); 2905 spin_lock(&device->io_lock);
2914 if (bio_sync(bio)) 2906 if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
2915 pending_bios = &device->pending_sync_bios; 2907 pending_bios = &device->pending_sync_bios;
2916 else 2908 else
2917 pending_bios = &device->pending_bios; 2909 pending_bios = &device->pending_bios;
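
The new *max_avail out-parameter of find_free_dev_extent() reports the largest hole seen even when the search fails, so the chunk allocator can shrink its request on a retry instead of giving up outright. A hedged caller sketch; the retry policy here is illustrative, not the diff's:

	static int pick_dev_extent(struct btrfs_trans_handle *trans,
				   struct btrfs_device *device,
				   u64 *min_free, u64 *dev_offset)
	{
		u64 max_avail = 0;
		int ret;

		ret = find_free_dev_extent(trans, device, *min_free,
					   dev_offset, &max_avail);
		if (ret == -ENOSPC && max_avail)
			*min_free = max_avail;	/* candidate size for retry */
		return ret;
	}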
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d32..3e2b90eaa239 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
208 *total_in = 0; 208 *total_in = 0;
209 209
210 workspace = find_zlib_workspace(); 210 workspace = find_zlib_workspace();
211 if (!workspace) 211 if (IS_ERR(workspace))
212 return -1; 212 return -1;
213 213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
366 char *kaddr; 366 char *kaddr;
367 367
368 workspace = find_zlib_workspace(); 368 workspace = find_zlib_workspace();
369 if (!workspace) 369 if (IS_ERR(workspace))
370 return -ENOMEM; 370 return -ENOMEM;
371 371
372 data_in = kmap(pages_in[page_in_index]); 372 data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
547 return -ENOMEM; 547 return -ENOMEM;
548 548
549 workspace = find_zlib_workspace(); 549 workspace = find_zlib_workspace();
550 if (!workspace) 550 if (IS_ERR(workspace))
551 return -ENOMEM; 551 return -ENOMEM;
552 552
553 workspace->inf_strm.next_in = data_in; 553 workspace->inf_strm.next_in = data_in;
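
All three zlib fixes above are the same idiom: find_zlib_workspace() reports failure with ERR_PTR(-errno), never NULL, so a NULL test silently accepts an error-encoded pointer. The standard caller shape (sketch; 'workspace' and free_workspace() are the file's own names, the rest is illustrative):

	#include <linux/err.h>

	static int with_workspace(void)
	{
		struct workspace *ws = find_zlib_workspace();

		if (IS_ERR(ws))
			return PTR_ERR(ws);	/* a !ws check would miss this */
		/* ... compress or decompress using ws ... */
		free_workspace(ws);
		return 0;
	}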
diff --git a/fs/buffer.c b/fs/buffer.c
index a3ef091a45bd..90a98865b0cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
281 struct zone *zone; 281 struct zone *zone;
282 int nid; 282 int nid;
283 283
284 wakeup_pdflush(1024); 284 wakeup_flusher_threads(1024);
285 yield(); 285 yield();
286 286
287 for_each_online_node(nid) { 287 for_each_online_node(nid) {
@@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
1165 1165
1166 if (!test_set_buffer_dirty(bh)) { 1166 if (!test_set_buffer_dirty(bh)) {
1167 struct page *page = bh->b_page; 1167 struct page *page = bh->b_page;
1168 if (!TestSetPageDirty(page)) 1168 if (!TestSetPageDirty(page)) {
1169 __set_page_dirty(page, page_mapping(page), 0); 1169 struct address_space *mapping = page_mapping(page);
1170 if (mapping)
1171 __set_page_dirty(page, mapping, 0);
1172 }
1170 } 1173 }
1171} 1174}
1172 1175
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..3cbc57f932d2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
31 * - no readahead or I/O queue unplugging required 31 * - no readahead or I/O queue unplugging required
32 */ 32 */
33struct backing_dev_info directly_mappable_cdev_bdi = { 33struct backing_dev_info directly_mappable_cdev_bdi = {
34 .name = "char",
34 .capabilities = ( 35 .capabilities = (
35#ifdef CONFIG_MMU 36#ifdef CONFIG_MMU
36 /* permit private copies of the data to be taken */ 37 /* permit private copies of the data to be taken */
@@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
237} 238}
238 239
239/** 240/**
240 * register_chrdev() - Register a major number for character devices. 241 * __register_chrdev() - create and register a cdev occupying a range of minors
241 * @major: major device number or 0 for dynamic allocation 242 * @major: major device number or 0 for dynamic allocation
243 * @baseminor: first of the requested range of minor numbers
244 * @count: the number of minor numbers required
242 * @name: name of this range of devices 245 * @name: name of this range of devices
243 * @fops: file operations associated with this devices 246 * @fops: file operations associated with this devices
244 * 247 *
@@ -254,19 +257,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
254 * /dev. It only helps to keep track of the different owners of devices. If 257 * /dev. It only helps to keep track of the different owners of devices. If
255 * your module name has only one type of devices it's ok to use e.g. the name 258 * your module name has only one type of devices it's ok to use e.g. the name
256 * of the module here. 259 * of the module here.
257 *
258 * This function registers a range of 256 minor numbers. The first minor number
259 * is 0.
260 */ 260 */
261int register_chrdev(unsigned int major, const char *name, 261int __register_chrdev(unsigned int major, unsigned int baseminor,
262 const struct file_operations *fops) 262 unsigned int count, const char *name,
263 const struct file_operations *fops)
263{ 264{
264 struct char_device_struct *cd; 265 struct char_device_struct *cd;
265 struct cdev *cdev; 266 struct cdev *cdev;
266 char *s; 267 char *s;
267 int err = -ENOMEM; 268 int err = -ENOMEM;
268 269
269 cd = __register_chrdev_region(major, 0, 256, name); 270 cd = __register_chrdev_region(major, baseminor, count, name);
270 if (IS_ERR(cd)) 271 if (IS_ERR(cd))
271 return PTR_ERR(cd); 272 return PTR_ERR(cd);
272 273
@@ -280,7 +281,7 @@ int register_chrdev(unsigned int major, const char *name,
280 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) 281 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
281 *s = '!'; 282 *s = '!';
282 283
283 err = cdev_add(cdev, MKDEV(cd->major, 0), 256); 284 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
284 if (err) 285 if (err)
285 goto out; 286 goto out;
286 287
@@ -290,7 +291,7 @@ int register_chrdev(unsigned int major, const char *name,
290out: 291out:
291 kobject_put(&cdev->kobj); 292 kobject_put(&cdev->kobj);
292out2: 293out2:
293 kfree(__unregister_chrdev_region(cd->major, 0, 256)); 294 kfree(__unregister_chrdev_region(cd->major, baseminor, count));
294 return err; 295 return err;
295} 296}
296 297
@@ -316,10 +317,23 @@ void unregister_chrdev_region(dev_t from, unsigned count)
316 } 317 }
317} 318}
318 319
319void unregister_chrdev(unsigned int major, const char *name) 320/**
321 * __unregister_chrdev - unregister and destroy a cdev
322 * @major: major device number
323 * @baseminor: first of the range of minor numbers
324 * @count: the number of minor numbers this cdev is occupying
325 * @name: name of this range of devices
326 *
327 * Unregister and destroy the cdev occupying the region described by
328 * @major, @baseminor and @count. This function undoes what
329 * __register_chrdev() did.
330 */
331void __unregister_chrdev(unsigned int major, unsigned int baseminor,
332 unsigned int count, const char *name)
320{ 333{
321 struct char_device_struct *cd; 334 struct char_device_struct *cd;
322 cd = __unregister_chrdev_region(major, 0, 256); 335
336 cd = __unregister_chrdev_region(major, baseminor, count);
323 if (cd && cd->cdev) 337 if (cd && cd->cdev)
324 cdev_del(cd->cdev); 338 cdev_del(cd->cdev);
325 kfree(cd); 339 kfree(cd);
@@ -568,6 +582,6 @@ EXPORT_SYMBOL(cdev_alloc);
568EXPORT_SYMBOL(cdev_del); 582EXPORT_SYMBOL(cdev_del);
569EXPORT_SYMBOL(cdev_add); 583EXPORT_SYMBOL(cdev_add);
570EXPORT_SYMBOL(cdev_index); 584EXPORT_SYMBOL(cdev_index);
571EXPORT_SYMBOL(register_chrdev); 585EXPORT_SYMBOL(__register_chrdev);
572EXPORT_SYMBOL(unregister_chrdev); 586EXPORT_SYMBOL(__unregister_chrdev);
573EXPORT_SYMBOL(directly_mappable_cdev_bdi); 587EXPORT_SYMBOL(directly_mappable_cdev_bdi);
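
A hedged usage sketch for the generalized interface: claim 16 minors starting at minor 8 under a dynamically allocated major. Passing baseminor = 0 and count = 256 reproduces the old register_chrdev() behaviour; the driver name and fops below are placeholders:

	#include <linux/fs.h>
	#include <linux/module.h>

	static const struct file_operations demo_fops; /* placeholder */
	static int demo_major;

	static int __init demo_init(void)
	{
		/* returns the allocated major when 0 is passed for @major */
		demo_major = __register_chrdev(0, 8, 16, "demo", &demo_fops);
		return demo_major < 0 ? demo_major : 0;
	}

	static void __exit demo_exit(void)
	{
		__unregister_chrdev(demo_major, 8, 16, "demo");
	}

	module_init(demo_init);
	module_exit(demo_exit);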
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 92888aa90749..145540a316ab 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,13 @@
1Version 1.60
2-------------
3Fix memory leak in reconnect. Fix oops in DFS mount error path.
4Set s_maxbytes to a smaller value (the max the vfs can handle) so
5that sendfile works over cifs mounts again. Add noforcegid
6and noforceuid mount parameters. Fix small mem leak when using
7ntlmv2. Allow a 2nd mount to the same server on a different port
8(rather than reusing the 1st port), but only when the user
9explicitly overrides the port on the 2nd mount.
10
1Version 1.59 11Version 1.59
2------------ 12------------
3Client uses server inode numbers (which are persistent) rather than 13Client uses server inode numbers (which are persistent) rather than
diff --git a/fs/cifs/README b/fs/cifs/README
index ad92921dbde4..79c1a93400be 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,11 +262,11 @@ A partial list of the supported mount options follows:
262 mount. 262 mount.
263 domain Set the SMB/CIFS workgroup name prepended to the 263 domain Set the SMB/CIFS workgroup name prepended to the
264 username during CIFS session establishment 264 username during CIFS session establishment
265 forceuid Set the default uid for inodes based on the uid 265 forceuid Set the default uid for inodes to the uid
266 passed in. For mounts to servers 266 passed in on mount. For mounts to servers
267 which do support the CIFS Unix extensions, such as a 267 which do support the CIFS Unix extensions, such as a
268 properly configured Samba server, the server provides 268 properly configured Samba server, the server provides
269 the uid, gid and mode so this parameter should not be 269 the uid, gid and mode so this parameter should not be
270 specified unless the server and clients uid and gid 270 specified unless the server and clients uid and gid
271 numbering differ. If the server and client are in the 271 numbering differ. If the server and client are in the
272 same domain (e.g. running winbind or nss_ldap) and 272 same domain (e.g. running winbind or nss_ldap) and
@@ -278,11 +278,7 @@ A partial list of the supported mount options follows:
278 of existing files will be the uid (gid) of the person 278 of existing files will be the uid (gid) of the person
279 who executed the mount (root, except when mount.cifs 279 who executed the mount (root, except when mount.cifs
280 is configured setuid for user mounts) unless the "uid=" 280 is configured setuid for user mounts) unless the "uid="
281 (gid) mount option is specified. For the uid (gid) of newly 281 (gid) mount option is specified. Also note that permission
282 created files and directories, ie files created since
283 the last mount of the server share, the expected uid
284 (gid) is cached as long as the inode remains in
285 memory on the client. Also note that permission
286 checks (authorization checks) on accesses to a file occur 282 checks (authorization checks) on accesses to a file occur
287 at the server, but there are cases in which an administrator 283 at the server, but there are cases in which an administrator
288 may want to restrict at the client as well. For those 284 may want to restrict at the client as well. For those
@@ -290,12 +286,15 @@ A partial list of the supported mount options follows:
290 (such as Windows), permissions can also be checked at the 286 (such as Windows), permissions can also be checked at the
291 client, and a crude form of client side permission checking 287 client, and a crude form of client side permission checking
292 can be enabled by specifying file_mode and dir_mode on 288 can be enabled by specifying file_mode and dir_mode on
293 the client. Note that the mount.cifs helper must be 289 the client. (default)
294 at version 1.10 or higher to support specifying the uid 290 forcegid (similar to above but for the groupid instead of uid) (default)
295 (or gid) in non-numeric form. 291 noforceuid Fill in file owner information (uid) by requesting it from
296 forcegid (similar to above but for the groupid instead of uid) 292 the server if possible. With this option, the value given in
293 the uid= option (on mount) will only be used if the server
294 can not support returning uids on inodes.
295 noforcegid (similar to above but for the group owner, gid, instead of uid)
297 uid Set the default uid for inodes, and indicate to the 296 uid Set the default uid for inodes, and indicate to the
298 cifs kernel driver which local user mounted . If the server 297 cifs kernel driver which local user mounted. If the server
299 supports the unix extensions the default uid is 298 supports the unix extensions the default uid is
300 not used to fill in the owner fields of inodes (files) 299 not used to fill in the owner fields of inodes (files)
301 unless the "forceuid" parameter is specified. 300 unless the "forceuid" parameter is specified.
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 3bb11be8b6a8..606912d8f2a8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
55 * i.e. strips from UNC trailing path that is not part of share 55 * i.e. strips from UNC trailing path that is not part of share
56 * name and fix up missing '\' in the beginning of DFS node referral 56 * name and fix up missing '\' in the beginning of DFS node referral
57 * if necessary. 57 * if necessary.
58 * Returns pointer to share name on success or NULL on error. 58 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 59 * Caller is responsible for freeing returned string.
60 */ 60 */
61static char *cifs_get_share_name(const char *node_name) 61static char *cifs_get_share_name(const char *node_name)
@@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name)
68 UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, 68 UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
69 GFP_KERNEL); 69 GFP_KERNEL);
70 if (!UNC) 70 if (!UNC)
71 return NULL; 71 return ERR_PTR(-ENOMEM);
72 72
73 /* get share name and server name */ 73 /* get share name and server name */
74 if (node_name[1] != '\\') { 74 if (node_name[1] != '\\') {
@@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name)
87 cERROR(1, ("%s: no server name end in node name: %s", 87 cERROR(1, ("%s: no server name end in node name: %s",
88 __func__, node_name)); 88 __func__, node_name));
89 kfree(UNC); 89 kfree(UNC);
90 return NULL; 90 return ERR_PTR(-EINVAL);
91 } 91 }
92 92
93 /* find sharename end */ 93 /* find sharename end */
@@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
133 return ERR_PTR(-EINVAL); 133 return ERR_PTR(-EINVAL);
134 134
135 *devname = cifs_get_share_name(ref->node_name); 135 *devname = cifs_get_share_name(ref->node_name);
136 if (IS_ERR(*devname)) {
137 rc = PTR_ERR(*devname);
138 *devname = NULL;
139 goto compose_mount_options_err;
140 }
141
136 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
137 if (rc != 0) { 143 if (rc != 0) {
138 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 051caecf7d67..8ec7736ce954 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
125 if (server->addr.sockAddr.sin_family == AF_INET) 125 if (server->addr.sockAddr.sin_family == AF_INET)
126 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); 126 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
127 else if (server->addr.sockAddr.sin_family == AF_INET6) 127 else if (server->addr.sockAddr.sin_family == AF_INET6)
128 sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr); 128 sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
129 else 129 else
130 goto out; 130 goto out;
131 131
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 60e3c4253de0..714a542cbafc 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
44 int maxwords = maxbytes / 2; 44 int maxwords = maxbytes / 2;
45 char tmp[NLS_MAX_CHARSET_SIZE]; 45 char tmp[NLS_MAX_CHARSET_SIZE];
46 46
47 for (i = 0; from[i] && i < maxwords; i++) { 47 for (i = 0; i < maxwords && from[i]; i++) {
48 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, 48 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
49 NLS_MAX_CHARSET_SIZE); 49 NLS_MAX_CHARSET_SIZE);
50 if (charlen > 0) 50 if (charlen > 0)
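
The one-character cifs_ucs2_bytes() fix above is the classic operand-order bug: with "from[i] && i < maxwords" the array is dereferenced before the bound is tested, so a string that exactly fills the buffer reads one __le16 past it. Short-circuit evaluation makes the swapped form safe; a minimal restatement:

	#include <linux/types.h>

	static int count_ucs2_words(const __le16 *from, int maxwords)
	{
		int i;

		/* bound first: from[i] is only read once i is in range */
		for (i = 0; i < maxwords && from[i]; i++)
			;
		return i;
	}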
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6941c22398a6..7dfe0842a6f6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
607 return get_cifs_acl_by_path(cifs_sb, path, pacllen); 607 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
608 608
609 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); 609 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
610 atomic_dec(&open_file->wrtPending); 610 cifsFileInfo_put(open_file);
611 return pntsd; 611 return pntsd;
612} 612}
613 613
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
665 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); 665 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
666 666
667 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); 667 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
668 atomic_dec(&open_file->wrtPending); 668 cifsFileInfo_put(open_file);
669 return rc; 669 return rc;
670} 670}
671 671
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7c9809523f42..7efe1745494d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -373,6 +373,7 @@ calc_exit_2:
373 compare with the NTLM example */ 373 compare with the NTLM example */
374 hmac_md5_final(ses->server->ntlmv2_hash, pctxt); 374 hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
375 375
376 kfree(pctxt);
376 return rc; 377 return rc;
377} 378}
378 379
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 44f30504b82d..3610e9958b4c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
361static int 361static int
362cifs_show_options(struct seq_file *s, struct vfsmount *m) 362cifs_show_options(struct seq_file *s, struct vfsmount *m)
363{ 363{
364 struct cifs_sb_info *cifs_sb; 364 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
365 struct cifsTconInfo *tcon; 365 struct cifsTconInfo *tcon = cifs_sb->tcon;
366
367 cifs_sb = CIFS_SB(m->mnt_sb);
368 tcon = cifs_sb->tcon;
369 366
370 seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); 367 seq_printf(s, ",unc=%s", tcon->treeName);
371 if (tcon->ses->userName) 368 if (tcon->ses->userName)
372 seq_printf(s, ",username=%s", tcon->ses->userName); 369 seq_printf(s, ",username=%s", tcon->ses->userName);
373 if (tcon->ses->domainName) 370 if (tcon->ses->domainName)
@@ -376,10 +373,14 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
376 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); 373 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
377 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) 374 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
378 seq_printf(s, ",forceuid"); 375 seq_printf(s, ",forceuid");
376 else
377 seq_printf(s, ",noforceuid");
379 378
380 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); 379 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
381 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) 380 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
382 seq_printf(s, ",forcegid"); 381 seq_printf(s, ",forcegid");
382 else
383 seq_printf(s, ",noforcegid");
383 384
384 cifs_show_address(s, tcon->ses->server); 385 cifs_show_address(s, tcon->ses->server);
385 386
@@ -985,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg)
985 if (try_to_freeze()) 986 if (try_to_freeze())
986 continue; 987 continue;
987 988
988 spin_lock(&GlobalMid_Lock); 989 spin_lock(&cifs_oplock_lock);
989 if (list_empty(&GlobalOplock_Q)) { 990 if (list_empty(&cifs_oplock_list)) {
990 spin_unlock(&GlobalMid_Lock); 991 spin_unlock(&cifs_oplock_lock);
991 set_current_state(TASK_INTERRUPTIBLE); 992 set_current_state(TASK_INTERRUPTIBLE);
992 schedule_timeout(39*HZ); 993 schedule_timeout(39*HZ);
993 } else { 994 } else {
994 oplock_item = list_entry(GlobalOplock_Q.next, 995 oplock_item = list_entry(cifs_oplock_list.next,
995 struct oplock_q_entry, qhead); 996 struct oplock_q_entry, qhead);
996 cFYI(1, ("found oplock item to write out")); 997 cFYI(1, ("found oplock item to write out"));
997 pTcon = oplock_item->tcon; 998 pTcon = oplock_item->tcon;
998 inode = oplock_item->pinode; 999 inode = oplock_item->pinode;
999 netfid = oplock_item->netfid; 1000 netfid = oplock_item->netfid;
1000 spin_unlock(&GlobalMid_Lock); 1001 spin_unlock(&cifs_oplock_lock);
1001 DeleteOplockQEntry(oplock_item); 1002 DeleteOplockQEntry(oplock_item);
1002 /* can not grab inode sem here since it would 1003 /* can not grab inode sem here since it would
1003 deadlock when oplock received on delete 1004 deadlock when oplock received on delete
@@ -1054,7 +1055,7 @@ init_cifs(void)
1054 int rc = 0; 1055 int rc = 0;
1055 cifs_proc_init(); 1056 cifs_proc_init();
1056 INIT_LIST_HEAD(&cifs_tcp_ses_list); 1057 INIT_LIST_HEAD(&cifs_tcp_ses_list);
1057 INIT_LIST_HEAD(&GlobalOplock_Q); 1058 INIT_LIST_HEAD(&cifs_oplock_list);
1058#ifdef CONFIG_CIFS_EXPERIMENTAL 1059#ifdef CONFIG_CIFS_EXPERIMENTAL
1059 INIT_LIST_HEAD(&GlobalDnotifyReqList); 1060 INIT_LIST_HEAD(&GlobalDnotifyReqList);
1060 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); 1061 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1083,6 +1084,7 @@ init_cifs(void)
1083 rwlock_init(&GlobalSMBSeslock); 1084 rwlock_init(&GlobalSMBSeslock);
1084 rwlock_init(&cifs_tcp_ses_lock); 1085 rwlock_init(&cifs_tcp_ses_lock);
1085 spin_lock_init(&GlobalMid_Lock); 1086 spin_lock_init(&GlobalMid_Lock);
1087 spin_lock_init(&cifs_oplock_lock);
1086 1088
1087 if (cifs_max_pending < 2) { 1089 if (cifs_max_pending < 2) {
1088 cifs_max_pending = 2; 1090 cifs_max_pending = 2;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6c170948300d..094325e3f714 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
113extern const struct export_operations cifs_export_ops; 113extern const struct export_operations cifs_export_ops;
114#endif /* EXPERIMENTAL */ 114#endif /* EXPERIMENTAL */
115 115
116#define CIFS_VERSION "1.60" 116#define CIFS_VERSION "1.61"
117#endif /* _CIFSFS_H */ 117#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6084d6379c03..6cfc81a32703 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -351,11 +351,24 @@ struct cifsFileInfo {
351 bool closePend:1; /* file is marked to close */ 351 bool closePend:1; /* file is marked to close */
352 bool invalidHandle:1; /* file closed via session abend */ 352 bool invalidHandle:1; /* file closed via session abend */
353 bool messageMode:1; /* for pipes: message vs byte mode */ 353 bool messageMode:1; /* for pipes: message vs byte mode */
354 atomic_t wrtPending; /* handle in use - defer close */ 354 atomic_t count; /* reference count */
355 struct mutex fh_mutex; /* prevents reopen race after dead ses*/ 355 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
356 struct cifs_search_info srch_inf; 356 struct cifs_search_info srch_inf;
357}; 357};
358 358
359/* Take a reference on the file private data */
360static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
361{
362 atomic_inc(&cifs_file->count);
363}
364
365/* Release a reference on the file private data */
366static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
367{
368 if (atomic_dec_and_test(&cifs_file->count))
369 kfree(cifs_file);
370}
371
359/* 372/*
360 * One of these for each file inode 373 * One of these for each file inode
361 */ 374 */
@@ -656,7 +669,11 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
656 */ 669 */
657GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; 670GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
658 671
659GLOBAL_EXTERN struct list_head GlobalOplock_Q; 672/* Global list of oplocks */
673GLOBAL_EXTERN struct list_head cifs_oplock_list;
674
675/* Protects the cifs_oplock_list */
676GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
660 677
661/* Outstanding dir notify requests */ 678/* Outstanding dir notify requests */
662GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 679GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
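
The new cifsFileInfo_get()/cifsFileInfo_put() helpers replace the old wrtPending write counter with a real reference count: the structure is freed by whoever drops the last reference, instead of the close path sleeping until pending writers drain. A minimal user-space sketch of the same shape using C11 atomics (illustrative, not the kernel code):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct file_info {
            atomic_int count;           /* starts at 1 for the opener */
            /* ... handle state ... */
    };

    static void file_info_get(struct file_info *f)
    {
            atomic_fetch_add(&f->count, 1);
    }

    static void file_info_put(struct file_info *f)
    {
            if (atomic_fetch_sub(&f->count, 1) == 1)
                    free(f);            /* last reference dropped */
    }

atomic_fetch_sub() returns the previous value, so seeing 1 means this caller held the final reference, matching the atomic_dec_and_test() used in the patch.
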
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1866bc2927d4..301e307e1279 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
100 to this tcon */ 100 to this tcon */
101} 101}
102 102
103/* Allocate and return pointer to an SMB request buffer, and set basic 103/* reconnect the socket, tcon, and smb session if needed */
104 SMB information in the SMB header. If the return code is zero, this
105 function must have filled in request_buf pointer */
106static int 104static int
107small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 105cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
108 void **request_buf)
109{ 106{
110 int rc = 0; 107 int rc = 0;
108 struct cifsSesInfo *ses;
109 struct TCP_Server_Info *server;
110 struct nls_table *nls_codepage;
111 111
112 /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so 112 /*
113 check for tcp and smb session status done differently 113 * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
114 for those three - in the calling routine */ 114 * tcp and smb session status done differently for those three - in the
115 if (tcon) { 115 * calling routine
116 if (tcon->tidStatus == CifsExiting) { 116 */
117 /* only tree disconnect, open, and write, 117 if (!tcon)
118 (and ulogoff which does not have tcon) 118 return 0;
119 are allowed as we start force umount */ 119
120 if ((smb_command != SMB_COM_WRITE_ANDX) && 120 ses = tcon->ses;
121 (smb_command != SMB_COM_OPEN_ANDX) && 121 server = ses->server;
122 (smb_command != SMB_COM_TREE_DISCONNECT)) { 122
123 cFYI(1, ("can not send cmd %d while umounting", 123 /*
124 smb_command)); 124 * only tree disconnect, open, and write, (and ulogoff which does not
125 return -ENODEV; 125 * have tcon) are allowed as we start force umount
126 } 126 */
127 if (tcon->tidStatus == CifsExiting) {
128 if (smb_command != SMB_COM_WRITE_ANDX &&
129 smb_command != SMB_COM_OPEN_ANDX &&
130 smb_command != SMB_COM_TREE_DISCONNECT) {
131 cFYI(1, ("can not send cmd %d while umounting",
132 smb_command));
133 return -ENODEV;
127 } 134 }
128 if ((tcon->ses) && (tcon->ses->status != CifsExiting) && 135 }
129 (tcon->ses->server)) {
130 struct nls_table *nls_codepage;
131 /* Give Demultiplex thread up to 10 seconds to
132 reconnect, should be greater than cifs socket
133 timeout which is 7 seconds */
134 while (tcon->ses->server->tcpStatus ==
135 CifsNeedReconnect) {
136 wait_event_interruptible_timeout(tcon->ses->server->response_q,
137 (tcon->ses->server->tcpStatus ==
138 CifsGood), 10 * HZ);
139 if (tcon->ses->server->tcpStatus ==
140 CifsNeedReconnect) {
141 /* on "soft" mounts we wait once */
142 if (!tcon->retry ||
143 (tcon->ses->status == CifsExiting)) {
144 cFYI(1, ("gave up waiting on "
145 "reconnect in smb_init"));
146 return -EHOSTDOWN;
147 } /* else "hard" mount - keep retrying
148 until process is killed or server
149 comes back on-line */
150 } else /* TCP session is reestablished now */
151 break;
152 }
153 136
154 nls_codepage = load_nls_default(); 137 if (ses->status == CifsExiting)
155 /* need to prevent multiple threads trying to 138 return -EIO;
156 simultaneously reconnect the same SMB session */
157 down(&tcon->ses->sesSem);
158 if (tcon->ses->need_reconnect)
159 rc = cifs_setup_session(0, tcon->ses,
160 nls_codepage);
161 if (!rc && (tcon->need_reconnect)) {
162 mark_open_files_invalid(tcon);
163 rc = CIFSTCon(0, tcon->ses, tcon->treeName,
164 tcon, nls_codepage);
165 up(&tcon->ses->sesSem);
166 /* BB FIXME add code to check if wsize needs
167 update due to negotiated smb buffer size
168 shrinking */
169 if (rc == 0) {
170 atomic_inc(&tconInfoReconnectCount);
171 /* tell server Unix caps we support */
172 if (tcon->ses->capabilities & CAP_UNIX)
173 reset_cifs_unix_caps(
174 0 /* no xid */,
175 tcon,
176 NULL /* we do not know sb */,
177 NULL /* no vol info */);
178 }
179 139
180 cFYI(1, ("reconnect tcon rc = %d", rc)); 140 /*
181 /* Removed call to reopen open files here. 141 * Give demultiplex thread up to 10 seconds to reconnect, should be
182 It is safer (and faster) to reopen files 142 * greater than cifs socket timeout which is 7 seconds
183 one at a time as needed in read and write */ 143 */
184 144 while (server->tcpStatus == CifsNeedReconnect) {
185 /* Check if handle based operation so we 145 wait_event_interruptible_timeout(server->response_q,
186 know whether we can continue or not without 146 (server->tcpStatus == CifsGood), 10 * HZ);
187 returning to caller to reset file handle */
188 switch (smb_command) {
189 case SMB_COM_READ_ANDX:
190 case SMB_COM_WRITE_ANDX:
191 case SMB_COM_CLOSE:
192 case SMB_COM_FIND_CLOSE2:
193 case SMB_COM_LOCKING_ANDX: {
194 unload_nls(nls_codepage);
195 return -EAGAIN;
196 }
197 }
198 } else {
199 up(&tcon->ses->sesSem);
200 }
201 unload_nls(nls_codepage);
202 147
 203 } else { 148 /* is the TCP session reestablished now? */
204 return -EIO; 149 if (server->tcpStatus != CifsNeedReconnect)
150 break;
151
152 /*
153 * on "soft" mounts we wait once. Hard mounts keep
154 * retrying until process is killed or server comes
155 * back on-line
156 */
157 if (!tcon->retry || ses->status == CifsExiting) {
158 cFYI(1, ("gave up waiting on reconnect in smb_init"));
159 return -EHOSTDOWN;
205 } 160 }
206 } 161 }
162
163 if (!ses->need_reconnect && !tcon->need_reconnect)
164 return 0;
165
166 nls_codepage = load_nls_default();
167
168 /*
169 * need to prevent multiple threads trying to simultaneously
170 * reconnect the same SMB session
171 */
172 down(&ses->sesSem);
173 if (ses->need_reconnect)
174 rc = cifs_setup_session(0, ses, nls_codepage);
175
176 /* do we need to reconnect tcon? */
177 if (rc || !tcon->need_reconnect) {
178 up(&ses->sesSem);
179 goto out;
180 }
181
182 mark_open_files_invalid(tcon);
183 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
184 up(&ses->sesSem);
185 cFYI(1, ("reconnect tcon rc = %d", rc));
186
187 if (rc)
188 goto out;
189
190 /*
191 * FIXME: check if wsize needs updated due to negotiated smb buffer
192 * size shrinking
193 */
194 atomic_inc(&tconInfoReconnectCount);
195
196 /* tell server Unix caps we support */
197 if (ses->capabilities & CAP_UNIX)
198 reset_cifs_unix_caps(0, tcon, NULL, NULL);
199
200 /*
201 * Removed call to reopen open files here. It is safer (and faster) to
202 * reopen files one at a time as needed in read and write.
203 *
204 * FIXME: what about file locks? don't we need to reclaim them ASAP?
205 */
206
207out:
208 /*
209 * Check if handle based operation so we know whether we can continue
210 * or not without returning to caller to reset file handle
211 */
212 switch (smb_command) {
213 case SMB_COM_READ_ANDX:
214 case SMB_COM_WRITE_ANDX:
215 case SMB_COM_CLOSE:
216 case SMB_COM_FIND_CLOSE2:
217 case SMB_COM_LOCKING_ANDX:
218 rc = -EAGAIN;
219 }
220
221 unload_nls(nls_codepage);
222 return rc;
223}
224
225/* Allocate and return pointer to an SMB request buffer, and set basic
226 SMB information in the SMB header. If the return code is zero, this
227 function must have filled in request_buf pointer */
228static int
229small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
230 void **request_buf)
231{
232 int rc = 0;
233
234 rc = cifs_reconnect_tcon(tcon, smb_command);
207 if (rc) 235 if (rc)
208 return rc; 236 return rc;
209 237
@@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
256{ 284{
257 int rc = 0; 285 int rc = 0;
258 286
259 /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so 287 rc = cifs_reconnect_tcon(tcon, smb_command);
260 check for tcp and smb session status done differently
261 for those three - in the calling routine */
262 if (tcon) {
263 if (tcon->tidStatus == CifsExiting) {
264 /* only tree disconnect, open, and write,
265 (and ulogoff which does not have tcon)
266 are allowed as we start force umount */
267 if ((smb_command != SMB_COM_WRITE_ANDX) &&
268 (smb_command != SMB_COM_OPEN_ANDX) &&
269 (smb_command != SMB_COM_TREE_DISCONNECT)) {
270 cFYI(1, ("can not send cmd %d while umounting",
271 smb_command));
272 return -ENODEV;
273 }
274 }
275
276 if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
277 (tcon->ses->server)) {
278 struct nls_table *nls_codepage;
279 /* Give Demultiplex thread up to 10 seconds to
280 reconnect, should be greater than cifs socket
281 timeout which is 7 seconds */
282 while (tcon->ses->server->tcpStatus ==
283 CifsNeedReconnect) {
284 wait_event_interruptible_timeout(tcon->ses->server->response_q,
285 (tcon->ses->server->tcpStatus ==
286 CifsGood), 10 * HZ);
287 if (tcon->ses->server->tcpStatus ==
288 CifsNeedReconnect) {
289 /* on "soft" mounts we wait once */
290 if (!tcon->retry ||
291 (tcon->ses->status == CifsExiting)) {
292 cFYI(1, ("gave up waiting on "
293 "reconnect in smb_init"));
294 return -EHOSTDOWN;
295 } /* else "hard" mount - keep retrying
296 until process is killed or server
297 comes on-line */
298 } else /* TCP session is reestablished now */
299 break;
300 }
301 nls_codepage = load_nls_default();
302 /* need to prevent multiple threads trying to
303 simultaneously reconnect the same SMB session */
304 down(&tcon->ses->sesSem);
305 if (tcon->ses->need_reconnect)
306 rc = cifs_setup_session(0, tcon->ses,
307 nls_codepage);
308 if (!rc && (tcon->need_reconnect)) {
309 mark_open_files_invalid(tcon);
310 rc = CIFSTCon(0, tcon->ses, tcon->treeName,
311 tcon, nls_codepage);
312 up(&tcon->ses->sesSem);
313 /* BB FIXME add code to check if wsize needs
314 update due to negotiated smb buffer size
315 shrinking */
316 if (rc == 0) {
317 atomic_inc(&tconInfoReconnectCount);
318 /* tell server Unix caps we support */
319 if (tcon->ses->capabilities & CAP_UNIX)
320 reset_cifs_unix_caps(
321 0 /* no xid */,
322 tcon,
323 NULL /* do not know sb */,
324 NULL /* no vol info */);
325 }
326
327 cFYI(1, ("reconnect tcon rc = %d", rc));
328 /* Removed call to reopen open files here.
329 It is safer (and faster) to reopen files
330 one at a time as needed in read and write */
331
332 /* Check if handle based operation so we
333 know whether we can continue or not without
334 returning to caller to reset file handle */
335 switch (smb_command) {
336 case SMB_COM_READ_ANDX:
337 case SMB_COM_WRITE_ANDX:
338 case SMB_COM_CLOSE:
339 case SMB_COM_FIND_CLOSE2:
340 case SMB_COM_LOCKING_ANDX: {
341 unload_nls(nls_codepage);
342 return -EAGAIN;
343 }
344 }
345 } else {
346 up(&tcon->ses->sesSem);
347 }
348 unload_nls(nls_codepage);
349
350 } else {
351 return -EIO;
352 }
353 }
354 if (rc) 288 if (rc)
355 return rc; 289 return rc;
356 290
@@ -3961,6 +3895,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3961 if (is_unicode) { 3895 if (is_unicode) {
3962 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, 3896 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
3963 GFP_KERNEL); 3897 GFP_KERNEL);
3898 if (tmp == NULL) {
3899 rc = -ENOMEM;
3900 goto parse_DFS_referrals_exit;
3901 }
3964 cifsConvertToUCS((__le16 *) tmp, searchName, 3902 cifsConvertToUCS((__le16 *) tmp, searchName,
3965 PATH_MAX, nls_codepage, remap); 3903 PATH_MAX, nls_codepage, remap);
3966 node->path_consumed = cifs_ucs2_bytes(tmp, 3904 node->path_consumed = cifs_ucs2_bytes(tmp,
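
smb_init() and small_smb_init() previously carried two nearly identical copies of this reconnect logic; both now delegate to cifs_reconnect_tcon(), so future fixes land in one place. The same file also gains a missing allocation check in parse_DFS_referrals(). A user-space sketch of that checked-allocation, single-exit style (names illustrative, not CIFS code):

    #include <stdlib.h>
    #include <string.h>

    static int build_unicode_path(const char *name, unsigned short **outp)
    {
            int rc = 0;
            unsigned short *tmp = calloc(strlen(name) + 1, sizeof(*tmp));

            if (tmp == NULL) {
                    rc = -1;            /* mirrors rc = -ENOMEM; goto ...exit */
                    goto done;
            }
            for (size_t i = 0; name[i] != '\0'; i++)
                    tmp[i] = (unsigned char)name[i];  /* stand-in conversion */
            *outp = tmp;
    done:
            return rc;
    }
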
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9bb5c8750736..d49682433c20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -803,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname,
803 char *data; 803 char *data;
804 unsigned int temp_len, i, j; 804 unsigned int temp_len, i, j;
805 char separator[2]; 805 char separator[2];
806 short int override_uid = -1;
807 short int override_gid = -1;
808 bool uid_specified = false;
809 bool gid_specified = false;
806 810
807 separator[0] = ','; 811 separator[0] = ',';
808 separator[1] = 0; 812 separator[1] = 0;
@@ -1093,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname,
1093 "too long.\n"); 1097 "too long.\n");
1094 return 1; 1098 return 1;
1095 } 1099 }
1096 } else if (strnicmp(data, "uid", 3) == 0) { 1100 } else if (!strnicmp(data, "uid", 3) && value && *value) {
1097 if (value && *value) 1101 vol->linux_uid = simple_strtoul(value, &value, 0);
1098 vol->linux_uid = 1102 uid_specified = true;
1099 simple_strtoul(value, &value, 0); 1103 } else if (!strnicmp(data, "forceuid", 8)) {
1100 } else if (strnicmp(data, "forceuid", 8) == 0) { 1104 override_uid = 1;
1101 vol->override_uid = 1; 1105 } else if (!strnicmp(data, "noforceuid", 10)) {
1102 } else if (strnicmp(data, "gid", 3) == 0) { 1106 override_uid = 0;
1103 if (value && *value) 1107 } else if (!strnicmp(data, "gid", 3) && value && *value) {
1104 vol->linux_gid = 1108 vol->linux_gid = simple_strtoul(value, &value, 0);
1105 simple_strtoul(value, &value, 0); 1109 gid_specified = true;
1106 } else if (strnicmp(data, "forcegid", 8) == 0) { 1110 } else if (!strnicmp(data, "forcegid", 8)) {
1107 vol->override_gid = 1; 1111 override_gid = 1;
1112 } else if (!strnicmp(data, "noforcegid", 10)) {
1113 override_gid = 0;
1108 } else if (strnicmp(data, "file_mode", 4) == 0) { 1114 } else if (strnicmp(data, "file_mode", 4) == 0) {
1109 if (value && *value) { 1115 if (value && *value) {
1110 vol->file_mode = 1116 vol->file_mode =
@@ -1355,11 +1361,23 @@ cifs_parse_mount_options(char *options, const char *devname,
1355 if (vol->UNCip == NULL) 1361 if (vol->UNCip == NULL)
1356 vol->UNCip = &vol->UNC[2]; 1362 vol->UNCip = &vol->UNC[2];
1357 1363
1364 if (uid_specified)
1365 vol->override_uid = override_uid;
1366 else if (override_uid == 1)
1367 printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
1368 "specified with no uid= option.\n");
1369
1370 if (gid_specified)
1371 vol->override_gid = override_gid;
1372 else if (override_gid == 1)
1373 printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
1374 "specified with no gid= option.\n");
1375
1358 return 0; 1376 return 0;
1359} 1377}
1360 1378
1361static struct TCP_Server_Info * 1379static struct TCP_Server_Info *
1362cifs_find_tcp_session(struct sockaddr_storage *addr) 1380cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
1363{ 1381{
1364 struct list_head *tmp; 1382 struct list_head *tmp;
1365 struct TCP_Server_Info *server; 1383 struct TCP_Server_Info *server;
@@ -1379,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
1379 if (server->tcpStatus == CifsNew) 1397 if (server->tcpStatus == CifsNew)
1380 continue; 1398 continue;
1381 1399
1382 if (addr->ss_family == AF_INET && 1400 switch (addr->ss_family) {
1383 (addr4->sin_addr.s_addr != 1401 case AF_INET:
1384 server->addr.sockAddr.sin_addr.s_addr)) 1402 if (addr4->sin_addr.s_addr ==
1385 continue; 1403 server->addr.sockAddr.sin_addr.s_addr) {
1386 else if (addr->ss_family == AF_INET6 && 1404 addr4->sin_port = htons(port);
1387 (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, 1405 /* user overrode default port? */
1388 &addr6->sin6_addr) || 1406 if (addr4->sin_port) {
1389 server->addr.sockAddr6.sin6_scope_id != 1407 if (addr4->sin_port !=
1390 addr6->sin6_scope_id)) 1408 server->addr.sockAddr.sin_port)
1391 continue; 1409 continue;
1410 }
1411 break;
1412 } else
1413 continue;
1414
1415 case AF_INET6:
1416 if (ipv6_addr_equal(&addr6->sin6_addr,
1417 &server->addr.sockAddr6.sin6_addr) &&
1418 (addr6->sin6_scope_id ==
1419 server->addr.sockAddr6.sin6_scope_id)) {
1420 addr6->sin6_port = htons(port);
1421 /* user overrode default port? */
1422 if (addr6->sin6_port) {
1423 if (addr6->sin6_port !=
1424 server->addr.sockAddr6.sin6_port)
1425 continue;
1426 }
1427 break;
1428 } else
1429 continue;
1430 }
1392 1431
1393 ++server->srv_count; 1432 ++server->srv_count;
1394 write_unlock(&cifs_tcp_ses_lock); 1433 write_unlock(&cifs_tcp_ses_lock);
@@ -1457,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1457 } 1496 }
1458 1497
1459 /* see if we already have a matching tcp_ses */ 1498 /* see if we already have a matching tcp_ses */
1460 tcp_ses = cifs_find_tcp_session(&addr); 1499 tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
1461 if (tcp_ses) 1500 if (tcp_ses)
1462 return tcp_ses; 1501 return tcp_ses;
1463 1502
@@ -2452,10 +2491,10 @@ try_mount_again:
2452 tcon->local_lease = volume_info->local_lease; 2491 tcon->local_lease = volume_info->local_lease;
2453 } 2492 }
2454 if (pSesInfo) { 2493 if (pSesInfo) {
2455 if (pSesInfo->capabilities & CAP_LARGE_FILES) { 2494 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2456 sb->s_maxbytes = (u64) 1 << 63; 2495 sb->s_maxbytes = MAX_LFS_FILESIZE;
2457 } else 2496 else
2458 sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ 2497 sb->s_maxbytes = MAX_NON_LFS;
2459 } 2498 }
2460 2499
2461 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2500 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
@@ -2544,11 +2583,20 @@ remote_path_check:
2544 2583
2545 if (mount_data != mount_data_global) 2584 if (mount_data != mount_data_global)
2546 kfree(mount_data); 2585 kfree(mount_data);
2586
2547 mount_data = cifs_compose_mount_options( 2587 mount_data = cifs_compose_mount_options(
2548 cifs_sb->mountdata, full_path + 1, 2588 cifs_sb->mountdata, full_path + 1,
2549 referrals, &fake_devname); 2589 referrals, &fake_devname);
2550 kfree(fake_devname); 2590
2551 free_dfs_info_array(referrals, num_referrals); 2591 free_dfs_info_array(referrals, num_referrals);
2592 kfree(fake_devname);
2593 kfree(full_path);
2594
2595 if (IS_ERR(mount_data)) {
2596 rc = PTR_ERR(mount_data);
2597 mount_data = NULL;
2598 goto mount_fail_check;
2599 }
2552 2600
2553 if (tcon) 2601 if (tcon)
2554 cifs_put_tcon(tcon); 2602 cifs_put_tcon(tcon);
@@ -2556,8 +2604,6 @@ remote_path_check:
2556 cifs_put_smb_ses(pSesInfo); 2604 cifs_put_smb_ses(pSesInfo);
2557 2605
2558 cleanup_volume_info(&volume_info); 2606 cleanup_volume_info(&volume_info);
2559 FreeXid(xid);
2560 kfree(full_path);
2561 referral_walks_count++; 2607 referral_walks_count++;
2562 goto try_mount_again; 2608 goto try_mount_again;
2563 } 2609 }
@@ -2611,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2611 return -EIO; 2657 return -EIO;
2612 2658
2613 smb_buffer = cifs_buf_get(); 2659 smb_buffer = cifs_buf_get();
2614 if (smb_buffer == NULL) { 2660 if (smb_buffer == NULL)
2615 return -ENOMEM; 2661 return -ENOMEM;
2616 } 2662
2617 smb_buffer_response = smb_buffer; 2663 smb_buffer_response = smb_buffer;
2618 2664
2619 header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, 2665 header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
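
Two behavioral notes on this file: forceuid/forcegid are now honored only when an explicit uid=/gid= was also given (e.g. -o uid=1000,forceuid), with noforceuid/noforcegid available to negate them, and cifs_find_tcp_session() now declines to share an existing TCP session when the caller asked for a different port. A small sketch of the two-phase option handling, parse first and validate once the whole string has been seen (illustrative names):

    #include <stdbool.h>
    #include <stdio.h>

    struct mount_opts {
            bool uid_specified;
            int  override_uid;          /* -1 unset, 0 noforceuid, 1 forceuid */
            bool apply_override_uid;
    };

    static void finalize_opts(struct mount_opts *o)
    {
            if (o->uid_specified)
                    o->apply_override_uid = (o->override_uid == 1);
            else if (o->override_uid == 1)
                    fprintf(stderr,
                            "ignoring forceuid specified with no uid= option\n");
    }
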
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4326ffd90fa9..a6424cfc0121 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
153 mutex_init(&pCifsFile->fh_mutex); 153 mutex_init(&pCifsFile->fh_mutex);
154 mutex_init(&pCifsFile->lock_mutex); 154 mutex_init(&pCifsFile->lock_mutex);
155 INIT_LIST_HEAD(&pCifsFile->llist); 155 INIT_LIST_HEAD(&pCifsFile->llist);
156 atomic_set(&pCifsFile->wrtPending, 0); 156 atomic_set(&pCifsFile->count, 1);
157 157
158 /* set the following in open now 158 /* set the following in open now
159 pCifsFile->pfile = file; */ 159 pCifsFile->pfile = file; */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c34b7f8a217b..fa7beac8b80e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private(
53 private_data->pInode = inode; 53 private_data->pInode = inode;
54 private_data->invalidHandle = false; 54 private_data->invalidHandle = false;
55 private_data->closePend = false; 55 private_data->closePend = false;
56 /* we have to track num writers to the inode, since writepages 56 /* Initialize reference count to one. The private data is
57 does not tell us which handle the write is for so there can 57 freed on the release of the last reference */
58 be a close (overlapping with write) of the filehandle that 58 atomic_set(&private_data->count, 1);
59 cifs_writepages chose to use */
60 atomic_set(&private_data->wrtPending, 0);
61 59
62 return private_data; 60 return private_data;
63} 61}
@@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file)
643 if (!pTcon->need_reconnect) { 641 if (!pTcon->need_reconnect) {
644 write_unlock(&GlobalSMBSeslock); 642 write_unlock(&GlobalSMBSeslock);
645 timeout = 2; 643 timeout = 2;
646 while ((atomic_read(&pSMBFile->wrtPending) != 0) 644 while ((atomic_read(&pSMBFile->count) != 1)
647 && (timeout <= 2048)) { 645 && (timeout <= 2048)) {
648 /* Give write a better chance to get to 646 /* Give write a better chance to get to
649 server ahead of the close. We do not 647 server ahead of the close. We do not
@@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file)
657 msleep(timeout); 655 msleep(timeout);
658 timeout *= 4; 656 timeout *= 4;
659 } 657 }
660 if (atomic_read(&pSMBFile->wrtPending))
661 cERROR(1, ("close with pending write"));
662 if (!pTcon->need_reconnect && 658 if (!pTcon->need_reconnect &&
663 !pSMBFile->invalidHandle) 659 !pSMBFile->invalidHandle)
664 rc = CIFSSMBClose(xid, pTcon, 660 rc = CIFSSMBClose(xid, pTcon,
@@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file)
681 list_del(&pSMBFile->flist); 677 list_del(&pSMBFile->flist);
682 list_del(&pSMBFile->tlist); 678 list_del(&pSMBFile->tlist);
683 write_unlock(&GlobalSMBSeslock); 679 write_unlock(&GlobalSMBSeslock);
684 timeout = 10; 680 cifsFileInfo_put(file->private_data);
685 /* We waited above to give the SMBWrite a chance to issue
686 on the wire (so we do not get SMBWrite returning EBADF
687 if writepages is racing with close. Note that writepages
688 does not specify a file handle, so it is possible for a file
689 to be opened twice, and the application close the "wrong"
690 file handle - in these cases we delay long enough to allow
691 the SMBWrite to get on the wire before the SMB Close.
692 We allow total wait here over 45 seconds, more than
693 oplock break time, and more than enough to allow any write
694 to complete on the server, or to time out on the client */
695 while ((atomic_read(&pSMBFile->wrtPending) != 0)
696 && (timeout <= 50000)) {
697 cERROR(1, ("writes pending, delay free of handle"));
698 msleep(timeout);
699 timeout *= 8;
700 }
701 kfree(file->private_data);
702 file->private_data = NULL; 681 file->private_data = NULL;
703 } else 682 } else
704 rc = -EBADF; 683 rc = -EBADF;
@@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1236 if (!open_file->invalidHandle) { 1215 if (!open_file->invalidHandle) {
1237 /* found a good file */ 1216 /* found a good file */
1238 /* lock it so it will not be closed on us */ 1217 /* lock it so it will not be closed on us */
1239 atomic_inc(&open_file->wrtPending); 1218 cifsFileInfo_get(open_file);
1240 read_unlock(&GlobalSMBSeslock); 1219 read_unlock(&GlobalSMBSeslock);
1241 return open_file; 1220 return open_file;
1242 } /* else might as well continue, and look for 1221 } /* else might as well continue, and look for
@@ -1276,7 +1255,7 @@ refind_writable:
1276 if (open_file->pfile && 1255 if (open_file->pfile &&
1277 ((open_file->pfile->f_flags & O_RDWR) || 1256 ((open_file->pfile->f_flags & O_RDWR) ||
1278 (open_file->pfile->f_flags & O_WRONLY))) { 1257 (open_file->pfile->f_flags & O_WRONLY))) {
1279 atomic_inc(&open_file->wrtPending); 1258 cifsFileInfo_get(open_file);
1280 1259
1281 if (!open_file->invalidHandle) { 1260 if (!open_file->invalidHandle) {
1282 /* found a good writable file */ 1261 /* found a good writable file */
@@ -1293,7 +1272,7 @@ refind_writable:
1293 else { /* start over in case this was deleted */ 1272 else { /* start over in case this was deleted */
1294 /* since the list could be modified */ 1273 /* since the list could be modified */
1295 read_lock(&GlobalSMBSeslock); 1274 read_lock(&GlobalSMBSeslock);
1296 atomic_dec(&open_file->wrtPending); 1275 cifsFileInfo_put(open_file);
1297 goto refind_writable; 1276 goto refind_writable;
1298 } 1277 }
1299 } 1278 }
@@ -1309,7 +1288,7 @@ refind_writable:
1309 read_lock(&GlobalSMBSeslock); 1288 read_lock(&GlobalSMBSeslock);
1310 /* can not use this handle, no write 1289 /* can not use this handle, no write
1311 pending on this one after all */ 1290 pending on this one after all */
1312 atomic_dec(&open_file->wrtPending); 1291 cifsFileInfo_put(open_file);
1313 1292
1314 if (open_file->closePend) /* list could have changed */ 1293 if (open_file->closePend) /* list could have changed */
1315 goto refind_writable; 1294 goto refind_writable;
@@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1373 if (open_file) { 1352 if (open_file) {
1374 bytes_written = cifs_write(open_file->pfile, write_data, 1353 bytes_written = cifs_write(open_file->pfile, write_data,
1375 to-from, &offset); 1354 to-from, &offset);
1376 atomic_dec(&open_file->wrtPending); 1355 cifsFileInfo_put(open_file);
1377 /* Does mm or vfs already set times? */ 1356 /* Does mm or vfs already set times? */
1378 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); 1357 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
1379 if ((bytes_written > 0) && (offset)) 1358 if ((bytes_written > 0) && (offset))
@@ -1562,7 +1541,7 @@ retry:
1562 bytes_to_write, offset, 1541 bytes_to_write, offset,
1563 &bytes_written, iov, n_iov, 1542 &bytes_written, iov, n_iov,
1564 long_op); 1543 long_op);
1565 atomic_dec(&open_file->wrtPending); 1544 cifsFileInfo_put(open_file);
1566 cifs_update_eof(cifsi, offset, bytes_written); 1545 cifs_update_eof(cifsi, offset, bytes_written);
1567 1546
1568 if (rc || bytes_written < bytes_to_write) { 1547 if (rc || bytes_written < bytes_to_write) {
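
With the reference count in place, cifs_close() no longer busy-waits (formerly up to ~45 seconds) for writers to drain; it drops its reference and the final cifsFileInfo_put() frees the structure. The lookup paths pin the handle under GlobalSMBSeslock before using it. A compilable user-space sketch of that lookup-then-pin pattern, same get/put shape as the earlier sketch (pthread plus C11 atomics; illustrative):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct file_info { atomic_int count; int fid; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct file_info *open_file;  /* stands in for the inode's open list */

    static void file_info_put(struct file_info *f)
    {
            if (atomic_fetch_sub(&f->count, 1) == 1)
                    free(f);
    }

    static struct file_info *find_writable(void)
    {
            struct file_info *f;

            pthread_mutex_lock(&list_lock);
            f = open_file;
            if (f)
                    atomic_fetch_add(&f->count, 1); /* pin before dropping lock */
            pthread_mutex_unlock(&list_lock);
            return f;
    }

    int main(void)
    {
            open_file = calloc(1, sizeof(*open_file));
            if (open_file == NULL)
                    return 1;
            atomic_init(&open_file->count, 1);      /* the opener's reference */

            struct file_info *f = find_writable();
            if (f) {
                    printf("writing through fid %d\n", f->fid);
                    file_info_put(f);               /* drop the writer's pin */
            }
            file_info_put(open_file);               /* close: last put frees */
            return 0;
    }
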
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 18afe57b2461..1f09c7619319 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
212 * junction to the new submount (ie to setup the fake directory 212 * junction to the new submount (ie to setup the fake directory
213 * which represents a DFS referral). 213 * which represents a DFS referral).
214 */ 214 */
215void 215static void
216cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) 216cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
217{ 217{
218 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 218 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
388} 388}
389 389
390/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ 390/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
391void 391static void
392cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, 392cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
393 struct cifs_sb_info *cifs_sb, bool adjust_tz) 393 struct cifs_sb_info *cifs_sb, bool adjust_tz)
394{ 394{
@@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode,
513 cifs_sb->mnt_cifs_flags & 513 cifs_sb->mnt_cifs_flags &
514 CIFS_MOUNT_MAP_SPECIAL_CHR); 514 CIFS_MOUNT_MAP_SPECIAL_CHR);
515 if (rc1) { 515 if (rc1) {
516 /* BB EOPNOSUPP disable SERVER_INUM? */
517 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 516 cFYI(1, ("GetSrvInodeNum rc %d", rc1));
518 fattr.cf_uniqueid = iunique(sb, ROOT_I); 517 fattr.cf_uniqueid = iunique(sb, ROOT_I);
518 /* disable serverino if call not supported */
519 if (rc1 == -EINVAL)
520 cifs_sb->mnt_cifs_flags &=
521 ~CIFS_MOUNT_SERVER_INUM;
519 } 522 }
520 } else { 523 } else {
521 fattr.cf_uniqueid = iunique(sb, ROOT_I); 524 fattr.cf_uniqueid = iunique(sb, ROOT_I);
@@ -797,7 +800,7 @@ set_via_filehandle:
797 if (open_file == NULL) 800 if (open_file == NULL)
798 CIFSSMBClose(xid, pTcon, netfid); 801 CIFSSMBClose(xid, pTcon, netfid);
799 else 802 else
800 atomic_dec(&open_file->wrtPending); 803 cifsFileInfo_put(open_file);
801out: 804out:
802 return rc; 805 return rc;
803} 806}
@@ -1632,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1632 __u32 npid = open_file->pid; 1635 __u32 npid = open_file->pid;
1633 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1636 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1634 npid, false); 1637 npid, false);
1635 atomic_dec(&open_file->wrtPending); 1638 cifsFileInfo_put(open_file);
1636 cFYI(1, ("SetFSize for attrs rc = %d", rc)); 1639 cFYI(1, ("SetFSize for attrs rc = %d", rc));
1637 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1640 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1638 unsigned int bytes_written; 1641 unsigned int bytes_written;
@@ -1787,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1787 u16 nfid = open_file->netfid; 1790 u16 nfid = open_file->netfid;
1788 u32 npid = open_file->pid; 1791 u32 npid = open_file->pid;
1789 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); 1792 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1790 atomic_dec(&open_file->wrtPending); 1793 cifsFileInfo_put(open_file);
1791 } else { 1794 } else {
1792 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, 1795 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1793 cifs_sb->local_nls, 1796 cifs_sb->local_nls,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ad3e2d116a6..1da4ab250eae 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
119 temp->pinode = pinode; 119 temp->pinode = pinode;
120 temp->tcon = tcon; 120 temp->tcon = tcon;
121 temp->netfid = fid; 121 temp->netfid = fid;
122 spin_lock(&GlobalMid_Lock); 122 spin_lock(&cifs_oplock_lock);
123 list_add_tail(&temp->qhead, &GlobalOplock_Q); 123 list_add_tail(&temp->qhead, &cifs_oplock_list);
124 spin_unlock(&GlobalMid_Lock); 124 spin_unlock(&cifs_oplock_lock);
125 } 125 }
126 return temp; 126 return temp;
127
128} 127}
129 128
130void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry) 129void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
131{ 130{
132 spin_lock(&GlobalMid_Lock); 131 spin_lock(&cifs_oplock_lock);
133 /* should we check if list empty first? */ 132 /* should we check if list empty first? */
134 list_del(&oplockEntry->qhead); 133 list_del(&oplockEntry->qhead);
135 spin_unlock(&GlobalMid_Lock); 134 spin_unlock(&cifs_oplock_lock);
136 kmem_cache_free(cifs_oplock_cachep, oplockEntry); 135 kmem_cache_free(cifs_oplock_cachep, oplockEntry);
137} 136}
138 137
@@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
144 if (tcon == NULL) 143 if (tcon == NULL)
145 return; 144 return;
146 145
147 spin_lock(&GlobalMid_Lock); 146 spin_lock(&cifs_oplock_lock);
148 list_for_each_entry(temp, &GlobalOplock_Q, qhead) { 147 list_for_each_entry(temp, &cifs_oplock_list, qhead) {
149 if ((temp->tcon) && (temp->tcon == tcon)) { 148 if ((temp->tcon) && (temp->tcon == tcon)) {
150 list_del(&temp->qhead); 149 list_del(&temp->qhead);
151 kmem_cache_free(cifs_oplock_cachep, temp); 150 kmem_cache_free(cifs_oplock_cachep, temp);
152 } 151 }
153 } 152 }
154 spin_unlock(&GlobalMid_Lock); 153 spin_unlock(&cifs_oplock_lock);
155} 154}
156 155
157static int 156static int
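
The oplock queue moves off the heavily shared GlobalMid_Lock onto its own cifs_oplock_lock, so oplock processing no longer contends with mid-queue users, and the lock's scope is documented at its declaration in cifsglob.h. A sketch of the lock-per-structure idea (user-space, illustrative):

    #include <pthread.h>

    struct node { struct node *next; };

    static struct node *oplock_list;
    static pthread_mutex_t oplock_lock = PTHREAD_MUTEX_INITIALIZER;

    static void oplock_add(struct node *n)
    {
            pthread_mutex_lock(&oplock_lock); /* contends only with oplock paths */
            n->next = oplock_list;
            oplock_list = n;
            pthread_mutex_unlock(&oplock_lock);
    }
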
diff --git a/fs/compat.c b/fs/compat.c
index 94502dab972a..6d6f98fe64a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename,
1485 if (!bprm) 1485 if (!bprm)
1486 goto out_files; 1486 goto out_files;
1487 1487
1488 retval = -ERESTARTNOINTR; 1488 retval = prepare_bprm_creds(bprm);
1489 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1489 if (retval)
1490 goto out_free; 1490 goto out_free;
1491 current->in_execve = 1;
1492
1493 retval = -ENOMEM;
1494 bprm->cred = prepare_exec_creds();
1495 if (!bprm->cred)
1496 goto out_unlock;
1497 1491
1498 retval = check_unsafe_exec(bprm); 1492 retval = check_unsafe_exec(bprm);
1499 if (retval < 0) 1493 if (retval < 0)
1500 goto out_unlock; 1494 goto out_free;
1501 clear_in_exec = retval; 1495 clear_in_exec = retval;
1496 current->in_execve = 1;
1502 1497
1503 file = open_exec(filename); 1498 file = open_exec(filename);
1504 retval = PTR_ERR(file); 1499 retval = PTR_ERR(file);
@@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename,
1547 /* execve succeeded */ 1542 /* execve succeeded */
1548 current->fs->in_exec = 0; 1543 current->fs->in_exec = 0;
1549 current->in_execve = 0; 1544 current->in_execve = 0;
1550 mutex_unlock(&current->cred_guard_mutex);
1551 acct_update_integrals(current); 1545 acct_update_integrals(current);
1552 free_bprm(bprm); 1546 free_bprm(bprm);
1553 if (displaced) 1547 if (displaced)
@@ -1567,10 +1561,7 @@ out_file:
1567out_unmark: 1561out_unmark:
1568 if (clear_in_exec) 1562 if (clear_in_exec)
1569 current->fs->in_exec = 0; 1563 current->fs->in_exec = 0;
1570
1571out_unlock:
1572 current->in_execve = 0; 1564 current->in_execve = 0;
1573 mutex_unlock(&current->cred_guard_mutex);
1574 1565
1575out_free: 1566out_free:
1576 free_bprm(bprm); 1567 free_bprm(bprm);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f28f070a60fc..f91fd51b32e3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1905,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX)
1905COMPATIBLE_IOCTL(FIOASYNC) 1905COMPATIBLE_IOCTL(FIOASYNC)
1906COMPATIBLE_IOCTL(FIONBIO) 1906COMPATIBLE_IOCTL(FIONBIO)
1907COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ 1907COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
1908COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
1908/* 0x00 */ 1909/* 0x00 */
1909COMPATIBLE_IOCTL(FIBMAP) 1910COMPATIBLE_IOCTL(FIBMAP)
1910COMPATIBLE_IOCTL(FIGETBSZ) 1911COMPATIBLE_IOCTL(FIGETBSZ)
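
FS_IOC_FIEMAP can be declared COMPATIBLE_IOCTL because struct fiemap contains only fixed-width fields, so its layout is identical for 32-bit and 64-bit callers and needs no translation. A small user-space caller, sketched assuming a Linux system with the fiemap UAPI headers:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fiemap.h>
    #include <linux/fs.h>

    /* Fetch the first extent mapping of a file (error handling abbreviated). */
    int main(int argc, char **argv)
    {
            if (argc != 2)
                    return 1;
            int fd = open(argv[1], O_RDONLY);
            if (fd < 0)
                    return 1;

            struct fiemap *fm = calloc(1, sizeof(*fm) +
                                          sizeof(struct fiemap_extent));
            if (fm == NULL)
                    return 1;
            fm->fm_length = ~0ULL;      /* map the whole file */
            fm->fm_extent_count = 1;    /* room for one extent record */

            if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0 && fm->fm_mapped_extents > 0)
                    printf("extent 0: logical %llu physical %llu length %llu\n",
                           (unsigned long long)fm->fm_extents[0].fe_logical,
                           (unsigned long long)fm->fm_extents[0].fe_physical,
                           (unsigned long long)fm->fm_extents[0].fe_length);
            free(fm);
            close(fd);
            return 0;
    }
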
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
51}; 51};
52 52
53static struct backing_dev_info configfs_backing_dev_info = { 53static struct backing_dev_info configfs_backing_dev_info = {
54 .name = "configfs",
54 .ra_pages = 0, /* No readahead */ 55 .ra_pages = 0, /* No readahead */
55 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
56}; 57};
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
35#include <linux/hardirq.h>
35#include "internal.h" 36#include "internal.h"
36 37
37int sysctl_vfs_cache_pressure __read_mostly = 100; 38int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f03886..240cef14fe58 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
106#define CF_CONNECT_PENDING 3 106#define CF_CONNECT_PENDING 3
107#define CF_INIT_PENDING 4 107#define CF_INIT_PENDING 4
108#define CF_IS_OTHERCON 5 108#define CF_IS_OTHERCON 5
109#define CF_CLOSE 6
109 struct list_head writequeue; /* List of outgoing writequeue_entries */ 110 struct list_head writequeue; /* List of outgoing writequeue_entries */
110 spinlock_t writequeue_lock; 111 spinlock_t writequeue_lock;
111 int (*rx_action) (struct connection *); /* What to do when active */ 112 int (*rx_action) (struct connection *); /* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
299 300
300static inline void lowcomms_connect_sock(struct connection *con) 301static inline void lowcomms_connect_sock(struct connection *con)
301{ 302{
303 if (test_bit(CF_CLOSE, &con->flags))
304 return;
302 if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) 305 if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
303 queue_work(send_workqueue, &con->swork); 306 queue_work(send_workqueue, &con->swork);
304} 307}
@@ -926,10 +929,8 @@ static void tcp_connect_to_sock(struct connection *con)
926 goto out_err; 929 goto out_err;
927 930
928 memset(&saddr, 0, sizeof(saddr)); 931 memset(&saddr, 0, sizeof(saddr));
929 if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { 932 if (dlm_nodeid_to_addr(con->nodeid, &saddr))
930 sock_release(sock);
931 goto out_err; 933 goto out_err;
932 }
933 934
934 sock->sk->sk_user_data = con; 935 sock->sk->sk_user_data = con;
935 con->rx_action = receive_from_sock; 936 con->rx_action = receive_from_sock;
@@ -1284,7 +1285,6 @@ out:
1284static void send_to_sock(struct connection *con) 1285static void send_to_sock(struct connection *con)
1285{ 1286{
1286 int ret = 0; 1287 int ret = 0;
1287 ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
1288 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1288 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1289 struct writequeue_entry *e; 1289 struct writequeue_entry *e;
1290 int len, offset; 1290 int len, offset;
@@ -1293,8 +1293,6 @@ static void send_to_sock(struct connection *con)
1293 if (con->sock == NULL) 1293 if (con->sock == NULL)
1294 goto out_connect; 1294 goto out_connect;
1295 1295
1296 sendpage = con->sock->ops->sendpage;
1297
1298 spin_lock(&con->writequeue_lock); 1296 spin_lock(&con->writequeue_lock);
1299 for (;;) { 1297 for (;;) {
1300 e = list_entry(con->writequeue.next, struct writequeue_entry, 1298 e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1309,8 +1307,8 @@ static void send_to_sock(struct connection *con)
1309 1307
1310 ret = 0; 1308 ret = 0;
1311 if (len) { 1309 if (len) {
1312 ret = sendpage(con->sock, e->page, offset, len, 1310 ret = kernel_sendpage(con->sock, e->page, offset, len,
1313 msg_flags); 1311 msg_flags);
1314 if (ret == -EAGAIN || ret == 0) { 1312 if (ret == -EAGAIN || ret == 0) {
1315 cond_resched(); 1313 cond_resched();
1316 goto out; 1314 goto out;
@@ -1370,6 +1368,13 @@ int dlm_lowcomms_close(int nodeid)
1370 log_print("closing connection to node %d", nodeid); 1368 log_print("closing connection to node %d", nodeid);
1371 con = nodeid2con(nodeid, 0); 1369 con = nodeid2con(nodeid, 0);
1372 if (con) { 1370 if (con) {
1371 clear_bit(CF_CONNECT_PENDING, &con->flags);
1372 clear_bit(CF_WRITE_PENDING, &con->flags);
1373 set_bit(CF_CLOSE, &con->flags);
1374 if (cancel_work_sync(&con->swork))
1375 log_print("canceled swork for node %d", nodeid);
1376 if (cancel_work_sync(&con->rwork))
1377 log_print("canceled rwork for node %d", nodeid);
1373 clean_one_writequeue(con); 1378 clean_one_writequeue(con);
1374 close_connection(con, true); 1379 close_connection(con, true);
1375 } 1380 }
@@ -1395,9 +1400,10 @@ static void process_send_sockets(struct work_struct *work)
1395 1400
1396 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { 1401 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
1397 con->connect_action(con); 1402 con->connect_action(con);
1403 set_bit(CF_WRITE_PENDING, &con->flags);
1398 } 1404 }
1399 clear_bit(CF_WRITE_PENDING, &con->flags); 1405 if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
1400 send_to_sock(con); 1406 send_to_sock(con);
1401} 1407}
1402 1408
1403 1409
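
The new CF_CLOSE flag makes dlm_lowcomms_close() final: pending send/receive work is canceled synchronously and lowcomms_connect_sock() refuses to queue new connect work afterwards. send_to_sock() also switches from a cached ops->sendpage pointer to kernel_sendpage(), which falls back gracefully for sockets without their own sendpage. A sketch of the close-flag gating (user-space, illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    enum { CF_CLOSE = 1 << 0, CF_CONNECT_PENDING = 1 << 1 };

    static _Atomic unsigned int con_flags;

    static bool queue_connect_work(void)
    {
            if (atomic_load(&con_flags) & CF_CLOSE)
                    return false;       /* closing: refuse new work */
            if (atomic_fetch_or(&con_flags, CF_CONNECT_PENDING)
                & CF_CONNECT_PENDING)
                    return false;       /* a connect is already queued */
            /* ... schedule the connect worker here ... */
            return true;
    }
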
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ccc9d62c462d..55ea369f43a9 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb)
63 return rv; 63 return rv;
64 } 64 }
65 65
66 return genlmsg_unicast(skb, listener_nlpid); 66 return genlmsg_unicast(&init_net, skb, listener_nlpid);
67} 67}
68 68
69static int user_cmd(struct sk_buff *skb, struct genl_info *info) 69static int user_cmd(struct sk_buff *skb, struct genl_info *info)
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af737bb56cb7..259525c9abb8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
1303 } 1303 }
1304 (*new_auth_tok)->session_key.encrypted_key_size = 1304 (*new_auth_tok)->session_key.encrypted_key_size =
1305 (body_size - (ECRYPTFS_SALT_SIZE + 5)); 1305 (body_size - (ECRYPTFS_SALT_SIZE + 5));
1306 if ((*new_auth_tok)->session_key.encrypted_key_size
1307 > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
1308 printk(KERN_WARNING "Tag 3 packet contains key larger "
1309 "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
1310 rc = -EINVAL;
1311 goto out_free;
1312 }
1306 if (unlikely(data[(*packet_size)++] != 0x04)) { 1313 if (unlikely(data[(*packet_size)++] != 0x04)) {
1307 printk(KERN_WARNING "Unknown version number [%d]\n", 1314 printk(KERN_WARNING "Unknown version number [%d]\n",
1308 data[(*packet_size) - 1]); 1315 data[(*packet_size) - 1]);
@@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents,
1449 rc = -EINVAL; 1456 rc = -EINVAL;
1450 goto out; 1457 goto out;
1451 } 1458 }
1459 if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
1460 printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
1461 "expected size\n");
1462 rc = -EINVAL;
1463 goto out;
1464 }
1452 if (data[(*packet_size)++] != 0x62) { 1465 if (data[(*packet_size)++] != 0x62) {
1453 printk(KERN_WARNING "Unrecognizable packet\n"); 1466 printk(KERN_WARNING "Unrecognizable packet\n");
1454 rc = -EINVAL; 1467 rc = -EINVAL;
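
Both added checks validate lengths decoded from packet data before they are used to size copies, failing with -EINVAL rather than trusting the packet. The shape of the check, as a user-space sketch (the limit name here is a stand-in for the eCryptfs constant):

    #include <string.h>

    #define MAX_ENCRYPTED_KEY_BYTES 512 /* illustrative stand-in for the limit */

    static int copy_session_key(unsigned char dst[MAX_ENCRYPTED_KEY_BYTES],
                                const unsigned char *pkt, size_t len)
    {
            if (len > MAX_ENCRYPTED_KEY_BYTES)
                    return -1;          /* mirrors the new -EINVAL returns */
            memcpy(dst, pkt, len);
            return 0;
    }
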
diff --git a/fs/exec.c b/fs/exec.c
index 4a8849e45b21..172ceb6edde4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -678,8 +678,8 @@ exit:
678} 678}
679EXPORT_SYMBOL(open_exec); 679EXPORT_SYMBOL(open_exec);
680 680
681int kernel_read(struct file *file, unsigned long offset, 681int kernel_read(struct file *file, loff_t offset,
682 char *addr, unsigned long count) 682 char *addr, unsigned long count)
683{ 683{
684 mm_segment_t old_fs; 684 mm_segment_t old_fs;
685 loff_t pos = offset; 685 loff_t pos = offset;
@@ -1016,6 +1016,35 @@ out:
1016EXPORT_SYMBOL(flush_old_exec); 1016EXPORT_SYMBOL(flush_old_exec);
1017 1017
1018/* 1018/*
1019 * Prepare credentials and lock ->cred_guard_mutex.
1020 * install_exec_creds() commits the new creds and drops the lock.
1021 * Or, if exec fails before, free_bprm() should release ->cred and
1022 * and unlock.
1023 */
1024int prepare_bprm_creds(struct linux_binprm *bprm)
1025{
1026 if (mutex_lock_interruptible(&current->cred_guard_mutex))
1027 return -ERESTARTNOINTR;
1028
1029 bprm->cred = prepare_exec_creds();
1030 if (likely(bprm->cred))
1031 return 0;
1032
1033 mutex_unlock(&current->cred_guard_mutex);
1034 return -ENOMEM;
1035}
1036
1037void free_bprm(struct linux_binprm *bprm)
1038{
1039 free_arg_pages(bprm);
1040 if (bprm->cred) {
1041 mutex_unlock(&current->cred_guard_mutex);
1042 abort_creds(bprm->cred);
1043 }
1044 kfree(bprm);
1045}
1046
1047/*
1019 * install the new credentials for this executable 1048 * install the new credentials for this executable
1020 */ 1049 */
1021void install_exec_creds(struct linux_binprm *bprm) 1050void install_exec_creds(struct linux_binprm *bprm)
@@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm)
1024 1053
1025 commit_creds(bprm->cred); 1054 commit_creds(bprm->cred);
1026 bprm->cred = NULL; 1055 bprm->cred = NULL;
1027 1056 /*
1028 /* cred_guard_mutex must be held at least to this point to prevent 1057 * cred_guard_mutex must be held at least to this point to prevent
1029 * ptrace_attach() from altering our determination of the task's 1058 * ptrace_attach() from altering our determination of the task's
1030 * credentials; any time after this it may be unlocked */ 1059 * credentials; any time after this it may be unlocked.
1031 1060 */
1032 security_bprm_committed_creds(bprm); 1061 security_bprm_committed_creds(bprm);
1062 mutex_unlock(&current->cred_guard_mutex);
1033} 1063}
1034EXPORT_SYMBOL(install_exec_creds); 1064EXPORT_SYMBOL(install_exec_creds);
1035 1065
@@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1246 1276
1247EXPORT_SYMBOL(search_binary_handler); 1277EXPORT_SYMBOL(search_binary_handler);
1248 1278
1249void free_bprm(struct linux_binprm *bprm)
1250{
1251 free_arg_pages(bprm);
1252 if (bprm->cred)
1253 abort_creds(bprm->cred);
1254 kfree(bprm);
1255}
1256
1257/* 1279/*
1258 * sys_execve() executes a new program. 1280 * sys_execve() executes a new program.
1259 */ 1281 */
@@ -1277,20 +1299,15 @@ int do_execve(char * filename,
1277 if (!bprm) 1299 if (!bprm)
1278 goto out_files; 1300 goto out_files;
1279 1301
1280 retval = -ERESTARTNOINTR; 1302 retval = prepare_bprm_creds(bprm);
1281 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1303 if (retval)
1282 goto out_free; 1304 goto out_free;
1283 current->in_execve = 1;
1284
1285 retval = -ENOMEM;
1286 bprm->cred = prepare_exec_creds();
1287 if (!bprm->cred)
1288 goto out_unlock;
1289 1305
1290 retval = check_unsafe_exec(bprm); 1306 retval = check_unsafe_exec(bprm);
1291 if (retval < 0) 1307 if (retval < 0)
1292 goto out_unlock; 1308 goto out_free;
1293 clear_in_exec = retval; 1309 clear_in_exec = retval;
1310 current->in_execve = 1;
1294 1311
1295 file = open_exec(filename); 1312 file = open_exec(filename);
1296 retval = PTR_ERR(file); 1313 retval = PTR_ERR(file);
@@ -1340,7 +1357,6 @@ int do_execve(char * filename,
1340 /* execve succeeded */ 1357 /* execve succeeded */
1341 current->fs->in_exec = 0; 1358 current->fs->in_exec = 0;
1342 current->in_execve = 0; 1359 current->in_execve = 0;
1343 mutex_unlock(&current->cred_guard_mutex);
1344 acct_update_integrals(current); 1360 acct_update_integrals(current);
1345 free_bprm(bprm); 1361 free_bprm(bprm);
1346 if (displaced) 1362 if (displaced)
@@ -1360,10 +1376,7 @@ out_file:
1360out_unmark: 1376out_unmark:
1361 if (clear_in_exec) 1377 if (clear_in_exec)
1362 current->fs->in_exec = 0; 1378 current->fs->in_exec = 0;
1363
1364out_unlock:
1365 current->in_execve = 0; 1379 current->in_execve = 0;
1366 mutex_unlock(&current->cred_guard_mutex);
1367 1380
1368out_free: 1381out_free:
1369 free_bprm(bprm); 1382 free_bprm(bprm);
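
The exec path now has a single lock protocol: prepare_bprm_creds() takes cred_guard_mutex and allocates bprm->cred; on success install_exec_creds() commits the creds and unlocks, and on any failure free_bprm() unlocks while bprm->cred is still set. Every path unlocks exactly once, which is what lets the out_unlock labels disappear from do_execve() and compat_do_execve(). A user-space sketch of that ownership-transfer protocol (pthread; illustrative):

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;

    struct bprm { void *cred; };

    static int prepare(struct bprm *b)          /* prepare_bprm_creds() */
    {
            pthread_mutex_lock(&guard);
            b->cred = malloc(64);
            if (b->cred)
                    return 0;
            pthread_mutex_unlock(&guard);       /* failed: release here */
            return -1;
    }

    static void commit(struct bprm *b)          /* install_exec_creds() */
    {
            free(b->cred);                      /* creds consumed on success */
            b->cred = NULL;
            pthread_mutex_unlock(&guard);
    }

    static void destroy(struct bprm *b)         /* free_bprm() on failure */
    {
            if (b->cred) {
                    pthread_mutex_unlock(&guard);
                    free(b->cred);
            }
    }
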
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d636e1297cad..a63d44256a70 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
230 return error; 230 return error;
231} 231}
232 232
233static int 233int
234ext2_check_acl(struct inode *inode, int mask) 234ext2_check_acl(struct inode *inode, int mask)
235{ 235{
236 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); 236 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask)
246 return -EAGAIN; 246 return -EAGAIN;
247} 247}
248 248
249int
250ext2_permission(struct inode *inode, int mask)
251{
252 return generic_permission(inode, mask, ext2_check_acl);
253}
254
255/* 249/*
256 * Initialize the ACLs of a new inode. Called from ext2_new_inode. 250 * Initialize the ACLs of a new inode. Called from ext2_new_inode.
257 * 251 *
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index ecefe478898f..3ff6cbb9ac44 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size)
54#ifdef CONFIG_EXT2_FS_POSIX_ACL 54#ifdef CONFIG_EXT2_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext2_permission (struct inode *, int); 57extern int ext2_check_acl (struct inode *, int);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_acl_chmod (struct inode *);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
61#else 61#else
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext2_permission NULL 63#define ext2_check_acl NULL
64#define ext2_get_acl NULL 64#define ext2_get_acl NULL
65#define ext2_set_acl NULL 65#define ext2_set_acl NULL
66 66
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b9e47dc9222..a2f3afd1a1c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = {
85 .removexattr = generic_removexattr, 85 .removexattr = generic_removexattr,
86#endif 86#endif
87 .setattr = ext2_setattr, 87 .setattr = ext2_setattr,
88 .permission = ext2_permission, 88 .check_acl = ext2_check_acl,
89 .fiemap = ext2_fiemap, 89 .fiemap = ext2_fiemap,
90}; 90};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e27130341d4f..1c1638f873a4 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
482 unlock_buffer(bh); 482 unlock_buffer(bh);
483 mark_buffer_dirty_inode(bh, inode); 483 mark_buffer_dirty_inode(bh, inode);
484 /* We used to sync bh here if IS_SYNC(inode). 484 /* We used to sync bh here if IS_SYNC(inode).
485 * But we now rely upon generic_osync_inode() 485 * But we now rely upon generic_write_sync()
486 * and b_inode_buffers. But not for directories. 486 * and b_inode_buffers. But not for directories.
487 */ 487 */
488 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 488 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e1dedb0f7873..23701f289e98 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
362 if (dir_de) { 362 if (dir_de) {
363 if (old_dir != new_dir) 363 if (old_dir != new_dir)
364 ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); 364 ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
365 else {
366 kunmap(dir_page);
367 page_cache_release(dir_page);
368 }
365 inode_dec_link_count(old_dir); 369 inode_dec_link_count(old_dir);
366 } 370 }
367 return 0; 371 return 0;
@@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = {
396 .removexattr = generic_removexattr, 400 .removexattr = generic_removexattr,
397#endif 401#endif
398 .setattr = ext2_setattr, 402 .setattr = ext2_setattr,
399 .permission = ext2_permission, 403 .check_acl = ext2_check_acl,
400}; 404};
401 405
402const struct inode_operations ext2_special_inode_operations = { 406const struct inode_operations ext2_special_inode_operations = {
@@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = {
407 .removexattr = generic_removexattr, 411 .removexattr = generic_removexattr,
408#endif 412#endif
409 .setattr = ext2_setattr, 413 .setattr = ext2_setattr,
410 .permission = ext2_permission, 414 .check_acl = ext2_check_acl,
411}; 415};
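
ext2 (and ext3, below) stop wrapping generic_permission() in their own ->permission methods; they export the ACL check instead and publish it through the new ->check_acl inode operation. Roughly, the generic code consults the mode bits first and uses the callback to refine the decision, with -EAGAIN meaning "no ACL present, fall through". A simplified user-space sketch of that contract (not the actual VFS code):

    #include <errno.h>

    typedef int (*check_acl_t)(int mask);

    static int generic_perm(int mode_bits, int mask, check_acl_t check_acl)
    {
            if ((mode_bits & mask) == mask)
                    return 0;                   /* mode bits grant access */
            if (check_acl) {
                    int rc = check_acl(mask);   /* e.g. ext2_check_acl */
                    if (rc != -EAGAIN)
                            return rc;          /* ACL gave a definite answer */
            }
            return -EACCES;
    }
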
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index fb3c1a21b135..522b15498f45 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -29,23 +29,25 @@ config EXT3_FS
29 module will be called ext3. 29 module will be called ext3.
30 30
31config EXT3_DEFAULTS_TO_ORDERED 31config EXT3_DEFAULTS_TO_ORDERED
32 bool "Default to 'data=ordered' in ext3 (legacy option)" 32 bool "Default to 'data=ordered' in ext3"
33 depends on EXT3_FS 33 depends on EXT3_FS
34 help 34 help
35 If a filesystem does not explicitly specify a data ordering 35 The journal mode options for ext3 have different tradeoffs
36 mode, and the journal capability allowed it, ext3 used to 36 between when data is guaranteed to be on disk and
37 historically default to 'data=ordered'. 37 performance. The use of "data=writeback" can cause
38 38 unwritten data to appear in files after an system crash or
39 That was a rather unfortunate choice, because it leads to all 39 power failure, which can be a security issue. However,
40 kinds of latency problems, and the 'data=writeback' mode is more 40 "data=ordered" mode can also result in major performance
41 appropriate these days. 41 problems, including seconds-long delays before an fsync()
42 42 call returns. For details, see:
43 You should probably always answer 'n' here, and if you really 43
44 want to use 'data=ordered' mode, set it in the filesystem itself 44 http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
45 with 'tune2fs -o journal_data_ordered'. 45
46 46 If you have been historically happy with ext3's performance,
47 But if you really want to enable the legacy default, you can do 47 data=ordered mode will be a safe choice and you should
48 so by answering 'y' to this question. 48 answer 'y' here. If you understand the reliability and data
49 privacy issues of data=writeback and are willing to make
50 that trade off, answer 'n'.
49 51
50config EXT3_FS_XATTR 52config EXT3_FS_XATTR
51 bool "Ext3 extended attributes" 53 bool "Ext3 extended attributes"
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e167bae37ef0..c9b0df376b5f 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 	return error;
 }
 
-static int
+int
 ext3_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int
-ext3_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, ext3_check_acl);
-}
-
 /*
  * Initialize the ACLs of a new inode. Called from ext3_new_inode.
  *
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 07d15a3a5969..597334626de9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size)
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext3_permission (struct inode *, int);
+extern int ext3_check_acl (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT3_FS_POSIX_ACL */
 #include <linux/sched.h>
-#define ext3_permission NULL
+#define ext3_check_acl NULL
 
 static inline int
 ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a95882f..373fa90c796a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext3_get_blocks_handle(NULL, inode, blk, 1,
-					&map_bh, 0, 0);
+		err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 				(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704b231b..388bbdfa0b4e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
-
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext3_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush. If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too? If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext3_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
-}
-
 const struct file_operations ext3_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= ext3_file_write,
+	.aio_write	= generic_file_aio_write,
 	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
@@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = {
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext3_permission,
+	.check_acl	= ext3_check_acl,
 	.fiemap		= ext3_fiemap,
 };
 
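
The deleted ext3_file_write() existed only to force a journal commit for O_SYNC and IS_SYNC writes after generic_file_aio_write() returned. As the ext2 hunk earlier in this diff hints, that duty moved into the generic path via generic_write_sync(), so .aio_write can point straight at the generic helper. Roughly the same simplification from a userspace point of view, with write_and_sync() as a purely illustrative name:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Illustrative only: the kernel-side equivalent is that the generic
 * write path now calls generic_write_sync() when O_SYNC semantics
 * are required, so per-filesystem wrappers like ext3_file_write()
 * can be deleted.
 */
static ssize_t write_and_sync(int fd, const void *buf, size_t len, int o_sync)
{
	ssize_t ret = write(fd, buf, len);

	if (ret <= 0)
		return ret;             /* nothing written, nothing to flush */
	if (o_sync && fsync(fd) != 0)
		return -1;              /* surface the flush error */
	return ret;
}

int main(void)
{
	int fd = open("demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	printf("wrote %zd bytes\n", write_and_sync(fd, "hi\n", 3, 1));
	close(fd);
	return 0;
}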
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e17..451d166bbe93 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
  */
 
 #include <linux/time.h>
+#include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	}
 
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto out;
+		goto flush;
 
 	/*
 	 * The VFS has written the file data. If the inode is unaltered
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
 		ret = sync_inode(inode, &wbc);
+		goto out;
 	}
+flush:
+	/*
+	 * In case we didn't commit a transaction, we have to flush
+	 * disk caches manually so that data really is on persistent
+	 * storage
+	 */
+	if (test_opt(inode->i_sb, BARRIER))
+		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 out:
 	return ret;
 }
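
The new flush: label handles the case where fsync() finds no transaction to commit: the file's data may still sit in the drive's volatile write cache, so with barriers enabled the code flushes that cache explicitly. A compilable miniature of the control flow, where commit_transaction() and flush_disk_cache() are stand-ins for the journal commit and blkdev_issue_flush(), not kernel functions:

#include <stdbool.h>
#include <stdio.h>

/* A real journal commit ends with a barrier/flush of its own. */
static bool commit_transaction(bool have_dirty_metadata)
{
	return have_dirty_metadata;
}

/* Stands in for blkdev_issue_flush() on the underlying device. */
static void flush_disk_cache(void)
{
	puts("FLUSH disk write cache");
}

static void sync_file(bool have_dirty_metadata, bool barriers_enabled)
{
	if (commit_transaction(have_dirty_metadata))
		return; /* the commit already flushed the cache */
	/*
	 * No commit happened (e.g. datasync with clean metadata), so
	 * the data pages may still sit in the drive cache: flush it
	 * manually.
	 */
	if (barriers_enabled)
		flush_disk_cache();
}

int main(void)
{
	sync_file(false, true); /* datasync path: explicit flush */
	sync_file(true, true);  /* commit path: no extra flush */
	return 0;
}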
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5f51fed5c750..cd098a7b77fc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  * so before we call here everything must be consistently dirtied against
  * this transaction.
  */
-static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
 {
+	int ret;
+
 	jbd_debug(2, "restarting handle %p\n", handle);
-	return ext3_journal_restart(handle, blocks_for_truncate(inode));
+	/*
+	 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
+	 * At this moment, get_block can be called only for blocks inside
+	 * i_size since page cache has been already dropped and writes are
+	 * blocked by i_mutex. So we can safely drop the truncate_mutex.
+	 */
+	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+	ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+	mutex_lock(&EXT3_I(inode)->truncate_mutex);
+	return ret;
 }
 
 /*
@@ -788,7 +799,7 @@ err_out:
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		sector_t iblock, unsigned long maxblocks,
 		struct buffer_head *bh_result,
-		int create, int extend_disksize)
+		int create)
 {
 	int err = -EIO;
 	int offsets[4];
@@ -911,13 +922,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	if (!err)
 		err = ext3_splice_branch(handle, inode, iblock,
 					partial, indirect_blks, count);
-	/*
-	 * i_disksize growing is protected by truncate_mutex. Don't forget to
-	 * protect it if you're about to implement concurrent
-	 * ext3_get_block() -bzzz
-	 */
-	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
-		ei->i_disksize = inode->i_size;
 	mutex_unlock(&ei->truncate_mutex);
 	if (err)
 		goto cleanup;
@@ -972,7 +976,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
 	}
 
 	ret = ext3_get_blocks_handle(handle, inode, iblock,
-					max_blocks, bh_result, create, 0);
+					max_blocks, bh_result, create);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -1005,7 +1009,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
 	err = ext3_get_blocks_handle(handle, inode, block, 1,
-					&dummy, create, 1);
+					&dummy, create);
 	/*
 	 * ext3_get_blocks_handle() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
@@ -1193,15 +1197,16 @@ write_begin_failed:
 	 * i_size_read because we hold i_mutex.
 	 *
 	 * Add inode to orphan list in case we crash before truncate
-	 * finishes.
+	 * finishes. Do this only if ext3_can_truncate() agrees so
+	 * that orphan processing code is happy.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
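
The recurring ext3_orphan_add() calls in these write-path hunks are a form of intent logging: before the inode can be left holding blocks allocated beyond i_size (because the copy of user data may fail, or the machine may crash), it is linked onto the on-disk orphan list, and journal recovery truncates whatever is still on that list after a crash. A toy model of the idea, with all names illustrative:

#include <stdio.h>

/* Toy intent log: record what must be undone before doing the risky
 * part, clear it afterwards; recovery replays whatever is left over. */
static int pending_truncate = -1;       /* inode number, -1 = none */

static void record_intent(int ino) { pending_truncate = ino; }
static void clear_intent(void)     { pending_truncate = -1; }

static void recover_after_crash(void)
{
	if (pending_truncate != -1)
		printf("recovery: truncate orphan inode %d\n",
		       pending_truncate);
}

int main(void)
{
	record_intent(42);      /* ext3_orphan_add() analogue */
	/* ... copy user data; a crash here leaves the intent behind ... */
	clear_intent();         /* successful write: intent removed */
	recover_after_crash();  /* prints nothing on the success path */
	return 0;
}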
@@ -1287,7 +1292,7 @@ static int ext3_ordered_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
@@ -1296,7 +1301,7 @@ static int ext3_ordered_write_end(struct file *file,
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }
 
@@ -1315,14 +1320,14 @@ static int ext3_writeback_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }
 
@@ -1358,7 +1363,7 @@ static int ext3_journalled_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1375,7 +1380,7 @@ static int ext3_journalled_write_end(struct file *file,
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }
 
@@ -2078,7 +2083,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
 			ext3_journal_dirty_metadata(handle, bh);
 		}
 		ext3_mark_inode_dirty(handle, inode);
-		ext3_journal_test_restart(handle, inode);
+		truncate_restart_transaction(handle, inode);
 		if (bh) {
 			BUFFER_TRACE(bh, "retaking write access");
 			ext3_journal_get_write_access(handle, bh);
@@ -2288,7 +2293,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			return;
 		if (try_to_extend_transaction(handle, inode)) {
 			ext3_mark_inode_dirty(handle, inode);
-			ext3_journal_test_restart(handle, inode);
+			truncate_restart_transaction(handle, inode);
 		}
 
 		ext3_free_blocks(handle, inode, nr, 1);
@@ -2898,6 +2903,10 @@ static int ext3_do_update_inode(handle_t *handle,
 	struct buffer_head *bh = iloc->bh;
 	int err = 0, rc, block;
 
+again:
+	/* we can't allow multiple procs in here at once, its a bit racey */
+	lock_buffer(bh);
+
 	/* For fields not not tracking in the in-memory inode,
 	 * initialise them to zero for new inodes. */
 	if (ei->i_state & EXT3_STATE_NEW)
@@ -2957,16 +2966,20 @@ static int ext3_do_update_inode(handle_t *handle,
 			/* If this is the first large file
 			 * created, add a flag to the superblock.
 			 */
+			unlock_buffer(bh);
 			err = ext3_journal_get_write_access(handle,
 					EXT3_SB(sb)->s_sbh);
 			if (err)
 				goto out_brelse;
+
 			ext3_update_dynamic_rev(sb);
 			EXT3_SET_RO_COMPAT_FEATURE(sb,
 					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
 			handle->h_sync = 1;
 			err = ext3_journal_dirty_metadata(handle,
 					EXT3_SB(sb)->s_sbh);
+			/* get our lock and start over */
+			goto again;
 		}
 	}
 }
@@ -2989,6 +3002,7 @@ static int ext3_do_update_inode(handle_t *handle,
 	raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 
 	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+	unlock_buffer(bh);
 	rc = ext3_journal_dirty_metadata(handle, bh);
 	if (!err)
 		err = rc;
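
truncate_restart_transaction() in the first inode.c hunk is a textbook deadlock-avoidance shape: a lock that other journal users may need is dropped for the duration of the blocking restart and retaken afterwards, which is safe only because, as the in-diff comment argues, no conflicting get_block call can run in the meantime. The same shape in a compilable pthread miniature, with all names illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t truncate_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for ext3_journal_restart(): may block waiting on other
 * tasks that themselves need truncate_mutex. */
static int journal_restart(void)
{
	puts("journal restarted");
	return 0;
}

/* Mirrors truncate_restart_transaction(): drop the mutex across the
 * blocking call, retake it, and return the call's result. */
static int restart_transaction(void)
{
	int ret;

	pthread_mutex_unlock(&truncate_mutex);
	ret = journal_restart();
	pthread_mutex_lock(&truncate_mutex);
	return ret;
}

int main(void)
{
	pthread_mutex_lock(&truncate_mutex);
	restart_transaction();
	pthread_mutex_unlock(&truncate_mutex);
	return 0;
}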
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ff7b9730234..aad6400c9b77 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = {
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext3_permission,
+	.check_acl	= ext3_check_acl,
 };
 
 const struct inode_operations ext3_special_inode_operations = {
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = {
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext3_permission,
+	.check_acl	= ext3_check_acl,
 };
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 524b349c6299..a8d80a7f1105 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
 #endif
 }
 
+static char *data_mode_string(unsigned long mode)
+{
+	switch (mode) {
+	case EXT3_MOUNT_JOURNAL_DATA:
+		return "journal";
+	case EXT3_MOUNT_ORDERED_DATA:
+		return "ordered";
+	case EXT3_MOUNT_WRITEBACK_DATA:
+		return "writeback";
+	}
+	return "unknown";
+}
+
 /*
  * Show an option if
  *  - it's set to a non-default value OR
@@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 
-	if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
-		seq_puts(seq, ",data=journal");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
-		seq_puts(seq, ",data=ordered");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
-		seq_puts(seq, ",data=writeback");
-
+	seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
+						     EXT3_MOUNT_DATA_FLAGS));
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");
 
@@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb,
 		datacheck:
 			if (is_remount) {
 				if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
-						!= data_opt) {
-					printk(KERN_ERR
-						"EXT3-fs: cannot change data "
-						"mode on remount\n");
-					return 0;
-				}
+						== data_opt)
+					break;
+				printk(KERN_ERR
+					"EXT3-fs (device %s): Cannot change "
+					"data mode on remount. The filesystem "
+					"is mounted in data=%s mode and you "
+					"try to remount it in data=%s mode.\n",
+					sb->s_id,
+					data_mode_string(sbi->s_mount_opt &
+							 EXT3_MOUNT_DATA_FLAGS),
+					data_mode_string(data_opt));
+				return 0;
 			} else {
 				sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
 				sbi->s_mount_opt |= data_opt;
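
data_mode_string() exists so both the mount-option display and the remount rejection can name journalling modes symbolically instead of via hand-rolled if/else chains, and so the error message can report both the active and the requested mode. A compilable miniature of the improved diagnostic; the flag values here are arbitrary illustrative encodings, not the real EXT3_MOUNT_* values:

#include <stdio.h>

/* Illustrative two-bit mode field; not the kernel's actual values. */
#define MOUNT_JOURNAL_DATA   0x1
#define MOUNT_ORDERED_DATA   0x2
#define MOUNT_WRITEBACK_DATA 0x3
#define MOUNT_DATA_FLAGS     0x3    /* mask covering the mode field */

static const char *data_mode_string(unsigned long mode)
{
	switch (mode) {
	case MOUNT_JOURNAL_DATA:   return "journal";
	case MOUNT_ORDERED_DATA:   return "ordered";
	case MOUNT_WRITEBACK_DATA: return "writeback";
	}
	return "unknown";
}

int main(void)
{
	unsigned long mounted = MOUNT_ORDERED_DATA;
	unsigned long requested = MOUNT_WRITEBACK_DATA;

	/* Name both modes, as the patched printk() does. */
	if ((mounted & MOUNT_DATA_FLAGS) != requested)
		printf("cannot change data mode on remount: "
		       "mounted data=%s, requested data=%s\n",
		       data_mode_string(mounted & MOUNT_DATA_FLAGS),
		       data_mode_string(requested));
	return 0;
}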
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..d5c0ea2e8f2d 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT
 
 	  To enable backwards compatibility so that systems that are
 	  still expecting to mount ext4 filesystems using ext4dev,
-	  chose Y here. This feature will go away by 2.6.31, so
+	  choose Y here. This feature will go away by 2.6.31, so
 	  please arrange to get your userspace programs fixed!
 
 config EXT4_FS_XATTR
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+	bool "EXT4 debugging support"
+	depends on EXT4_FS
+	help
+	  Enables run-time debugging support for the ext4 filesystem.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f6d8967149ca..0df88b2a69b0 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	return error;
 }
 
-static int
+int
 ext4_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int
-ext4_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, ext4_check_acl);
-}
-
 /*
  * Initialize the ACLs of a new inode. Called from ext4_new_inode.
  *
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 949789d2bba6..9d843d5deac4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext4_permission(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
-#define ext4_permission NULL
+#define ext4_check_acl NULL
 
 static inline int
 ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..1d0418980f8d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	 * new bitmap information
 	 */
 	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	ext4_mb_update_group_info(grp, blocks_freed);
+	grp->bb_free += blocks_freed;
 	up_write(&grp->alloc_sem);
 
 	/* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..e227eea23f05 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t;
 
 
 /* prefer goal again. length */
-#define EXT4_MB_HINT_MERGE		1
+#define EXT4_MB_HINT_MERGE		0x0001
 /* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED		2
+#define EXT4_MB_HINT_RESERVED		0x0002
 /* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA		4
+#define EXT4_MB_HINT_METADATA		0x0004
 /* first blocks in the file */
-#define EXT4_MB_HINT_FIRST		8
+#define EXT4_MB_HINT_FIRST		0x0008
 /* search for the best chunk */
-#define EXT4_MB_HINT_BEST		16
+#define EXT4_MB_HINT_BEST		0x0010
 /* data is being allocated */
-#define EXT4_MB_HINT_DATA		32
+#define EXT4_MB_HINT_DATA		0x0020
 /* don't preallocate (for tails) */
-#define EXT4_MB_HINT_NOPREALLOC		64
+#define EXT4_MB_HINT_NOPREALLOC		0x0040
 /* allocate for locality group */
-#define EXT4_MB_HINT_GROUP_ALLOC	128
+#define EXT4_MB_HINT_GROUP_ALLOC	0x0080
 /* allocate goal blocks or none */
-#define EXT4_MB_HINT_GOAL_ONLY		256
+#define EXT4_MB_HINT_GOAL_ONLY		0x0100
 /* goal is meaningful */
-#define EXT4_MB_HINT_TRY_GOAL		512
+#define EXT4_MB_HINT_TRY_GOAL		0x0200
 /* blocks already pre-reserved by delayed allocation */
-#define EXT4_MB_DELALLOC_RESERVED	1024
+#define EXT4_MB_DELALLOC_RESERVED	0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC		0x0800
 
 
 struct ext4_allocation_request {
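
Rewriting the allocator hints in hex makes their bitmask nature explicit: callers OR several hints into one flags word and the allocator tests them individually, so every value must be a distinct power of two (the new 0x0800 slot simply continues the series). A minimal demonstration using stand-alone copies of three of the bits:

#include <stdio.h>

/* Local copies of the hint bits, for demonstration only. */
#define MB_HINT_DATA       0x0020  /* data is being allocated */
#define MB_HINT_NOPREALLOC 0x0040  /* don't preallocate (for tails) */
#define MB_STREAM_ALLOC    0x0800  /* we are doing stream allocation */

int main(void)
{
	unsigned int flags = MB_HINT_DATA | MB_STREAM_ALLOC;

	/* Each test isolates one bit; hex values make the layout obvious. */
	printf("data?       %s\n", (flags & MB_HINT_DATA) ? "yes" : "no");
	printf("stream?     %s\n", (flags & MB_STREAM_ALLOC) ? "yes" : "no");
	printf("noprealloc? %s\n", (flags & MB_HINT_NOPREALLOC) ? "yes" : "no");
	return 0;
}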
@@ -112,6 +114,21 @@ struct ext4_allocation_request {
 };
 
 /*
+ * For delayed allocation tracking
+ */
+struct mpage_da_data {
+	struct inode *inode;
+	sector_t b_blocknr;			/* start block number of extent */
+	size_t b_size;				/* size of extent */
+	unsigned long b_state;			/* state of the extent */
+	unsigned long first_page, next_page;	/* extent of pages */
+	struct writeback_control *wbc;
+	int io_done;
+	int pages_written;
+	int retval;
+};
+
+/*
  * Special inodes numbers
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
@@ -251,7 +268,6 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL		0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
-#define EXT4_EXT_MIGRATE		0x00100000 /* Inode is migrating */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
@@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
 #define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
+#define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -386,6 +403,9 @@ struct ext4_mount_options {
 #endif
 };
 
+/* Max physical block we can addres w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
+
 /*
  * Structure of an inode on the disk
  */
@@ -456,7 +476,6 @@ struct move_extent {
 	__u64 len;		/* block length to be moved */
 	__u64 moved_len;	/* moved block length */
 };
-#define MAX_DEFRAG_SIZE		((1UL<<31) - 1)
 
 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -694,7 +713,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
-#define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
@@ -841,6 +859,7 @@ struct ext4_sb_info {
 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
 	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
 	ext4_group_t s_groups_count;	/* Number of groups in the fs */
+	ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
 	unsigned long s_overhead_last;	/* Last calculated overhead */
 	unsigned long s_blocks_last;	/* Last seen block count */
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
@@ -950,6 +969,7 @@ struct ext4_sb_info {
 	atomic_t s_mb_lost_chunks;
 	atomic_t s_mb_preallocated;
 	atomic_t s_mb_discarded;
+	atomic_t s_lock_busy;
 
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
@@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
 			ext4_fsblk_t, unsigned long, int, unsigned long *);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 			ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
-			ext4_grpblk_t add);
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
@@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 struct ext4_group_info {
 	unsigned long bb_state;
 	struct rb_root bb_free_root;
-	unsigned short bb_first_free;
-	unsigned short bb_free;
-	unsigned short bb_fragments;
+	ext4_grpblk_t bb_first_free;	/* first free block */
+	ext4_grpblk_t bb_free;		/* total free blocks */
+	ext4_grpblk_t bb_fragments;	/* nr of freespace fragments */
 	struct list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
 	void *bb_bitmap;
 #endif
 	struct rw_semaphore alloc_sem;
-	unsigned short bb_counters[];
+	ext4_grpblk_t bb_counters[];	/* Nr of free power-of-two-block
+					 * regions, index is order.
+					 * bb_counters[3] = 5 means
+					 * 5 free 8-block regions. */
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT	0
@@ -1591,15 +1613,42 @@ struct ext4_group_info {
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
 
+#define EXT4_MAX_CONTENTION		8
+#define EXT4_CONTENTION_THRESHOLD	2
+
 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
 					      ext4_group_t group)
 {
 	return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
 }
 
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+	return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
 {
-	spin_lock(ext4_group_lock_ptr(sb, group));
+	spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+	if (spin_trylock(lock))
+		/*
+		 * We're able to grab the lock right away, so drop the
+		 * lock contention counter.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+	else {
+		/*
+		 * The lock is busy, so bump the contention counter,
+		 * and then wait on the spin lock.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+				  EXT4_MAX_CONTENTION);
+		spin_lock(lock);
+	}
 }
 
 static inline void ext4_unlock_group(struct super_block *sb,
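
ext4_lock_group() now maintains a saturating per-filesystem contention estimate: a successful trylock decays the counter toward 0, a failed trylock bumps it toward EXT4_MAX_CONTENTION, and ext4_fs_is_busy() lets the allocator switch strategies once the counter crosses the threshold. A portable C11 miniature of the same bookkeeping, using a pthread mutex in place of the kernel spinlock; all names and limits are illustrative:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define MAX_CONTENTION       8
#define CONTENTION_THRESHOLD 2

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int lock_busy;    /* saturating contention estimate */

/* add_unless(): add 'delta' unless the counter already equals 'limit',
 * mimicking the kernel's atomic_add_unless(). */
static void add_unless(atomic_int *v, int delta, int limit)
{
	int old = atomic_load(v);

	while (old != limit &&
	       !atomic_compare_exchange_weak(v, &old, old + delta))
		;       /* CAS failure reloads 'old'; retry */
}

static void lock_group(void)
{
	if (pthread_mutex_trylock(&group_lock) == 0) {
		add_unless(&lock_busy, -1, 0);  /* uncontended: decay */
	} else {
		add_unless(&lock_busy, 1, MAX_CONTENTION);
		pthread_mutex_lock(&group_lock);        /* now wait */
	}
}

static int fs_is_busy(void)
{
	return atomic_load(&lock_busy) > CONTENTION_THRESHOLD;
}

int main(void)
{
	lock_group();
	printf("busy? %d\n", fs_is_busy());
	pthread_mutex_unlock(&group_lock);
	return 0;
}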
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..61652f1d15e6 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
 #define CHECK_BINSEARCH__
 
 /*
- * If EXT_DEBUG is defined you can use the 'extdebug' mount option
- * to get lots of info about what's going on.
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
  */
 #define EXT_DEBUG__
 #ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
 #define EXT_BREAK	1
 #define EXT_REPEAT	2
 
+/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
 #define EXT_MAX_BLOCK	0xffffffff
 
 /*
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..6a9409920dee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
44 handle, err); 44 handle, err);
45 } 45 }
46 else 46 else
47 brelse(bh); 47 bforget(bh);
48 return err; 48 return err;
49} 49}
50 50
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
60 handle, err); 60 handle, err);
61 } 61 }
62 else 62 else
63 brelse(bh); 63 bforget(bh);
64 return err; 64 return err;
65} 65}
66 66
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
89 ext4_journal_abort_handle(where, __func__, bh, 89 ext4_journal_abort_handle(where, __func__, bh,
90 handle, err); 90 handle, err);
91 } else { 91 } else {
92 mark_buffer_dirty(bh); 92 if (inode && bh)
93 mark_buffer_dirty_inode(bh, inode);
94 else
95 mark_buffer_dirty(bh);
93 if (inode && inode_needs_sync(inode)) { 96 if (inode && inode_needs_sync(inode)) {
94 sync_dirty_buffer(bh); 97 sync_dirty_buffer(bh);
95 if (buffer_req(bh) && !buffer_uptodate(bh)) { 98 if (buffer_req(bh) && !buffer_uptodate(bh)) {
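
The brelse() to bforget() change matters on the success path of a forget/revoke: the buffer's contents are about to become meaningless, so besides dropping the reference the pending dirty state should be cancelled, or the block would still be written back pointlessly after it may have been freed. A toy refcounted buffer showing the difference between "release" and "forget"; all names here are illustrative, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct buf {
	int refcount;
	bool dirty;
};

/* brelse() analogue: drop the reference but leave the dirty state
 * alone, so writeback will still flush the block. */
static void buf_release(struct buf *b)
{
	b->refcount--;
}

/* bforget() analogue: the contents are now meaningless, so cancel
 * the pending write *and* drop the reference. */
static void buf_forget(struct buf *b)
{
	b->dirty = false;
	b->refcount--;
}

int main(void)
{
	struct buf b = { .refcount = 1, .dirty = true };

	buf_forget(&b); /* with buf_release(), dirty would stay set */
	printf("dirty=%d refcount=%d\n", b.dirty, b.refcount);
	(void)buf_release;
	return 0;
}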
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..7a3832577923 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94} 94}
95 95
96static int ext4_ext_journal_restart(handle_t *handle, int needed) 96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode,
98 int needed)
97{ 99{
98 int err; 100 int err;
99 101
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
 	err = ext4_journal_extend(handle, needed);
 	if (err <= 0)
 		return err;
-	return ext4_journal_restart(handle, needed);
+	err = ext4_truncate_restart_trans(handle, inode, needed);
+	/*
+	 * We have dropped i_data_sem so someone might have cached again
+	 * an extent we are going to truncate.
+	 */
+	ext4_ext_invalidate_cache(inode);
+
+	return err;
 }
 
 /*
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 	return newblock;
 }
 
-static int ext4_ext_space_block(struct inode *inode)
+static inline int ext4_ext_space_block(struct inode *inode, int check)
 {
 	int size;
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 6)
-		size = 6;
+		if (size > 6)
+			size = 6;
 #endif
+	}
 	return size;
 }
 
-static int ext4_ext_space_block_idx(struct inode *inode)
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 {
 	int size;
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent_idx);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 5)
-		size = 5;
+		if (size > 5)
+			size = 5;
 #endif
+	}
 	return size;
 }
 
-static int ext4_ext_space_root(struct inode *inode)
+static inline int ext4_ext_space_root(struct inode *inode, int check)
 {
 	int size;
 
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 3)
-		size = 3;
+		if (size > 3)
+			size = 3;
 #endif
+	}
 	return size;
 }
 
-static int ext4_ext_space_root_idx(struct inode *inode)
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 {
 	int size;
 
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent_idx);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 4)
-		size = 4;
+		if (size > 4)
+			size = 4;
 #endif
+	}
 	return size;
 }
 
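
The new check argument separates two users of these capacity helpers: allocation paths pass 0 and get the AGGRESSIVE_TEST clamp (an artificially tiny fan-out used to exercise the tree-growing code), while the on-disk header validation in ext4_ext_max_entries() passes 1 and must see the real limit, otherwise valid filesystems would be rejected under a debug build. The same split in a compilable miniature; sizes here are illustrative:

#include <stdio.h>

#define AGGRESSIVE_TEST         /* pretend the debug clamp is enabled */

#define BLOCK_SIZE 4096
#define ENTRY_SIZE 12           /* sizeof(struct ext4_extent) */

/* check=0: capacity used when building the tree (may be clamped).
 * check=1: capacity used when validating on-disk headers (never clamped). */
static int space_block(int check)
{
	int size = (BLOCK_SIZE - 12 /* header */) / ENTRY_SIZE;

	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 6)
			size = 6;
#endif
	}
	return size;
}

int main(void)
{
	printf("allocate with %d entries/block\n", space_block(0));
	printf("validate against %d entries/block\n", space_block(1));
	return 0;
}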
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
 	int lcap, icap, rcap, leafs, idxs, num;
 	int newextents = blocks;
 
-	rcap = ext4_ext_space_root_idx(inode);
-	lcap = ext4_ext_space_block(inode);
-	icap = ext4_ext_space_block_idx(inode);
+	rcap = ext4_ext_space_root_idx(inode, 0);
+	lcap = ext4_ext_space_block(inode, 0);
+	icap = ext4_ext_space_block_idx(inode, 0);
 
 	/* number of new leaf blocks needed */
 	num = leafs = (newextents + lcap - 1) / lcap;
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 	if (depth == ext_depth(inode)) {
 		if (depth == 0)
-			max = ext4_ext_space_root(inode);
+			max = ext4_ext_space_root(inode, 1);
 		else
-			max = ext4_ext_space_root_idx(inode);
+			max = ext4_ext_space_root_idx(inode, 1);
 	} else {
 		if (depth == 0)
-			max = ext4_ext_space_block(inode);
+			max = ext4_ext_space_block(inode, 1);
 		else
-			max = ext4_ext_space_block_idx(inode);
+			max = ext4_ext_space_block_idx(inode, 1);
 	}
 
 	return max;
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 		ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
 			  idx_pblock(path->p_idx));
 	} else if (path->p_ext) {
-		ext_debug("  %d:%d:%llu ",
+		ext_debug("  %d:[%d]%d:%llu ",
 			  le32_to_cpu(path->p_ext->ee_block),
+			  ext4_ext_is_uninitialized(path->p_ext),
 			  ext4_ext_get_actual_len(path->p_ext),
 			  ext_pblock(path->p_ext));
 	} else
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	eh = path[depth].p_hdr;
 	ex = EXT_FIRST_EXTENT(eh);
 
+	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
-		ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
+		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+			  ext4_ext_is_uninitialized(ex),
 			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
 	}
 	ext_debug("\n");
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode,
 	}
 
 	path->p_ext = l - 1;
-	ext_debug("  -> %d:%llu:%d ",
+	ext_debug("  -> %d:%llu:[%d]%d ",
 		  le32_to_cpu(path->p_ext->ee_block),
 		  ext_pblock(path->p_ext),
+		  ext4_ext_is_uninitialized(path->p_ext),
 		  ext4_ext_get_actual_len(path->p_ext));
 
 #ifdef CHECK_BINSEARCH
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 	eh->eh_depth = 0;
 	eh->eh_entries = 0;
 	eh->eh_magic = EXT4_EXT_MAGIC;
-	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_ext_invalidate_cache(inode);
 	return 0;
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	neh = ext_block_hdr(bh);
 	neh->eh_entries = 0;
-	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	neh->eh_depth = 0;
 	ex = EXT_FIRST_EXTENT(neh);
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		path[depth].p_ext++;
 		while (path[depth].p_ext <=
 				EXT_MAX_EXTENT(path[depth].p_hdr)) {
-			ext_debug("move %d:%llu:%d in new leaf %llu\n",
+			ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 				le32_to_cpu(path[depth].p_ext->ee_block),
 				ext_pblock(path[depth].p_ext),
+				ext4_ext_is_uninitialized(path[depth].p_ext),
 				ext4_ext_get_actual_len(path[depth].p_ext),
 				newblock);
 			/*memmove(ex++, path[depth].p_ext++,
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		neh = ext_block_hdr(bh);
 		neh->eh_entries = cpu_to_le16(1);
 		neh->eh_magic = EXT4_EXT_MAGIC;
-		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
 		neh->eh_depth = cpu_to_le16(depth - i);
 		fidx = EXT_FIRST_INDEX(neh);
 		fidx->ei_block = border;
@@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	/* old root could have indexes or leaves
 	 * so calculate e_max right way */
 	if (ext_depth(inode))
-		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
 	else
-		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
@@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 		goto out;
 
 	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
-	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
+	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
 	curp->p_hdr->eh_entries = cpu_to_le16(1);
 	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
 
@@ -1580,9 +1603,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 
 	/* try to insert block into found extent and return */
 	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
-		ext_debug("append %d block to %d:%d (from %llu)\n",
+		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+			  ext4_ext_is_uninitialized(newext),
 			  ext4_ext_get_actual_len(newext),
 			  le32_to_cpu(ex->ee_block),
+			  ext4_ext_is_uninitialized(ex),
 			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
@@ -1651,9 +1676,10 @@ has_space:
 
 	if (!nearex) {
 		/* there is no extent in this leaf, create first one */
-		ext_debug("first extent in the leaf: %d:%llu:%d\n",
+		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
 			  le32_to_cpu(newext->ee_block),
 			  ext_pblock(newext),
+			  ext4_ext_is_uninitialized(newext),
 			  ext4_ext_get_actual_len(newext));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
 	} else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1689,11 @@ has_space:
 			len = EXT_MAX_EXTENT(eh) - nearex;
 			len = (len - 1) * sizeof(struct ext4_extent);
 			len = len < 0 ? 0 : len;
-			ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
+			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
 					"move %d from 0x%p to 0x%p\n",
 					le32_to_cpu(newext->ee_block),
 					ext_pblock(newext),
+					ext4_ext_is_uninitialized(newext),
 					ext4_ext_get_actual_len(newext),
 					nearex, len, nearex + 1, nearex + 2);
 			memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1703,11 @@ has_space:
 		BUG_ON(newext->ee_block == nearex->ee_block);
 		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
 		len = len < 0 ? 0 : len;
-		ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
+		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
 				ext_pblock(newext),
+				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
 		memmove(nearex + 1, nearex, len);
@@ -2094,7 +2122,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		else
 			uninitialized = 0;
 
-		ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+			  uninitialized, ex_ee_len);
 		path[depth].p_ext = ex;
 
 		a = ex_ee_block > start ? ex_ee_block : start;
@@ -2138,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		}
 		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
 
-		err = ext4_ext_journal_restart(handle, credits);
+		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
 		if (err)
 			goto out;
 
@@ -2327,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 		if (err == 0) {
 			ext_inode_hdr(inode)->eh_depth = 0;
 			ext_inode_hdr(inode)->eh_max =
-				cpu_to_le16(ext4_ext_space_root(inode));
+				cpu_to_le16(ext4_ext_space_root(inode, 0));
 			err = ext4_ext_dirty(handle, inode, path);
 		}
 	}
@@ -2743,6 +2772,7 @@ insert:
 	} else if (err)
 		goto fix_extent_len;
 out:
+	ext4_ext_show_leaf(inode, path);
 	return err ? err : allocated;
 
 fix_extent_len:
@@ -2786,7 +2816,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_allocation_request ar;
 
 	__clear_bit(BH_New, &bh_result->b_state);
-	ext_debug("blocks %u/%u requested for inode %u\n",
+	ext_debug("blocks %u/%u requested for inode %lu\n",
 		  iblock, max_blocks, inode->i_ino);
 
 	/* check in cache */
@@ -2849,7 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		newblock = iblock - ee_block + ee_start;
 		/* number of remaining blocks in the extent */
 		allocated = ee_len - (iblock - ee_block);
-		ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
+		ext_debug("%u fit into %u:%d -> %llu\n", iblock,
 			  ee_block, ee_len, newblock);
 
 		/* Do not put uninitialized extent in the cache */
@@ -2950,7 +2980,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2950 newblock = ext4_mb_new_blocks(handle, &ar, &err); 2980 newblock = ext4_mb_new_blocks(handle, &ar, &err);
2951 if (!newblock) 2981 if (!newblock)
2952 goto out2; 2982 goto out2;
2953 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2983 ext_debug("allocate new block: goal %llu, found %llu/%u\n",
2954 ar.goal, newblock, allocated); 2984 ar.goal, newblock, allocated);
2955 2985
2956 /* try to insert new extent into found leaf and return */ 2986 /* try to insert new extent into found leaf and return */
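
Most of the extents.c hunks above are debug-format repairs rather than behavioral changes: ext4_lblk_t values now print with %u, inode->i_ino with %lu, and uninitialized extents get a [%d] marker in ext_debug() output. A minimal userspace sketch of the specifier/type mismatch these hunks correct (the variable names are illustrative, not kernel symbols):

	#include <stdio.h>

	int main(void)
	{
		unsigned int lblk = 2882;	/* stands in for an ext4_lblk_t */
		unsigned long ino = 12;		/* stands in for inode->i_ino */

		/* Mismatched specifiers (%lu for a 32-bit value, %u for a
		 * long) are undefined behavior and print garbage on common
		 * 64-bit ABIs; gcc -Wformat flags exactly this class of bug. */
		printf("blocks %u requested for inode %lu\n", lblk, ino);
		return 0;
	}
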
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873fef1c6..5ca3eca70a1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
58ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 58ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
59 unsigned long nr_segs, loff_t pos) 59 unsigned long nr_segs, loff_t pos)
60{ 60{
61 struct file *file = iocb->ki_filp; 61 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
62 struct inode *inode = file->f_path.dentry->d_inode;
63 ssize_t ret;
64 int err;
65 62
66 /* 63 /*
67 * If we have encountered a bitmap-format file, the size limit 64 * If we have encountered a bitmap-format file, the size limit
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
81 } 78 }
82 } 79 }
83 80
84 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 81 return generic_file_aio_write(iocb, iov, nr_segs, pos);
85 /*
86 * Skip flushing if there was an error, or if nothing was written.
87 */
88 if (ret <= 0)
89 return ret;
90
91 /*
92 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
93 * journalling then we need to make sure that we force the transaction
94 * to disk to keep all metadata uptodate synchronously.
95 */
96 if (file->f_flags & O_SYNC) {
97 /*
98 * If we are non-data-journaled, then the dirty data has
99 * already been flushed to backing store by generic_osync_inode,
100 * and the inode has been flushed too if there have been any
101 * modifications other than mere timestamp updates.
102 *
103 * Open question --- do we care about flushing timestamps too
104 * if the inode is IS_SYNC?
105 */
106 if (!ext4_should_journal_data(inode))
107 return ret;
108
109 goto force_commit;
110 }
111
112 /*
113 * So we know that there has been no forced data flush. If the inode
114 * is marked IS_SYNC, we need to force one ourselves.
115 */
116 if (!IS_SYNC(inode))
117 return ret;
118
119 /*
120 * Open question #2 --- should we force data to disk here too? If we
121 * don't, the only impact is that data=writeback filesystems won't
122 * flush data to disk automatically on IS_SYNC, only metadata (but
123 * historically, that is what ext2 has done.)
124 */
125
126force_commit:
127 err = ext4_force_commit(inode->i_sb);
128 if (err)
129 return err;
130 return ret;
131} 82}
132 83
133static struct vm_operations_struct ext4_file_vm_ops = { 84static struct vm_operations_struct ext4_file_vm_ops = {
@@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = {
207 .listxattr = ext4_listxattr, 158 .listxattr = ext4_listxattr,
208 .removexattr = generic_removexattr, 159 .removexattr = generic_removexattr,
209#endif 160#endif
210 .permission = ext4_permission, 161 .check_acl = ext4_check_acl,
211 .fallocate = ext4_fallocate, 162 .fallocate = ext4_fallocate,
212 .fiemap = ext4_fiemap, 163 .fiemap = ext4_fiemap,
213}; 164};
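
The large deletion in ext4_file_write() drops ext4's hand-rolled O_SYNC/IS_SYNC handling; by this point the generic write path together with ->fsync() is expected to provide the same durability, so the filesystem-side decision tree goes away. The switch from .permission to .check_acl likewise hands the boilerplate to generic permission code. A condensed sketch of the shape of the removed logic, for reference only (ext4_should_journal_data() and ext4_force_commit() are the real helpers; the wrapper is not):

	static ssize_t old_style_sync_write(struct kiocb *iocb,
					    const struct iovec *iov,
					    unsigned long nr_segs, loff_t pos)
	{
		struct file *file = iocb->ki_filp;
		struct inode *inode = file->f_path.dentry->d_inode;
		ssize_t ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
		int err;

		if (ret <= 0)
			return ret;		/* error, or nothing written */

		if (file->f_flags & O_SYNC) {
			/* only data journalling still needed a forced commit */
			if (!ext4_should_journal_data(inode))
				return ret;
			err = ext4_force_commit(inode->i_sb);
		} else if (IS_SYNC(inode)) {
			err = ext4_force_commit(inode->i_sb);
		} else {
			return ret;
		}
		return err ? err : ret;
	}
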
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..07475740b512 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
50{ 50{
51 struct inode *inode = dentry->d_inode; 51 struct inode *inode = dentry->d_inode;
52 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 52 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
53 int ret = 0; 53 int err, ret = 0;
54 54
55 J_ASSERT(ext4_journal_current_handle() == NULL); 55 J_ASSERT(ext4_journal_current_handle() == NULL);
56 56
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
79 goto out; 79 goto out;
80 } 80 }
81 81
82 if (!journal)
83 ret = sync_mapping_buffers(inode->i_mapping);
84
82 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 85 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
83 goto out; 86 goto out;
84 87
@@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
91 .sync_mode = WB_SYNC_ALL, 94 .sync_mode = WB_SYNC_ALL,
92 .nr_to_write = 0, /* sys_fsync did this */ 95 .nr_to_write = 0, /* sys_fsync did this */
93 }; 96 };
94 ret = sync_inode(inode, &wbc); 97 err = sync_inode(inode, &wbc);
95 if (journal && (journal->j_flags & JBD2_BARRIER)) 98 if (ret == 0)
96 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 99 ret = err;
97 } 100 }
98out: 101out:
102 if (journal && (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
99 return ret; 104 return ret;
100} 105}
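
The fsync changes do two things: without a journal, ext4_sync_file() now falls back to sync_mapping_buffers() (and sync_inode()'s return value is no longer silently dropped), and the barrier flush moves below the out: label so every exit path reaches it. A reduced sketch of the resulting control flow, with the journal-commit logic elided (the helpers named are the ones used above):

	static int sync_file_flow_sketch(struct inode *inode, journal_t *journal)
	{
		int ret = 0;

		if (!journal)		/* no-journal mode: write out the
					 * mapping's associated buffers */
			ret = sync_mapping_buffers(inode->i_mapping);

		/* ... force or wait for the journal commit as above ... */

		/* now runs on every path, so early "goto out" exits still
		 * flush the drive's volatile write cache */
		if (journal && (journal->j_flags & JBD2_BARRIER))
			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
		return ret;
	}
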
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1189 1189
1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
1192 i, ext4_free_inodes_count(sb, gdp), x); 1192 (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
1193 bitmap_count += x; 1193 bitmap_count += x;
1194 } 1194 }
1195 brelse(bitmap_bh); 1195 brelse(bitmap_bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..4abd683b963d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
192 * so before we call here everything must be consistently dirtied against 192 * so before we call here everything must be consistently dirtied against
193 * this transaction. 193 * this transaction.
194 */ 194 */
195static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 195 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
196 int nblocks)
196{ 197{
198 int ret;
199
200 /*
201 * Drop i_data_sem to avoid deadlock with ext4_get_blocks. At this
202 * moment, get_block can be called only for blocks inside i_size since
203 * page cache has been already dropped and writes are blocked by
204 * i_mutex. So we can safely drop the i_data_sem here.
205 */
197 BUG_ON(EXT4_JOURNAL(inode) == NULL); 206 BUG_ON(EXT4_JOURNAL(inode) == NULL);
198 jbd_debug(2, "restarting handle %p\n", handle); 207 jbd_debug(2, "restarting handle %p\n", handle);
199 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 208 up_write(&EXT4_I(inode)->i_data_sem);
209 ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
210 down_write(&EXT4_I(inode)->i_data_sem);
211
212 return ret;
200} 213}
201 214
202/* 215/*
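
This hunk is the heart of the deadlock fix that gives the series its truncate-restart helpers: ext4_get_blocks() can hold an open transaction handle while waiting for i_data_sem, and a handle that never stops means the running transaction can never commit; truncate, meanwhile, used to hold i_data_sem while ext4_journal_restart() waited for exactly that commit. A restatement of the hunk with the cycle spelled out (condensed, not a drop-in):

	static int restart_truncate_handle(handle_t *handle, struct inode *inode)
	{
		int ret;

		/*
		 * truncate:        holds i_data_sem, restart waits for the
		 *                  running transaction to commit;
		 * ext4_get_blocks: holds an open handle (so that commit can
		 *                  never finish) and waits for i_data_sem.
		 * Dropping i_data_sem here breaks the cycle; it is safe only
		 * because the page cache is already gone and i_mutex keeps
		 * writers out, as the comment above argues.
		 */
		up_write(&EXT4_I(inode)->i_data_sem);
		ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
		down_write(&EXT4_I(inode)->i_data_sem);
		return ret;
	}
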
@@ -341,9 +354,7 @@ static int ext4_block_to_path(struct inode *inode,
341 int n = 0; 354 int n = 0;
342 int final = 0; 355 int final = 0;
343 356
344 if (i_block < 0) { 357 if (i_block < direct_blocks) {
345 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
346 } else if (i_block < direct_blocks) {
347 offsets[n++] = i_block; 358 offsets[n++] = i_block;
348 final = direct_blocks; 359 final = direct_blocks;
349 } else if ((i_block -= direct_blocks) < indirect_blocks) { 360 } else if ((i_block -= direct_blocks) < indirect_blocks) {
@@ -551,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
551 * 562 *
552 * Normally this function finds the preferred place for block allocation, 563 * Normally this function finds the preferred place for block allocation,
553 * and returns it. 564 * and returns it.
565 * Because this is only used for non-extent files, we limit the block nr
566 * to 32 bits.
554 */ 567 */
555static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 568static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
556 Indirect *partial) 569 Indirect *partial)
557{ 570{
571 ext4_fsblk_t goal;
572
558 /* 573 /*
559 * XXX need to get goal block from mballoc's data structures 574 * XXX need to get goal block from mballoc's data structures
560 */ 575 */
561 576
562 return ext4_find_near(inode, partial); 577 goal = ext4_find_near(inode, partial);
578 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
579 return goal;
563} 580}
564 581
565/** 582/**
@@ -640,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
640 if (*err) 657 if (*err)
641 goto failed_out; 658 goto failed_out;
642 659
660 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
661
643 target -= count; 662 target -= count;
644 /* allocate blocks for indirect blocks */ 663 /* allocate blocks for indirect blocks */
645 while (index < indirect_blks && count) { 664 while (index < indirect_blks && count) {
@@ -674,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
674 ar.flags = EXT4_MB_HINT_DATA; 693 ar.flags = EXT4_MB_HINT_DATA;
675 694
676 current_block = ext4_mb_new_blocks(handle, &ar, err); 695 current_block = ext4_mb_new_blocks(handle, &ar, err);
696 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
677 697
678 if (*err && (target == blks)) { 698 if (*err && (target == blks)) {
679 /* 699 /*
@@ -762,8 +782,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
762 BUFFER_TRACE(bh, "call get_create_access"); 782 BUFFER_TRACE(bh, "call get_create_access");
763 err = ext4_journal_get_create_access(handle, bh); 783 err = ext4_journal_get_create_access(handle, bh);
764 if (err) { 784 if (err) {
785 /* Don't brelse(bh) here; it's done in
786 * ext4_journal_forget() below */
765 unlock_buffer(bh); 787 unlock_buffer(bh);
766 brelse(bh);
767 goto failed; 788 goto failed;
768 } 789 }
769 790
@@ -1109,16 +1130,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1109 ext4_discard_preallocations(inode); 1130 ext4_discard_preallocations(inode);
1110} 1131}
1111 1132
1112static int check_block_validity(struct inode *inode, sector_t logical, 1133static int check_block_validity(struct inode *inode, const char *msg,
1113 sector_t phys, int len) 1134 sector_t logical, sector_t phys, int len)
1114{ 1135{
1115 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1136 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1116 ext4_error(inode->i_sb, "check_block_validity", 1137 ext4_error(inode->i_sb, msg,
1117 "inode #%lu logical block %llu mapped to %llu " 1138 "inode #%lu logical block %llu mapped to %llu "
1118 "(size %d)", inode->i_ino, 1139 "(size %d)", inode->i_ino,
1119 (unsigned long long) logical, 1140 (unsigned long long) logical,
1120 (unsigned long long) phys, len); 1141 (unsigned long long) phys, len);
1121 WARN_ON(1);
1122 return -EIO; 1142 return -EIO;
1123 } 1143 }
1124 return 0; 1144 return 0;
@@ -1170,8 +1190,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1170 up_read((&EXT4_I(inode)->i_data_sem)); 1190 up_read((&EXT4_I(inode)->i_data_sem));
1171 1191
1172 if (retval > 0 && buffer_mapped(bh)) { 1192 if (retval > 0 && buffer_mapped(bh)) {
1173 int ret = check_block_validity(inode, block, 1193 int ret = check_block_validity(inode, "file system corruption",
1174 bh->b_blocknr, retval); 1194 block, bh->b_blocknr, retval);
1175 if (ret != 0) 1195 if (ret != 0)
1176 return ret; 1196 return ret;
1177 } 1197 }
@@ -1235,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1235 * i_data's format changing. Force the migrate 1255 * i_data's format changing. Force the migrate
1236 * to fail by clearing migrate flags 1256 * to fail by clearing migrate flags
1237 */ 1257 */
1238 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 1258 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
1239 ~EXT4_EXT_MIGRATE;
1240 } 1259 }
1241 } 1260 }
1242 1261
@@ -1252,8 +1271,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1252 1271
1253 up_write((&EXT4_I(inode)->i_data_sem)); 1272 up_write((&EXT4_I(inode)->i_data_sem));
1254 if (retval > 0 && buffer_mapped(bh)) { 1273 if (retval > 0 && buffer_mapped(bh)) {
1255 int ret = check_block_validity(inode, block, 1274 int ret = check_block_validity(inode, "file system "
1256 bh->b_blocknr, retval); 1275 "corruption after allocation",
1276 block, bh->b_blocknr, retval);
1257 if (ret != 0) 1277 if (ret != 0)
1258 return ret; 1278 return ret;
1259 } 1279 }
@@ -1863,18 +1883,6 @@ static void ext4_da_page_release_reservation(struct page *page,
1863 * Delayed allocation stuff 1883 * Delayed allocation stuff
1864 */ 1884 */
1865 1885
1866struct mpage_da_data {
1867 struct inode *inode;
1868 sector_t b_blocknr; /* start block number of extent */
1869 size_t b_size; /* size of extent */
1870 unsigned long b_state; /* state of the extent */
1871 unsigned long first_page, next_page; /* extent of pages */
1872 struct writeback_control *wbc;
1873 int io_done;
1874 int pages_written;
1875 int retval;
1876};
1877
1878/* 1886/*
1879 * mpage_da_submit_io - walks through extent of pages and try to write 1887 * mpage_da_submit_io - walks through extent of pages and try to write
1880 * them with writepage() call back 1888 * them with writepage() call back
@@ -2737,6 +2745,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2737 long pages_skipped; 2745 long pages_skipped;
2738 int range_cyclic, cycled = 1, io_done = 0; 2746 int range_cyclic, cycled = 1, io_done = 0;
2739 int needed_blocks, ret = 0, nr_to_writebump = 0; 2747 int needed_blocks, ret = 0, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start;
2740 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2741 2750
2742 trace_ext4_da_writepages(inode, wbc); 2751 trace_ext4_da_writepages(inode, wbc);
@@ -2850,6 +2859,7 @@ retry:
2850 mpd.io_done = 1; 2859 mpd.io_done = 1;
2851 ret = MPAGE_DA_EXTENT_TAIL; 2860 ret = MPAGE_DA_EXTENT_TAIL;
2852 } 2861 }
2862 trace_ext4_da_write_pages(inode, &mpd);
2853 wbc->nr_to_write -= mpd.pages_written; 2863 wbc->nr_to_write -= mpd.pages_written;
2854 2864
2855 ext4_journal_stop(handle); 2865 ext4_journal_stop(handle);
@@ -2905,6 +2915,7 @@ out_writepages:
2905 if (!no_nrwrite_index_update) 2915 if (!no_nrwrite_index_update)
2906 wbc->no_nrwrite_index_update = 0; 2916 wbc->no_nrwrite_index_update = 0;
2907 wbc->nr_to_write -= nr_to_writebump; 2917 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start;
2908 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2909 return ret; 2920 return ret;
2910} 2921}
@@ -3117,6 +3128,8 @@ out:
3117 */ 3128 */
3118int ext4_alloc_da_blocks(struct inode *inode) 3129int ext4_alloc_da_blocks(struct inode *inode)
3119{ 3130{
3131 trace_ext4_alloc_da_blocks(inode);
3132
3120 if (!EXT4_I(inode)->i_reserved_data_blocks && 3133 if (!EXT4_I(inode)->i_reserved_data_blocks &&
3121 !EXT4_I(inode)->i_reserved_meta_blocks) 3134 !EXT4_I(inode)->i_reserved_meta_blocks)
3122 return 0; 3135 return 0;
@@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3659 ext4_handle_dirty_metadata(handle, inode, bh); 3672 ext4_handle_dirty_metadata(handle, inode, bh);
3660 } 3673 }
3661 ext4_mark_inode_dirty(handle, inode); 3674 ext4_mark_inode_dirty(handle, inode);
3662 ext4_journal_test_restart(handle, inode); 3675 ext4_truncate_restart_trans(handle, inode,
3676 blocks_for_truncate(inode));
3663 if (bh) { 3677 if (bh) {
3664 BUFFER_TRACE(bh, "retaking write access"); 3678 BUFFER_TRACE(bh, "retaking write access");
3665 ext4_journal_get_write_access(handle, bh); 3679 ext4_journal_get_write_access(handle, bh);
@@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3870 return; 3884 return;
3871 if (try_to_extend_transaction(handle, inode)) { 3885 if (try_to_extend_transaction(handle, inode)) {
3872 ext4_mark_inode_dirty(handle, inode); 3886 ext4_mark_inode_dirty(handle, inode);
3873 ext4_journal_test_restart(handle, inode); 3887 ext4_truncate_restart_trans(handle, inode,
3888 blocks_for_truncate(inode));
3874 } 3889 }
3875 3890
3876 ext4_free_blocks(handle, inode, nr, 1, 1); 3891 ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3958,8 +3973,7 @@ void ext4_truncate(struct inode *inode)
3958 if (!ext4_can_truncate(inode)) 3973 if (!ext4_can_truncate(inode))
3959 return; 3974 return;
3960 3975
3961 if (ei->i_disksize && inode->i_size == 0 && 3976 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3962 !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3963 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 3977 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3964 3978
3965 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3979 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4533,7 +4547,8 @@ static int ext4_inode_blocks_set(handle_t *handle,
4533 */ 4547 */
4534static int ext4_do_update_inode(handle_t *handle, 4548static int ext4_do_update_inode(handle_t *handle,
4535 struct inode *inode, 4549 struct inode *inode,
4536 struct ext4_iloc *iloc) 4550 struct ext4_iloc *iloc,
4551 int do_sync)
4537{ 4552{
4538 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4553 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4539 struct ext4_inode_info *ei = EXT4_I(inode); 4554 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4581,8 +4596,7 @@ static int ext4_do_update_inode(handle_t *handle,
4581 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 4596 if (ext4_inode_blocks_set(handle, raw_inode, ei))
4582 goto out_brelse; 4597 goto out_brelse;
4583 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4598 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4584 /* clear the migrate flag in the raw_inode */ 4599 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
4585 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
4586 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4600 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
4587 cpu_to_le32(EXT4_OS_HURD)) 4601 cpu_to_le32(EXT4_OS_HURD))
4588 raw_inode->i_file_acl_high = 4602 raw_inode->i_file_acl_high =
@@ -4635,10 +4649,22 @@ static int ext4_do_update_inode(handle_t *handle,
4635 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4649 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4636 } 4650 }
4637 4651
4638 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4652 /*
4639 rc = ext4_handle_dirty_metadata(handle, inode, bh); 4653 * If we're not using a journal and we were called from
4640 if (!err) 4654 * ext4_write_inode() to sync the inode (making do_sync true),
4641 err = rc; 4655 * we can just use sync_dirty_buffer() directly to do our dirty
4656 * work. Testing s_journal here is a bit redundant but it's
4657 * worth it to avoid potential future trouble.
4658 */
4659 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4660 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4661 sync_dirty_buffer(bh);
4662 } else {
4663 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4664 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4665 if (!err)
4666 err = rc;
4667 }
4642 ei->i_state &= ~EXT4_STATE_NEW; 4668 ei->i_state &= ~EXT4_STATE_NEW;
4643 4669
4644out_brelse: 4670out_brelse:
@@ -4684,19 +4710,32 @@ out_brelse:
4684 */ 4710 */
4685int ext4_write_inode(struct inode *inode, int wait) 4711int ext4_write_inode(struct inode *inode, int wait)
4686{ 4712{
4713 int err;
4714
4687 if (current->flags & PF_MEMALLOC) 4715 if (current->flags & PF_MEMALLOC)
4688 return 0; 4716 return 0;
4689 4717
4690 if (ext4_journal_current_handle()) { 4718 if (EXT4_SB(inode->i_sb)->s_journal) {
4691 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4719 if (ext4_journal_current_handle()) {
4692 dump_stack(); 4720 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4693 return -EIO; 4721 dump_stack();
4694 } 4722 return -EIO;
4723 }
4695 4724
4696 if (!wait) 4725 if (!wait)
4697 return 0; 4726 return 0;
4727
4728 err = ext4_force_commit(inode->i_sb);
4729 } else {
4730 struct ext4_iloc iloc;
4698 4731
4699 return ext4_force_commit(inode->i_sb); 4732 err = ext4_get_inode_loc(inode, &iloc);
4733 if (err)
4734 return err;
4735 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
4736 inode, &iloc, wait);
4737 }
4738 return err;
4700} 4739}
4701 4740
4702/* 4741/*
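
With no-journal mode, inode writeback forks into two durability strategies: a journalled filesystem forces a commit, an unjournalled one writes the raw inode buffer itself (the wait flag becomes the do_sync argument and ends in sync_dirty_buffer(), as the ext4_do_update_inode() hunk earlier shows; EXT4_NOJOURNAL_HANDLE is the tree's sentinel for "no real transaction"). The two paths, condensed for readers skimming the side-by-side diff:

	/* sketch of ext4_write_inode()'s new structure, not a drop-in */
	static int write_inode_sketch(struct inode *inode, int wait)
	{
		int err;

		if (EXT4_SB(inode->i_sb)->s_journal) {
			if (!wait)
				return 0;	/* a later commit catches it */
			err = ext4_force_commit(inode->i_sb);
		} else {
			struct ext4_iloc iloc;

			err = ext4_get_inode_loc(inode, &iloc);
			if (!err)	/* wait => sync_dirty_buffer() inside */
				err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
							   inode, &iloc, wait);
		}
		return err;
	}
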
@@ -4990,7 +5029,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
4990 get_bh(iloc->bh); 5029 get_bh(iloc->bh);
4991 5030
4992 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5031 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
4993 err = ext4_do_update_inode(handle, inode, iloc); 5032 err = ext4_do_update_inode(handle, inode, iloc, 0);
4994 put_bh(iloc->bh); 5033 put_bh(iloc->bh);
4995 return err; 5034 return err;
4996} 5035}
@@ -5281,12 +5320,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5281 else 5320 else
5282 len = PAGE_CACHE_SIZE; 5321 len = PAGE_CACHE_SIZE;
5283 5322
5323 lock_page(page);
5324 /*
5325 * return if we have all the buffers mapped. This avoids
5326 * the need to call write_begin/write_end, which does a
5327 * journal_start/journal_stop and can block for a
5328 * long time
5329 */
5284 if (page_has_buffers(page)) { 5330 if (page_has_buffers(page)) {
5285 /* return if we have all the buffers mapped */
5286 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5331 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5287 ext4_bh_unmapped)) 5332 ext4_bh_unmapped)) {
5333 unlock_page(page);
5288 goto out_unlock; 5334 goto out_unlock;
5335 }
5289 } 5336 }
5337 unlock_page(page);
5290 /* 5338 /*
5291 * OK, we need to fill the hole... Do write_begin write_end 5339 * OK, we need to fill the hole... Do write_begin write_end
5292 * to do block allocation/reservation. We are not holding 5340 * to do block allocation/reservation. We are not holding
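
The final inode.c hunk brackets the "are all buffers already mapped?" probe with lock_page()/unlock_page(): without the page lock, truncate or writeback can change the buffer list between the check and the early return. A hypothetical helper (not in the patch; walk_page_buffers() and ext4_bh_unmapped() are the real symbols) expressing the invariant:

	static int page_fully_mapped(struct page *page, unsigned len)
	{
		int mapped = 0;

		lock_page(page);	/* buffers are stable only under the lock */
		if (page_has_buffers(page))
			mapped = !walk_page_buffers(NULL, page_buffers(page),
						    0, len, NULL,
						    ext4_bh_unmapped);
		unlock_page(page);
		return mapped;	/* 1 => write_begin/write_end can be skipped */
	}
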
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..c1cdf613e725 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
243 me.donor_start, me.len, &me.moved_len); 243 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp); 244 fput(donor_filp);
245 245
246 if (!err) 246 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
247 if (copy_to_user((struct move_extent *)arg, 247 return -EFAULT;
248 &me, sizeof(me))) 248
249 return -EFAULT;
250 return err; 249 return err;
251 } 250 }
252 251
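
The ioctl fix makes EXT4_IOC_MOVE_EXT report partial progress: me.moved_len is now copied back even when the move fails midway, instead of only on success. A sketch of the userspace side (struct move_extent and EXT4_IOC_MOVE_EXT come from this tree's ext4 headers; the wrapper function is hypothetical):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>

	static int report_move(int orig_fd, struct move_extent *me)
	{
		int err = ioctl(orig_fd, EXT4_IOC_MOVE_EXT, me);

		if (err < 0)
			/* thanks to the unconditional copy_to_user(), the
			 * caller still learns how far the move got */
			fprintf(stderr, "move failed after %llu blocks: %s\n",
				(unsigned long long) me->moved_len,
				strerror(errno));
		return err;
	}
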
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..e9c61896d605 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h>
25#include <trace/events/ext4.h> 26#include <trace/events/ext4.h>
26 27
27/* 28/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
622 623
623/* FIXME!! need more doc */ 624/* FIXME!! need more doc */
624static void ext4_mb_mark_free_simple(struct super_block *sb, 625static void ext4_mb_mark_free_simple(struct super_block *sb,
625 void *buddy, unsigned first, int len, 626 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
626 struct ext4_group_info *grp) 627 struct ext4_group_info *grp)
627{ 628{
628 struct ext4_sb_info *sbi = EXT4_SB(sb); 629 struct ext4_sb_info *sbi = EXT4_SB(sb);
629 unsigned short min; 630 ext4_grpblk_t min;
630 unsigned short max; 631 ext4_grpblk_t max;
631 unsigned short chunk; 632 ext4_grpblk_t chunk;
632 unsigned short border; 633 unsigned short border;
633 634
634 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 635 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
662 void *buddy, void *bitmap, ext4_group_t group) 663 void *buddy, void *bitmap, ext4_group_t group)
663{ 664{
664 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 665 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
665 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); 666 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
666 unsigned short i = 0; 667 ext4_grpblk_t i = 0;
667 unsigned short first; 668 ext4_grpblk_t first;
668 unsigned short len; 669 ext4_grpblk_t len;
669 unsigned free = 0; 670 unsigned free = 0;
670 unsigned fragments = 0; 671 unsigned fragments = 0;
671 unsigned long long period = get_cycles(); 672 unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
743 char *data; 744 char *data;
744 char *bitmap; 745 char *bitmap;
745 746
746 mb_debug("init page %lu\n", page->index); 747 mb_debug(1, "init page %lu\n", page->index);
747 748
748 inode = page->mapping->host; 749 inode = page->mapping->host;
749 sb = inode->i_sb; 750 sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
822 set_bitmap_uptodate(bh[i]); 823 set_bitmap_uptodate(bh[i]);
823 bh[i]->b_end_io = end_buffer_read_sync; 824 bh[i]->b_end_io = end_buffer_read_sync;
824 submit_bh(READ, bh[i]); 825 submit_bh(READ, bh[i]);
825 mb_debug("read bitmap for group %u\n", first_group + i); 826 mb_debug(1, "read bitmap for group %u\n", first_group + i);
826 } 827 }
827 828
828 /* wait for I/O completion */ 829 /* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 if ((first_block + i) & 1) { 863 if ((first_block + i) & 1) {
863 /* this is block of buddy */ 864 /* this is block of buddy */
864 BUG_ON(incore == NULL); 865 BUG_ON(incore == NULL);
865 mb_debug("put buddy for group %u in page %lu/%x\n", 866 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
866 group, page->index, i * blocksize); 867 group, page->index, i * blocksize);
867 grinfo = ext4_get_group_info(sb, group); 868 grinfo = ext4_get_group_info(sb, group);
868 grinfo->bb_fragments = 0; 869 grinfo->bb_fragments = 0;
869 memset(grinfo->bb_counters, 0, 870 memset(grinfo->bb_counters, 0,
870 sizeof(unsigned short)*(sb->s_blocksize_bits+2)); 871 sizeof(*grinfo->bb_counters) *
872 (sb->s_blocksize_bits+2));
871 /* 873 /*
872 * incore got set to the group block bitmap below 874 * incore got set to the group block bitmap below
873 */ 875 */
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
878 } else { 880 } else {
879 /* this is block of bitmap */ 881 /* this is block of bitmap */
880 BUG_ON(incore != NULL); 882 BUG_ON(incore != NULL);
881 mb_debug("put bitmap for group %u in page %lu/%x\n", 883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
882 group, page->index, i * blocksize); 884 group, page->index, i * blocksize);
883 885
884 /* see comments in ext4_mb_put_pa() */ 886 /* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
908 return err; 910 return err;
909} 911}
910 912
913static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{
916
917 int ret = 0;
918 void *bitmap;
919 int blocks_per_page;
920 int block, pnum, poff;
921 int num_grp_locked = 0;
922 struct ext4_group_info *this_grp;
923 struct ext4_sb_info *sbi = EXT4_SB(sb);
924 struct inode *inode = sbi->s_buddy_cache;
925 struct page *page = NULL, *bitmap_page = NULL;
926
927 mb_debug(1, "init group %u\n", group);
928 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
929 this_grp = ext4_get_group_info(sb, group);
930 /*
931 * This ensures that we don't reinit the buddy cache
932 * page which maps to the group from which we are already
933 * allocating. If we are looking at the buddy cache we would
934 * have taken a reference using ext4_mb_load_buddy and that
935 * would have taken the alloc_sem lock.
936 */
937 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
938 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
939 /*
940 * somebody initialized the group
941 * return without doing anything
942 */
943 ret = 0;
944 goto err;
945 }
946 /*
947 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks.
949 * So for each group we need two blocks.
950 */
951 block = group * 2;
952 pnum = block / blocks_per_page;
953 poff = block % blocks_per_page;
954 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
955 if (page) {
956 BUG_ON(page->mapping != inode->i_mapping);
957 ret = ext4_mb_init_cache(page, NULL);
958 if (ret) {
959 unlock_page(page);
960 goto err;
961 }
962 unlock_page(page);
963 }
964 if (page == NULL || !PageUptodate(page)) {
965 ret = -EIO;
966 goto err;
967 }
968 mark_page_accessed(page);
969 bitmap_page = page;
970 bitmap = page_address(page) + (poff * sb->s_blocksize);
971
972 /* init buddy cache */
973 block++;
974 pnum = block / blocks_per_page;
975 poff = block % blocks_per_page;
976 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
977 if (page == bitmap_page) {
978 /*
979 * If both the bitmap and buddy are in
980 * the same page we don't need to force
981 * init the buddy
982 */
983 unlock_page(page);
984 } else if (page) {
985 BUG_ON(page->mapping != inode->i_mapping);
986 ret = ext4_mb_init_cache(page, bitmap);
987 if (ret) {
988 unlock_page(page);
989 goto err;
990 }
991 unlock_page(page);
992 }
993 if (page == NULL || !PageUptodate(page)) {
994 ret = -EIO;
995 goto err;
996 }
997 mark_page_accessed(page);
998err:
999 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1000 if (bitmap_page)
1001 page_cache_release(bitmap_page);
1002 if (page)
1003 page_cache_release(page);
1004 return ret;
1005}
1006
911static noinline_for_stack int 1007static noinline_for_stack int
912ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
913 struct ext4_buddy *e4b) 1009 struct ext4_buddy *e4b)
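
ext4_mb_init_group() is hoisted above ext4_mb_load_buddy() here so the loader can call it (the old copy is deleted further down). The addressing it relies on: the buddy-cache inode stores, per group, the block bitmap and the buddy bitmap in consecutive blocks, hence block = group * 2. A worked userspace example of the pnum/poff arithmetic:

	#include <stdio.h>

	int main(void)
	{
		unsigned page_size = 4096, blocksize = 4096;
		unsigned blocks_per_page = page_size / blocksize;	/* 1 */
		unsigned group = 7;
		unsigned block = group * 2;		/* bitmap block: 14 */

		printf("bitmap: page %u, offset %u\n",
		       block / blocks_per_page, block % blocks_per_page);
		block++;				/* buddy block: 15 */
		printf("buddy:  page %u, offset %u\n",
		       block / blocks_per_page, block % blocks_per_page);
		/* with 1K blocks, blocks_per_page would be 4 and both blocks
		 * can share one page -- which is why the code above checks
		 * page == bitmap_page before re-initializing */
		return 0;
	}
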
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
922 struct ext4_sb_info *sbi = EXT4_SB(sb); 1018 struct ext4_sb_info *sbi = EXT4_SB(sb);
923 struct inode *inode = sbi->s_buddy_cache; 1019 struct inode *inode = sbi->s_buddy_cache;
924 1020
925 mb_debug("load group %u\n", group); 1021 mb_debug(1, "load group %u\n", group);
926 1022
927 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1023 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
928 grp = ext4_get_group_info(sb, group); 1024 grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
941 * groups mapped by the page is blocked 1037 * groups mapped by the page is blocked
942 * till we are done with allocation 1038 * till we are done with allocation
943 */ 1039 */
1040repeat_load_buddy:
944 down_read(e4b->alloc_semp); 1041 down_read(e4b->alloc_semp);
945 1042
1043 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1044 /* we need to check for group need init flag
1045 * with alloc_semp held so that we can be sure
1046 * that new blocks didn't get added to the group
1047 * when we are loading the buddy cache
1048 */
1049 up_read(e4b->alloc_semp);
1050 /*
1051 * we need full data about the group
1052 * to make a good selection
1053 */
1054 ret = ext4_mb_init_group(sb, group);
1055 if (ret)
1056 return ret;
1057 goto repeat_load_buddy;
1058 }
1059
946 /* 1060 /*
947 * the buddy cache inode stores the block bitmap 1061 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks. 1062 * and buddy information in consecutive blocks.
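
The repeat_load_buddy path added above is double-checked initialization: EXT4_MB_GRP_NEED_INIT may only be trusted while alloc_semp is held, and ext4_mb_init_group() needs the semaphore for write, so the reader drops it, initializes, and re-takes it to re-check. The same shape with hypothetical helpers standing in for the real ones (needs_init() for the flag test, init_group() for ext4_mb_init_group()):

	static int load_with_lazy_init(struct super_block *sb,
				       ext4_group_t group,
				       struct rw_semaphore *sem)
	{
	repeat:
		down_read(sem);
		if (unlikely(needs_init(sb, group))) {	/* hypothetical */
			up_read(sem);		/* init needs it for write */
			if (init_group(sb, group))	/* hypothetical */
				return -EIO;
			goto repeat;	/* flag must be re-read under the lock */
		}
		/* ... use the buddy cache; the caller drops sem on release ... */
		return 0;
	}
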
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1360 ac->alloc_semp = e4b->alloc_semp; 1474 ac->alloc_semp = e4b->alloc_semp;
1361 e4b->alloc_semp = NULL; 1475 e4b->alloc_semp = NULL;
1362 /* store last allocated for subsequent stream allocation */ 1476 /* store last allocated for subsequent stream allocation */
1363 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1477 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1364 spin_lock(&sbi->s_md_lock); 1478 spin_lock(&sbi->s_md_lock);
1365 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1479 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1366 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1480 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1837 1951
1838} 1952}
1839 1953
1840static noinline_for_stack
1841int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1842{
1843
1844 int ret;
1845 void *bitmap;
1846 int blocks_per_page;
1847 int block, pnum, poff;
1848 int num_grp_locked = 0;
1849 struct ext4_group_info *this_grp;
1850 struct ext4_sb_info *sbi = EXT4_SB(sb);
1851 struct inode *inode = sbi->s_buddy_cache;
1852 struct page *page = NULL, *bitmap_page = NULL;
1853
1854 mb_debug("init group %lu\n", group);
1855 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1856 this_grp = ext4_get_group_info(sb, group);
1857 /*
1858 * This ensures we don't add group
1859 * to this buddy cache via resize
1860 */
1861 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1862 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1863 /*
1864 * somebody initialized the group
1865 * return without doing anything
1866 */
1867 ret = 0;
1868 goto err;
1869 }
1870 /*
1871 * the buddy cache inode stores the block bitmap
1872 * and buddy information in consecutive blocks.
1873 * So for each group we need two blocks.
1874 */
1875 block = group * 2;
1876 pnum = block / blocks_per_page;
1877 poff = block % blocks_per_page;
1878 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1879 if (page) {
1880 BUG_ON(page->mapping != inode->i_mapping);
1881 ret = ext4_mb_init_cache(page, NULL);
1882 if (ret) {
1883 unlock_page(page);
1884 goto err;
1885 }
1886 unlock_page(page);
1887 }
1888 if (page == NULL || !PageUptodate(page)) {
1889 ret = -EIO;
1890 goto err;
1891 }
1892 mark_page_accessed(page);
1893 bitmap_page = page;
1894 bitmap = page_address(page) + (poff * sb->s_blocksize);
1895
1896 /* init buddy cache */
1897 block++;
1898 pnum = block / blocks_per_page;
1899 poff = block % blocks_per_page;
1900 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1901 if (page == bitmap_page) {
1902 /*
1903 * If both the bitmap and buddy are in
1904 * the same page we don't need to force
1905 * init the buddy
1906 */
1907 unlock_page(page);
1908 } else if (page) {
1909 BUG_ON(page->mapping != inode->i_mapping);
1910 ret = ext4_mb_init_cache(page, bitmap);
1911 if (ret) {
1912 unlock_page(page);
1913 goto err;
1914 }
1915 unlock_page(page);
1916 }
1917 if (page == NULL || !PageUptodate(page)) {
1918 ret = -EIO;
1919 goto err;
1920 }
1921 mark_page_accessed(page);
1922err:
1923 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1924 if (bitmap_page)
1925 page_cache_release(bitmap_page);
1926 if (page)
1927 page_cache_release(page);
1928 return ret;
1929}
1930
1931static noinline_for_stack int 1954static noinline_for_stack int
1932ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1955ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1933{ 1956{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1938 struct ext4_sb_info *sbi; 1961 struct ext4_sb_info *sbi;
1939 struct super_block *sb; 1962 struct super_block *sb;
1940 struct ext4_buddy e4b; 1963 struct ext4_buddy e4b;
1941 loff_t size, isize;
1942 1964
1943 sb = ac->ac_sb; 1965 sb = ac->ac_sb;
1944 sbi = EXT4_SB(sb); 1966 sbi = EXT4_SB(sb);
1945 ngroups = ext4_get_groups_count(sb); 1967 ngroups = ext4_get_groups_count(sb);
1968 /* non-extent files are limited to low blocks/groups */
1969 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
1970 ngroups = sbi->s_blockfile_groups;
1971
1946 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1972 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1947 1973
1948 /* first, try the goal */ 1974 /* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1974 } 2000 }
1975 2001
1976 bsbits = ac->ac_sb->s_blocksize_bits; 2002 bsbits = ac->ac_sb->s_blocksize_bits;
1977 /* if stream allocation is enabled, use global goal */
1978 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
1979 isize = i_size_read(ac->ac_inode) >> bsbits;
1980 if (size < isize)
1981 size = isize;
1982 2003
1983 if (size < sbi->s_mb_stream_request && 2004 /* if stream allocation is enabled, use global goal */
1984 (ac->ac_flags & EXT4_MB_HINT_DATA)) { 2005 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1985 /* TBD: may be a hot spot */ 2006 /* TBD: may be a hot spot */
1986 spin_lock(&sbi->s_md_lock); 2007 spin_lock(&sbi->s_md_lock);
1987 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2008 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
1988 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2009 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1989 spin_unlock(&sbi->s_md_lock); 2010 spin_unlock(&sbi->s_md_lock);
1990 } 2011 }
2012
1991 /* Let's just scan groups to find more or less suitable blocks */ 2013 /* Let's just scan groups to find more or less suitable blocks */
1992 cr = ac->ac_2order ? 0 : 1; 2014 cr = ac->ac_2order ? 0 : 1;
1993 /* 2015 /*
@@ -2015,27 +2037,6 @@ repeat:
2015 if (grp->bb_free == 0) 2037 if (grp->bb_free == 0)
2016 continue; 2038 continue;
2017 2039
2018 /*
2019 * if the group is already init we check whether it is
2020 * a good group and if not we don't load the buddy
2021 */
2022 if (EXT4_MB_GRP_NEED_INIT(grp)) {
2023 /*
2024 * we need full data about the group
2025 * to make a good selection
2026 */
2027 err = ext4_mb_init_group(sb, group);
2028 if (err)
2029 goto out;
2030 }
2031
2032 /*
2033 * If the particular group doesn't satisfy our
2034 * criteria we continue with the next group
2035 */
2036 if (!ext4_mb_good_group(ac, group, cr))
2037 continue;
2038
2039 err = ext4_mb_load_buddy(sb, group, &e4b); 2040 err = ext4_mb_load_buddy(sb, group, &e4b);
2040 if (err) 2041 if (err)
2041 goto out; 2042 goto out;
@@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2156 2157
2157 if (v == SEQ_START_TOKEN) { 2158 if (v == SEQ_START_TOKEN) {
2158 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " 2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2159 "%-5s %-2s %-5s %-5s %-5s %-6s\n", 2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2160 "pid", "inode", "original", "goal", "result", "found", 2161 "pid", "inode", "original", "goal", "result", "found",
2161 "grps", "cr", "flags", "merge", "tail", "broken"); 2162 "grps", "cr", "flags", "merge", "tail", "broken");
2162 return 0; 2163 return 0;
@@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2164 2165
2165 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2166 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2167 "%-5u %-5s %-5u %-6u\n"; 2168 "0x%04x %-5s %-5u %-6u\n";
2168 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, 2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2169 hs->result.fe_start, hs->result.fe_len, 2170 hs->result.fe_start, hs->result.fe_len,
2170 hs->result.fe_logical); 2171 hs->result.fe_logical);
@@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2205{ 2206{
2206} 2207}
2207 2208
2208static struct seq_operations ext4_mb_seq_history_ops = { 2209static const struct seq_operations ext4_mb_seq_history_ops = {
2209 .start = ext4_mb_seq_history_start, 2210 .start = ext4_mb_seq_history_start,
2210 .next = ext4_mb_seq_history_next, 2211 .next = ext4_mb_seq_history_next,
2211 .stop = ext4_mb_seq_history_stop, 2212 .stop = ext4_mb_seq_history_stop,
@@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
2287 return count; 2288 return count;
2288} 2289}
2289 2290
2290static struct file_operations ext4_mb_seq_history_fops = { 2291static const struct file_operations ext4_mb_seq_history_fops = {
2291 .owner = THIS_MODULE, 2292 .owner = THIS_MODULE,
2292 .open = ext4_mb_seq_history_open, 2293 .open = ext4_mb_seq_history_open,
2293 .read = seq_read, 2294 .read = seq_read,
@@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2328 struct ext4_buddy e4b; 2329 struct ext4_buddy e4b;
2329 struct sg { 2330 struct sg {
2330 struct ext4_group_info info; 2331 struct ext4_group_info info;
2331 unsigned short counters[16]; 2332 ext4_grpblk_t counters[16];
2332 } sg; 2333 } sg;
2333 2334
2334 group--; 2335 group--;
@@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2366{ 2367{
2367} 2368}
2368 2369
2369static struct seq_operations ext4_mb_seq_groups_ops = { 2370static const struct seq_operations ext4_mb_seq_groups_ops = {
2370 .start = ext4_mb_seq_groups_start, 2371 .start = ext4_mb_seq_groups_start,
2371 .next = ext4_mb_seq_groups_next, 2372 .next = ext4_mb_seq_groups_next,
2372 .stop = ext4_mb_seq_groups_stop, 2373 .stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2387 2388
2388} 2389}
2389 2390
2390static struct file_operations ext4_mb_seq_groups_fops = { 2391static const struct file_operations ext4_mb_seq_groups_fops = {
2391 .owner = THIS_MODULE, 2392 .owner = THIS_MODULE,
2392 .open = ext4_mb_seq_groups_open, 2393 .open = ext4_mb_seq_groups_open,
2393 .read = seq_read, 2394 .read = seq_read,
@@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2532 2533
2533 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2534 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2534 init_rwsem(&meta_group_info[i]->alloc_sem); 2535 init_rwsem(&meta_group_info[i]->alloc_sem);
2535 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2536 meta_group_info[i]->bb_free_root.rb_node = NULL;
2536 2537
2537#ifdef DOUBLE_CHECK 2538#ifdef DOUBLE_CHECK
2538 { 2539 {
@@ -2558,26 +2559,15 @@ exit_meta_group_info:
2558 return -ENOMEM; 2559 return -ENOMEM;
2559} /* ext4_mb_add_groupinfo */ 2560} /* ext4_mb_add_groupinfo */
2560 2561
2561/*
2562 * Update an existing group.
2563 * This function is used for online resize
2564 */
2565void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2566{
2567 grp->bb_free += add;
2568}
2569
2570static int ext4_mb_init_backend(struct super_block *sb) 2562static int ext4_mb_init_backend(struct super_block *sb)
2571{ 2563{
2572 ext4_group_t ngroups = ext4_get_groups_count(sb); 2564 ext4_group_t ngroups = ext4_get_groups_count(sb);
2573 ext4_group_t i; 2565 ext4_group_t i;
2574 int metalen;
2575 struct ext4_sb_info *sbi = EXT4_SB(sb); 2566 struct ext4_sb_info *sbi = EXT4_SB(sb);
2576 struct ext4_super_block *es = sbi->s_es; 2567 struct ext4_super_block *es = sbi->s_es;
2577 int num_meta_group_infos; 2568 int num_meta_group_infos;
2578 int num_meta_group_infos_max; 2569 int num_meta_group_infos_max;
2579 int array_size; 2570 int array_size;
2580 struct ext4_group_info **meta_group_info;
2581 struct ext4_group_desc *desc; 2571 struct ext4_group_desc *desc;
2582 2572
2583 /* This is the number of blocks used by GDT */ 2573 /* This is the number of blocks used by GDT */
@@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
2622 goto err_freesgi; 2612 goto err_freesgi;
2623 } 2613 }
2624 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2614 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2625
2626 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2627 for (i = 0; i < num_meta_group_infos; i++) {
2628 if ((i + 1) == num_meta_group_infos)
2629 metalen = sizeof(*meta_group_info) *
2630 (ngroups -
2631 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2632 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2633 if (meta_group_info == NULL) {
2634 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2635 "buddy group\n");
2636 goto err_freemeta;
2637 }
2638 sbi->s_group_info[i] = meta_group_info;
2639 }
2640
2641 for (i = 0; i < ngroups; i++) { 2615 for (i = 0; i < ngroups; i++) {
2642 desc = ext4_get_group_desc(sb, i, NULL); 2616 desc = ext4_get_group_desc(sb, i, NULL);
2643 if (desc == NULL) { 2617 if (desc == NULL) {
@@ -2655,7 +2629,6 @@ err_freebuddy:
2655 while (i-- > 0) 2629 while (i-- > 0)
2656 kfree(ext4_get_group_info(sb, i)); 2630 kfree(ext4_get_group_info(sb, i));
2657 i = num_meta_group_infos; 2631 i = num_meta_group_infos;
2658err_freemeta:
2659 while (i-- > 0) 2632 while (i-- > 0)
2660 kfree(sbi->s_group_info[i]); 2633 kfree(sbi->s_group_info[i]);
2661 iput(sbi->s_buddy_cache); 2634 iput(sbi->s_buddy_cache);
@@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2672 unsigned max; 2645 unsigned max;
2673 int ret; 2646 int ret;
2674 2647
2675 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2648 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2676 2649
2677 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2650 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2678 if (sbi->s_mb_offsets == NULL) { 2651 if (sbi->s_mb_offsets == NULL) {
2679 return -ENOMEM; 2652 return -ENOMEM;
2680 } 2653 }
2681 2654
2682 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2655 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2683 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2656 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2684 if (sbi->s_mb_maxs == NULL) { 2657 if (sbi->s_mb_maxs == NULL) {
2685 kfree(sbi->s_mb_offsets); 2658 kfree(sbi->s_mb_offsets);
@@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2758 kmem_cache_free(ext4_pspace_cachep, pa); 2731 kmem_cache_free(ext4_pspace_cachep, pa);
2759 } 2732 }
2760 if (count) 2733 if (count)
2761 mb_debug("mballoc: %u PAs left\n", count); 2734 mb_debug(1, "mballoc: %u PAs left\n", count);
2762 2735
2763} 2736}
2764 2737
@@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2839 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2812 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2840 entry = list_entry(l, struct ext4_free_data, list); 2813 entry = list_entry(l, struct ext4_free_data, list);
2841 2814
2842 mb_debug("gonna free %u blocks in group %u (0x%p):", 2815 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2843 entry->count, entry->group, entry); 2816 entry->count, entry->group, entry);
2844 2817
2845 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2818 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2874 ext4_mb_release_desc(&e4b); 2847 ext4_mb_release_desc(&e4b);
2875 } 2848 }
2876 2849
2877 mb_debug("freed %u blocks in %u structures\n", count, count2); 2850 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2851}
2852
2853#ifdef CONFIG_EXT4_DEBUG
2854u8 mb_enable_debug __read_mostly;
2855
2856static struct dentry *debugfs_dir;
2857static struct dentry *debugfs_debug;
2858
2859static void __init ext4_create_debugfs_entry(void)
2860{
2861 debugfs_dir = debugfs_create_dir("ext4", NULL);
2862 if (debugfs_dir)
2863 debugfs_debug = debugfs_create_u8("mballoc-debug",
2864 S_IRUGO | S_IWUSR,
2865 debugfs_dir,
2866 &mb_enable_debug);
2867}
2868
2869static void ext4_remove_debugfs_entry(void)
2870{
2871 debugfs_remove(debugfs_debug);
2872 debugfs_remove(debugfs_dir);
2878} 2873}
2879 2874
2875#else
2876
2877static void __init ext4_create_debugfs_entry(void)
2878{
2879}
2880
2881static void ext4_remove_debugfs_entry(void)
2882{
2883}
2884
2885#endif
2886
2880int __init init_ext4_mballoc(void) 2887int __init init_ext4_mballoc(void)
2881{ 2888{
2882 ext4_pspace_cachep = 2889 ext4_pspace_cachep =
@@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void)
2904 kmem_cache_destroy(ext4_ac_cachep); 2911 kmem_cache_destroy(ext4_ac_cachep);
2905 return -ENOMEM; 2912 return -ENOMEM;
2906 } 2913 }
2914 ext4_create_debugfs_entry();
2907 return 0; 2915 return 0;
2908} 2916}
2909 2917
@@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void)
2917 kmem_cache_destroy(ext4_pspace_cachep); 2925 kmem_cache_destroy(ext4_pspace_cachep);
2918 kmem_cache_destroy(ext4_ac_cachep); 2926 kmem_cache_destroy(ext4_ac_cachep);
2919 kmem_cache_destroy(ext4_free_ext_cachep); 2927 kmem_cache_destroy(ext4_free_ext_cachep);
2928 ext4_remove_debugfs_entry();
2920} 2929}
2921 2930
2922 2931
@@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3061 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; 3070 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
3062 else 3071 else
3063 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 3072 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3064 mb_debug("#%u: goal %u blocks for locality group\n", 3073 mb_debug(1, "#%u: goal %u blocks for locality group\n",
3065 current->pid, ac->ac_g_ex.fe_len); 3074 current->pid, ac->ac_g_ex.fe_len);
3066} 3075}
3067 3076
@@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3180 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3189 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3181 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3190 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3182 3191
3183 /* skip PA normalized request doesn't overlap with */ 3192 /* skip PAs this normalized request doesn't overlap with */
3184 if (pa->pa_lstart >= end) { 3193 if (pa->pa_lstart >= end || pa_end <= start) {
3185 spin_unlock(&pa->pa_lock);
3186 continue;
3187 }
3188 if (pa_end <= start) {
3189 spin_unlock(&pa->pa_lock); 3194 spin_unlock(&pa->pa_lock);
3190 continue; 3195 continue;
3191 } 3196 }
3192 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3197 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3193 3198
3199 /* adjust start or end to be adjacent to this pa */
3194 if (pa_end <= ac->ac_o_ex.fe_logical) { 3200 if (pa_end <= ac->ac_o_ex.fe_logical) {
3195 BUG_ON(pa_end < start); 3201 BUG_ON(pa_end < start);
3196 start = pa_end; 3202 start = pa_end;
3197 } 3203 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3198
3199 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3200 BUG_ON(pa->pa_lstart > end); 3204 BUG_ON(pa->pa_lstart > end);
3201 end = pa->pa_lstart; 3205 end = pa->pa_lstart;
3202 } 3206 }
@@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3251 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3255 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3252 } 3256 }
3253 3257
3254 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, 3258 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3255 (unsigned) orig_size, (unsigned) start); 3259 (unsigned) orig_size, (unsigned) start);
3256} 3260}
3257 3261
@@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3300 BUG_ON(pa->pa_free < len); 3304 BUG_ON(pa->pa_free < len);
3301 pa->pa_free -= len; 3305 pa->pa_free -= len;
3302 3306
3303 mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); 3307 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3304} 3308}
3305 3309
3306/* 3310/*
@@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3324 * in on-disk bitmap -- see ext4_mb_release_context() 3328 * in on-disk bitmap -- see ext4_mb_release_context()
3325 * Other CPUs are prevented from allocating from this pa by lg_mutex 3329 * Other CPUs are prevented from allocating from this pa by lg_mutex
3326 */ 3330 */
3327 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3331 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3328} 3332}
3329 3333
3330/* 3334/*
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3382 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3386 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3383 continue; 3387 continue;
3384 3388
3389 /* non-extent files can't have physical blocks past 2^32 */
3390 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
3391 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3392 continue;
3393
3385 /* found preallocated blocks, use them */ 3394 /* found preallocated blocks, use them */
3386 spin_lock(&pa->pa_lock); 3395 spin_lock(&pa->pa_lock);
3387 if (pa->pa_deleted == 0 && pa->pa_free) { 3396 if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3503 preallocated += len; 3512 preallocated += len;
3504 count++; 3513 count++;
3505 } 3514 }
3506 mb_debug("prellocated %u for group %u\n", preallocated, group); 3515 mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3507} 3516}
3508 3517
3509static void ext4_mb_pa_callback(struct rcu_head *head) 3518static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3638 pa->pa_deleted = 0; 3647 pa->pa_deleted = 0;
3639 pa->pa_type = MB_INODE_PA; 3648 pa->pa_type = MB_INODE_PA;
3640 3649
3641 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3650 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3642 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3651 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3643 trace_ext4_mb_new_inode_pa(ac, pa); 3652 trace_ext4_mb_new_inode_pa(ac, pa);
3644 3653
@@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3698 pa->pa_deleted = 0; 3707 pa->pa_deleted = 0;
3699 pa->pa_type = MB_GROUP_PA; 3708 pa->pa_type = MB_GROUP_PA;
3700 3709
3701 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3710 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3711 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3703 trace_ext4_mb_new_group_pa(ac, pa); 3712 trace_ext4_mb_new_group_pa(ac, pa);
3704 3713
@@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3777 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3786 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3778 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3787 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3779 le32_to_cpu(sbi->s_es->s_first_data_block); 3788 le32_to_cpu(sbi->s_es->s_first_data_block);
3780 mb_debug(" free preallocated %u/%u in group %u\n", 3789 mb_debug(1, " free preallocated %u/%u in group %u\n",
3781 (unsigned) start, (unsigned) next - bit, 3790 (unsigned) start, (unsigned) next - bit,
3782 (unsigned) group); 3791 (unsigned) group);
3783 free += next - bit; 3792 free += next - bit;
@@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3868 int busy = 0; 3877 int busy = 0;
3869 int free = 0; 3878 int free = 0;
3870 3879
3871 mb_debug("discard preallocation for group %u\n", group); 3880 mb_debug(1, "discard preallocation for group %u\n", group);
3872 3881
3873 if (list_empty(&grp->bb_prealloc_list)) 3882 if (list_empty(&grp->bb_prealloc_list))
3874 return 0; 3883 return 0;
@@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode)
3992 return; 4001 return;
3993 } 4002 }
3994 4003
3995 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4004 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3996 trace_ext4_discard_preallocations(inode); 4005 trace_ext4_discard_preallocations(inode);
3997 4006
3998 INIT_LIST_HEAD(&list); 4007 INIT_LIST_HEAD(&list);
@@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
4097{ 4106{
4098 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); 4107 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
4099} 4108}
4100#ifdef MB_DEBUG 4109#ifdef CONFIG_EXT4_DEBUG
4101static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4110static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4102{ 4111{
4103 struct super_block *sb = ac->ac_sb; 4112 struct super_block *sb = ac->ac_sb;
@@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4139 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4148 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4140 NULL, &start); 4149 NULL, &start);
4141 spin_unlock(&pa->pa_lock); 4150 spin_unlock(&pa->pa_lock);
4142 printk(KERN_ERR "PA:%lu:%d:%u \n", i, 4151 printk(KERN_ERR "PA:%u:%d:%u \n", i,
4143 start, pa->pa_len); 4152 start, pa->pa_len);
4144 } 4153 }
4145 ext4_unlock_group(sb, i); 4154 ext4_unlock_group(sb, i);
4146 4155
4147 if (grp->bb_free == 0) 4156 if (grp->bb_free == 0)
4148 continue; 4157 continue;
4149 printk(KERN_ERR "%lu: %d/%d \n", 4158 printk(KERN_ERR "%u: %d/%d \n",
4150 i, grp->bb_free, grp->bb_fragments); 4159 i, grp->bb_free, grp->bb_fragments);
4151 } 4160 }
4152 printk(KERN_ERR "\n"); 4161 printk(KERN_ERR "\n");
@@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4174 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4183 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4175 return; 4184 return;
4176 4185
4186 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4187 return;
4188
4177 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4178 isize = i_size_read(ac->ac_inode) >> bsbits; 4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits;
4179 size = max(size, isize); 4192 size = max(size, isize);
4180 4193
4181 /* don't use group allocation for large files */ 4194 if ((size == isize) &&
4182 if (size >= sbi->s_mb_stream_request) 4195 !ext4_fs_is_busy(sbi) &&
4196 (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4197 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4183 return; 4198 return;
4199 }
4184 4200
4185 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4201 /* don't use group allocation for large files */
4202 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4186 return; 4204 return;
4205 }
4187 4206
4188 BUG_ON(ac->ac_lg != NULL); 4207 BUG_ON(ac->ac_lg != NULL);
4189 /* 4208 /*
@@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4246 * locality group. this is a policy, actually */ 4265 * locality group. this is a policy, actually */
4247 ext4_mb_group_or_file(ac); 4266 ext4_mb_group_or_file(ac);
4248 4267
4249 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4268 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4250 "left: %u/%u, right %u/%u to %swritable\n", 4269 "left: %u/%u, right %u/%u to %swritable\n",
4251 (unsigned) ar->len, (unsigned) ar->logical, 4270 (unsigned) ar->len, (unsigned) ar->logical,
4252 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4271 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4268 struct ext4_prealloc_space *pa, *tmp; 4287 struct ext4_prealloc_space *pa, *tmp;
4269 struct ext4_allocation_context *ac; 4288 struct ext4_allocation_context *ac;
4270 4289
4271 mb_debug("discard locality group preallocation\n"); 4290 mb_debug(1, "discard locality group preallocation\n");
4272 4291
4273 INIT_LIST_HEAD(&discard_list); 4292 INIT_LIST_HEAD(&discard_list);
4274 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4293 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..188d3d709b24 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@
37 37
38/* 38/*
39 */ 39 */
40#define MB_DEBUG__ 40#ifdef CONFIG_EXT4_DEBUG
41#ifdef MB_DEBUG 41extern u8 mb_enable_debug;
42#define mb_debug(fmt, a...) printk(fmt, ##a) 42
43#define mb_debug(n, fmt, a...) \
44 do { \
45 if ((n) <= mb_enable_debug) { \
46 printk(KERN_DEBUG "(%s, %d): %s: ", \
47 __FILE__, __LINE__, __func__); \
48 printk(fmt, ## a); \
49 } \
50 } while (0)
43#else 51#else
44#define mb_debug(fmt, a...) 52#define mb_debug(n, fmt, a...)
45#endif 53#endif
46 54
47/* 55/*
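
The reworked mb_debug() takes a verbosity level and compares it against the runtime knob mb_enable_debug, so individual messages can be silenced without recompiling; that is why every call site in the mballoc.c hunks above gains a leading level argument of 1. A minimal user-space sketch of the same leveled-logging pattern (all names here are illustrative, not the kernel's):

    #include <stdio.h>

    static int debug_level = 1;   /* runtime knob, standing in for mb_enable_debug */

    /* Print only when the message's level is within the current verbosity.
     * Uses the GNU ##__VA_ARGS__ extension, as the kernel macro does. */
    #define dbg(n, fmt, ...)                                              \
        do {                                                              \
            if ((n) <= debug_level)                                       \
                fprintf(stderr, "(%s, %d): %s: " fmt,                     \
                        __FILE__, __LINE__, __func__, ##__VA_ARGS__);     \
        } while (0)

    int main(void)
    {
        dbg(1, "new inode pa: %u blocks\n", 8u);   /* printed */
        dbg(2, "noisy per-block trace\n");         /* suppressed at level 1 */
        return 0;
    }
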
@@ -128,8 +136,8 @@ struct ext4_prealloc_space {
128 unsigned pa_deleted; 136 unsigned pa_deleted;
129 ext4_fsblk_t pa_pstart; /* phys. block */ 137 ext4_fsblk_t pa_pstart; /* phys. block */
130 ext4_lblk_t pa_lstart; /* log. block */ 138 ext4_lblk_t pa_lstart; /* log. block */
131 unsigned short pa_len; /* len of preallocated chunk */ 139 ext4_grpblk_t pa_len; /* len of preallocated chunk */
132 unsigned short pa_free; /* how many blocks are free */ 140 ext4_grpblk_t pa_free; /* how many blocks are free */
133 unsigned short pa_type; /* pa type. inode or group */ 141 unsigned short pa_type; /* pa type. inode or group */
134 spinlock_t *pa_obj_lock; 142 spinlock_t *pa_obj_lock;
135 struct inode *pa_inode; /* hack, for history only */ 143 struct inode *pa_inode; /* hack, for history only */
@@ -144,7 +152,7 @@ struct ext4_free_extent {
144 ext4_lblk_t fe_logical; 152 ext4_lblk_t fe_logical;
145 ext4_grpblk_t fe_start; 153 ext4_grpblk_t fe_start;
146 ext4_group_t fe_group; 154 ext4_group_t fe_group;
147 int fe_len; 155 ext4_grpblk_t fe_len;
148}; 156};
149 157
150/* 158/*
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..bf519f239ae6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
353 353
354 down_write(&EXT4_I(inode)->i_data_sem); 354 down_write(&EXT4_I(inode)->i_data_sem);
355 /* 355 /*
356 * if EXT4_EXT_MIGRATE is cleared a block allocation 356 * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
357 * happened after we started the migrate. We need to 357 * happened after we started the migrate. We need to
358 * fail the migrate 358 * fail the migrate
359 */ 359 */
360 if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { 360 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
361 retval = -EAGAIN; 361 retval = -EAGAIN;
362 up_write(&EXT4_I(inode)->i_data_sem); 362 up_write(&EXT4_I(inode)->i_data_sem);
363 goto err_out; 363 goto err_out;
364 } else 364 } else
365 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 365 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
366 ~EXT4_EXT_MIGRATE;
367 /* 366 /*
 368 * We have the extent map built with the tmp inode.	 367 * We have the extent map built with the tmp inode.
369 * Now copy the i_data across 368 * Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
 517 * when we add extents we extend the journal	 516 * when we add extents we extend the journal
518 */ 517 */
519 /* 518 /*
520 * Even though we take i_mutex we can still cause block allocation 519 * Even though we take i_mutex we can still cause block
521 * via mmap write to holes. If we have allocated new blocks we fail 520 * allocation via mmap write to holes. If we have allocated
522 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. 521 * new blocks we fail migrate. New block allocation will
523 * The flag is updated with i_data_sem held to prevent racing with 522 * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
524 * block allocation. 523 * with i_data_sem held to prevent racing with block
524 * allocation.
525 */ 525 */
526 down_read((&EXT4_I(inode)->i_data_sem)); 526 down_read((&EXT4_I(inode)->i_data_sem));
527 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; 527 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
528 up_read((&EXT4_I(inode)->i_data_sem)); 528 up_read((&EXT4_I(inode)->i_data_sem));
529 529
530 handle = ext4_journal_start(inode, 1); 530 handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ err_out:
618 tmp_inode->i_nlink = 0; 618 tmp_inode->i_nlink = 0;
619 619
620 ext4_journal_stop(handle); 620 ext4_journal_stop(handle);
621 621 unlock_new_inode(tmp_inode);
622 iput(tmp_inode); 622 iput(tmp_inode);
623 623
624 return retval; 624 return retval;
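
The migrate path now tracks "an allocation happened" in the in-core i_state word instead of the on-disk i_flags: the bit is set under the read side of i_data_sem before copying begins, cleared by the allocator under the write side, and tested again at commit time, failing with -EAGAIN if it has vanished. A hedged user-space sketch of that protocol, with invented names (the real code additionally serializes whole migrations with i_mutex):

    #include <errno.h>
    #include <pthread.h>

    #define STATE_EXT_MIGRATE 0x1u   /* invented stand-in for the in-core bit */

    struct fake_inode {
        pthread_rwlock_t data_sem;   /* plays the role of i_data_sem */
        unsigned int state;          /* plays the role of i_state */
    };

    static void migrate_begin(struct fake_inode *inode)
    {
        /* read side suffices: only the allocator takes the write side */
        pthread_rwlock_rdlock(&inode->data_sem);
        inode->state |= STATE_EXT_MIGRATE;
        pthread_rwlock_unlock(&inode->data_sem);
    }

    static void block_allocated(struct fake_inode *inode)
    {
        pthread_rwlock_wrlock(&inode->data_sem);
        inode->state &= ~STATE_EXT_MIGRATE;   /* invalidate the migration */
        pthread_rwlock_unlock(&inode->data_sem);
    }

    static int migrate_commit(struct fake_inode *inode)
    {
        int ret = 0;

        pthread_rwlock_wrlock(&inode->data_sem);
        if (!(inode->state & STATE_EXT_MIGRATE))
            ret = -EAGAIN;                    /* an allocation raced with us */
        else
            inode->state &= ~STATE_EXT_MIGRATE;
        pthread_rwlock_unlock(&inode->data_sem);
        return ret;
    }

    int main(void)
    {
        struct fake_inode ino = { PTHREAD_RWLOCK_INITIALIZER, 0 };

        migrate_begin(&ino);
        block_allocated(&ino);       /* simulate a racing mmap write */
        return migrate_commit(&ino) == -EAGAIN ? 0 : 1;
    }
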
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..c07a2915e40b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,31 @@
19#include "ext4_extents.h" 19#include "ext4_extents.h"
20#include "ext4.h" 20#include "ext4.h"
21 21
22#define get_ext_path(path, inode, block, ret) \ 22/**
23 do { \ 23 * get_ext_path - Find an extent path for designated logical block number.
24 path = ext4_ext_find_extent(inode, block, path); \ 24 *
25 if (IS_ERR(path)) { \ 25 * @inode: an inode which is searched
26 ret = PTR_ERR(path); \ 26 * @lblock: logical block number to find an extent path
27 path = NULL; \ 27 * @path: pointer to an extent path pointer (for output)
28 } \ 28 *
29 } while (0) 29 * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
30 * on failure.
31 */
32static inline int
33get_ext_path(struct inode *inode, ext4_lblk_t lblock,
34 struct ext4_ext_path **path)
35{
36 int ret = 0;
37
38 *path = ext4_ext_find_extent(inode, lblock, *path);
39 if (IS_ERR(*path)) {
40 ret = PTR_ERR(*path);
41 *path = NULL;
42 } else if ((*path)[ext_depth(inode)].p_ext == NULL)
43 ret = -ENODATA;
44
45 return ret;
46}
30 47
31/** 48/**
32 * copy_extent_status - Copy the extent's initialization status 49 * copy_extent_status - Copy the extent's initialization status
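
get_ext_path() is now a real function with a conventional error return instead of a macro that reported failure through a side variable, and an extent-less path is surfaced as -ENODATA, so callers shrink to the usual err-then-goto shape. A small sketch of the calling convention under placeholder types:

    #include <errno.h>
    #include <stdlib.h>

    struct path { int depth; };   /* placeholder for struct ext4_ext_path */

    /* Returns 0 and sets *path, or a negative errno; *path is NULL on error. */
    static int get_path(unsigned int lblock, struct path **path)
    {
        (void)lblock;
        *path = malloc(sizeof(**path));
        if (!*path)
            return -ENOMEM;
        (*path)->depth = 0;
        return 0;
    }

    int main(void)
    {
        struct path *path = NULL;
        int err;

        err = get_path(42, &path);
        if (err)
            goto out;        /* no separate NULL test of path needed */
        /* ... walk the path ... */
    out:
        free(path);
        return err ? 1 : 0;
    }
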
@@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
113} 130}
114 131
115/** 132/**
133 * mext_check_null_inode - NULL check for two inodes
134 *
135 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
136 */
137static int
138mext_check_null_inode(struct inode *inode1, struct inode *inode2,
139 const char *function)
140{
141 int ret = 0;
142
143 if (inode1 == NULL) {
144 ext4_error(inode2->i_sb, function,
145 "Both inodes should not be NULL: "
146 "inode1 NULL inode2 %lu", inode2->i_ino);
147 ret = -EIO;
148 } else if (inode2 == NULL) {
149 ext4_error(inode1->i_sb, function,
150 "Both inodes should not be NULL: "
151 "inode1 %lu inode2 NULL", inode1->i_ino);
152 ret = -EIO;
153 }
154 return ret;
155}
156
157/**
116 * mext_double_down_read - Acquire two inodes' read semaphore 158 * mext_double_down_read - Acquire two inodes' read semaphore
117 * 159 *
118 * @orig_inode: original inode structure 160 * @orig_inode: original inode structure
@@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
124{ 166{
125 struct inode *first = orig_inode, *second = donor_inode; 167 struct inode *first = orig_inode, *second = donor_inode;
126 168
127 BUG_ON(orig_inode == NULL || donor_inode == NULL);
128
129 /* 169 /*
130 * Use the inode number to provide the stable locking order instead 170 * Use the inode number to provide the stable locking order instead
131 * of its address, because the C language doesn't guarantee you can 171 * of its address, because the C language doesn't guarantee you can
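
The comment above captures why the two semaphores are taken by inode-number order: the inode number gives a stable total order that every code path agrees on, so ABBA deadlocks cannot form, whereas comparing the struct addresses would be undefined for pointers into different objects. A user-space sketch of the idiom, names invented:

    #include <pthread.h>

    struct obj {
        unsigned long id;            /* stable key, like i_ino */
        pthread_mutex_t lock;
    };

    /* Every caller locks the smaller id first, so no ABBA cycle can form. */
    static void double_lock(struct obj *a, struct obj *b)
    {
        struct obj *first = a, *second = b;

        if (a->id > b->id) {
            first = b;
            second = a;
        }
        pthread_mutex_lock(&first->lock);
        if (first != second)
            pthread_mutex_lock(&second->lock);
    }

    static void double_unlock(struct obj *a, struct obj *b)
    {
        pthread_mutex_unlock(&a->lock);
        if (a != b)
            pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
        struct obj x = { 1, PTHREAD_MUTEX_INITIALIZER };
        struct obj y = { 2, PTHREAD_MUTEX_INITIALIZER };

        double_lock(&y, &x);     /* acquires x (id 1) before y (id 2) */
        double_unlock(&y, &x);
        return 0;
    }
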
@@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
152{ 192{
153 struct inode *first = orig_inode, *second = donor_inode; 193 struct inode *first = orig_inode, *second = donor_inode;
154 194
155 BUG_ON(orig_inode == NULL || donor_inode == NULL);
156
157 /* 195 /*
158 * Use the inode number to provide the stable locking order instead 196 * Use the inode number to provide the stable locking order instead
159 * of its address, because the C language doesn't guarantee you can 197 * of its address, because the C language doesn't guarantee you can
@@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
178static void 216static void
179mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) 217mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
180{ 218{
181 BUG_ON(orig_inode == NULL || donor_inode == NULL);
182
183 up_read(&EXT4_I(orig_inode)->i_data_sem); 219 up_read(&EXT4_I(orig_inode)->i_data_sem);
184 up_read(&EXT4_I(donor_inode)->i_data_sem); 220 up_read(&EXT4_I(donor_inode)->i_data_sem);
185} 221}
@@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
194static void 230static void
195mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) 231mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
196{ 232{
197 BUG_ON(orig_inode == NULL || donor_inode == NULL);
198
199 up_write(&EXT4_I(orig_inode)->i_data_sem); 233 up_write(&EXT4_I(orig_inode)->i_data_sem);
200 up_write(&EXT4_I(donor_inode)->i_data_sem); 234 up_write(&EXT4_I(donor_inode)->i_data_sem);
201} 235}
@@ -283,8 +317,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
283 } 317 }
284 318
285 if (new_flag) { 319 if (new_flag) {
286 get_ext_path(orig_path, orig_inode, eblock, err); 320 err = get_ext_path(orig_inode, eblock, &orig_path);
287 if (orig_path == NULL) 321 if (err)
288 goto out; 322 goto out;
289 323
290 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
@@ -293,9 +327,9 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
293 } 327 }
294 328
295 if (end_flag) { 329 if (end_flag) {
296 get_ext_path(orig_path, orig_inode, 330 err = get_ext_path(orig_inode,
297 le32_to_cpu(end_ext->ee_block) - 1, err); 331 le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
298 if (orig_path == NULL) 332 if (err)
299 goto out; 333 goto out;
300 334
301 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
@@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
519 * oext |-----------| 553 * oext |-----------|
520 * new_ext |-------| 554 * new_ext |-------|
521 */ 555 */
522 BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); 556 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
557 ext4_error(orig_inode->i_sb, __func__,
558 "new_ext_end(%u) should be less than or equal to "
559 "oext->ee_block(%u) + oext_alen(%d) - 1",
560 new_ext_end, le32_to_cpu(oext->ee_block),
561 oext_alen);
562 ret = -EIO;
563 goto out;
564 }
523 565
524 /* 566 /*
525 * Case: new_ext is smaller than original extent 567 * Case: new_ext is smaller than original extent
@@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
543 585
544 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, 586 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
545 o_end, &start_ext, &new_ext, &end_ext); 587 o_end, &start_ext, &new_ext, &end_ext);
588out:
546 return ret; 589 return ret;
547} 590}
548 591
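
This hunk is one instance of a pattern applied throughout move_extent.c in this series: a BUG_ON() that would crash the machine on corrupt on-disk data becomes an ext4_error() report plus an -EIO return, so bad metadata merely fails the ioctl. A simplified, hedged sketch of the before/after shape:

    #include <errno.h>
    #include <stdio.h>

    /* Before: BUG_ON(oext_end < new_ext_end);  -- crash on corrupt input */

    /* After: report and return -EIO so only this operation fails. */
    static int check_extent(unsigned int oext_end, unsigned int new_ext_end)
    {
        if (oext_end < new_ext_end) {
            fprintf(stderr, "new_ext_end(%u) should not pass oext end(%u)\n",
                    new_ext_end, oext_end);
            return -EIO;     /* bad metadata, not a kernel bug */
        }
        return 0;
    }

    int main(void)
    {
        return check_extent(10, 20) == -EIO ? 0 : 1;
    }
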
@@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
554 * @orig_off: block offset of original inode 597 * @orig_off: block offset of original inode
555 * @donor_off: block offset of donor inode 598 * @donor_off: block offset of donor inode
 556 * @max_count: the maximum length of extents	 599 * @max_count: the maximum length of extents
600 *
601 * Return 0 on success, or a negative error value on failure.
557 */ 602 */
558static void 603static int
559mext_calc_swap_extents(struct ext4_extent *tmp_dext, 604mext_calc_swap_extents(struct ext4_extent *tmp_dext,
560 struct ext4_extent *tmp_oext, 605 struct ext4_extent *tmp_oext,
561 ext4_lblk_t orig_off, ext4_lblk_t donor_off, 606 ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
564 ext4_lblk_t diff, orig_diff; 609 ext4_lblk_t diff, orig_diff;
565 struct ext4_extent dext_old, oext_old; 610 struct ext4_extent dext_old, oext_old;
566 611
612 BUG_ON(orig_off != donor_off);
613
614 /* original and donor extents have to cover the same block offset */
615 if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
616 le32_to_cpu(tmp_oext->ee_block) +
617 ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
618 return -ENODATA;
619
620 if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
621 le32_to_cpu(tmp_dext->ee_block) +
622 ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
623 return -ENODATA;
624
567 dext_old = *tmp_dext; 625 dext_old = *tmp_dext;
568 oext_old = *tmp_oext; 626 oext_old = *tmp_oext;
569 627
@@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
591 649
592 copy_extent_status(&oext_old, tmp_dext); 650 copy_extent_status(&oext_old, tmp_dext);
593 copy_extent_status(&dext_old, tmp_oext); 651 copy_extent_status(&dext_old, tmp_oext);
652
653 return 0;
594} 654}
595 655
596/** 656/**
@@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
631 mext_double_down_write(orig_inode, donor_inode); 691 mext_double_down_write(orig_inode, donor_inode);
632 692
633 /* Get the original extent for the block "orig_off" */ 693 /* Get the original extent for the block "orig_off" */
634 get_ext_path(orig_path, orig_inode, orig_off, err); 694 err = get_ext_path(orig_inode, orig_off, &orig_path);
635 if (orig_path == NULL) 695 if (err)
636 goto out; 696 goto out;
637 697
638 /* Get the donor extent for the head */ 698 /* Get the donor extent for the head */
639 get_ext_path(donor_path, donor_inode, donor_off, err); 699 err = get_ext_path(donor_inode, donor_off, &donor_path);
640 if (donor_path == NULL) 700 if (err)
641 goto out; 701 goto out;
642 depth = ext_depth(orig_inode); 702 depth = ext_depth(orig_inode);
643 oext = orig_path[depth].p_ext; 703 oext = orig_path[depth].p_ext;
@@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
647 dext = donor_path[depth].p_ext; 707 dext = donor_path[depth].p_ext;
648 tmp_dext = *dext; 708 tmp_dext = *dext;
649 709
650 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 710 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
651 donor_off, count); 711 donor_off, count);
712 if (err)
713 goto out;
652 714
653 /* Loop for the donor extents */ 715 /* Loop for the donor extents */
654 while (1) { 716 while (1) {
655 /* The extent for donor must be found. */ 717 /* The extent for donor must be found. */
656 BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); 718 if (!dext) {
719 ext4_error(donor_inode->i_sb, __func__,
720 "The extent for donor must be found");
721 err = -EIO;
722 goto out;
723 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
724 ext4_error(donor_inode->i_sb, __func__,
725 "Donor offset(%u) and the first block of donor "
726 "extent(%u) should be equal",
727 donor_off,
728 le32_to_cpu(tmp_dext.ee_block));
729 err = -EIO;
730 goto out;
731 }
657 732
658 /* Set donor extent to orig extent */ 733 /* Set donor extent to orig extent */
659 err = mext_leaf_block(handle, orig_inode, 734 err = mext_leaf_block(handle, orig_inode,
@@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
678 753
679 if (orig_path) 754 if (orig_path)
680 ext4_ext_drop_refs(orig_path); 755 ext4_ext_drop_refs(orig_path);
681 get_ext_path(orig_path, orig_inode, orig_off, err); 756 err = get_ext_path(orig_inode, orig_off, &orig_path);
682 if (orig_path == NULL) 757 if (err)
683 goto out; 758 goto out;
684 depth = ext_depth(orig_inode); 759 depth = ext_depth(orig_inode);
685 oext = orig_path[depth].p_ext; 760 oext = orig_path[depth].p_ext;
@@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
692 767
693 if (donor_path) 768 if (donor_path)
694 ext4_ext_drop_refs(donor_path); 769 ext4_ext_drop_refs(donor_path);
695 get_ext_path(donor_path, donor_inode, 770 err = get_ext_path(donor_inode, donor_off, &donor_path);
696 donor_off, err); 771 if (err)
697 if (donor_path == NULL)
698 goto out; 772 goto out;
699 depth = ext_depth(donor_inode); 773 depth = ext_depth(donor_inode);
700 dext = donor_path[depth].p_ext; 774 dext = donor_path[depth].p_ext;
@@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
705 } 779 }
706 tmp_dext = *dext; 780 tmp_dext = *dext;
707 781
708 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 782 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
709 donor_off, 783 donor_off, count - replaced_count);
710 count - replaced_count); 784 if (err)
785 goto out;
711 } 786 }
712 787
713out: 788out:
@@ -740,7 +815,7 @@ out:
740 * on success, or a negative error value on failure. 815 * on success, or a negative error value on failure.
741 */ 816 */
742static int 817static int
743move_extent_par_page(struct file *o_filp, struct inode *donor_inode, 818move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
744 pgoff_t orig_page_offset, int data_offset_in_page, 819 pgoff_t orig_page_offset, int data_offset_in_page,
745 int block_len_in_page, int uninit) 820 int block_len_in_page, int uninit)
746{ 821{
@@ -871,6 +946,7 @@ out:
871 if (PageLocked(page)) 946 if (PageLocked(page))
872 unlock_page(page); 947 unlock_page(page);
873 page_cache_release(page); 948 page_cache_release(page);
949 ext4_journal_stop(handle);
874 } 950 }
875out2: 951out2:
876 ext4_journal_stop(handle); 952 ext4_journal_stop(handle);
@@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode,
897 struct inode *donor_inode, __u64 orig_start, 973 struct inode *donor_inode, __u64 orig_start,
898 __u64 donor_start, __u64 *len, __u64 moved_len) 974 __u64 donor_start, __u64 *len, __u64 moved_len)
899{ 975{
976 ext4_lblk_t orig_blocks, donor_blocks;
977 unsigned int blkbits = orig_inode->i_blkbits;
978 unsigned int blocksize = 1 << blkbits;
979
900 /* Regular file check */ 980 /* Regular file check */
901 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { 981 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
902 ext4_debug("ext4 move extent: The argument files should be " 982 ext4_debug("ext4 move extent: The argument files should be "
@@ -960,54 +1040,58 @@ mext_check_arguments(struct inode *orig_inode,
960 return -EINVAL; 1040 return -EINVAL;
961 } 1041 }
962 1042
963 if ((orig_start > MAX_DEFRAG_SIZE) || 1043 if ((orig_start > EXT_MAX_BLOCK) ||
964 (donor_start > MAX_DEFRAG_SIZE) || 1044 (donor_start > EXT_MAX_BLOCK) ||
965 (*len > MAX_DEFRAG_SIZE) || 1045 (*len > EXT_MAX_BLOCK) ||
966 (orig_start + *len > MAX_DEFRAG_SIZE)) { 1046 (orig_start + *len > EXT_MAX_BLOCK)) {
967 ext4_debug("ext4 move extent: Can't handle over [%lu] blocks " 1047 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
968 "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE, 1048 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
969 orig_inode->i_ino, donor_inode->i_ino); 1049 orig_inode->i_ino, donor_inode->i_ino);
970 return -EINVAL; 1050 return -EINVAL;
971 } 1051 }
972 1052
973 if (orig_inode->i_size > donor_inode->i_size) { 1053 if (orig_inode->i_size > donor_inode->i_size) {
974 if (orig_start >= donor_inode->i_size) { 1054 donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
1055 /* TODO: eliminate this artificial restriction */
1056 if (orig_start >= donor_blocks) {
975 ext4_debug("ext4 move extent: orig start offset " 1057 ext4_debug("ext4 move extent: orig start offset "
976 "[%llu] should be less than donor file size " 1058 "[%llu] should be less than donor file blocks "
977 "[%lld] [ino:orig %lu, donor_inode %lu]\n", 1059 "[%u] [ino:orig %lu, donor %lu]\n",
978 orig_start, donor_inode->i_size, 1060 orig_start, donor_blocks,
979 orig_inode->i_ino, donor_inode->i_ino); 1061 orig_inode->i_ino, donor_inode->i_ino);
980 return -EINVAL; 1062 return -EINVAL;
981 } 1063 }
982 1064
983 if (orig_start + *len > donor_inode->i_size) { 1065 /* TODO: eliminate this artificial restriction */
1066 if (orig_start + *len > donor_blocks) {
984 ext4_debug("ext4 move extent: End offset [%llu] should " 1067 ext4_debug("ext4 move extent: End offset [%llu] should "
985 "be less than donor file size [%lld]." 1068 "be less than donor file blocks [%u]."
986 "So adjust length from %llu to %lld " 1069 "So adjust length from %llu to %llu "
987 "[ino:orig %lu, donor %lu]\n", 1070 "[ino:orig %lu, donor %lu]\n",
988 orig_start + *len, donor_inode->i_size, 1071 orig_start + *len, donor_blocks,
989 *len, donor_inode->i_size - orig_start, 1072 *len, donor_blocks - orig_start,
990 orig_inode->i_ino, donor_inode->i_ino); 1073 orig_inode->i_ino, donor_inode->i_ino);
991 *len = donor_inode->i_size - orig_start; 1074 *len = donor_blocks - orig_start;
992 } 1075 }
993 } else { 1076 } else {
994 if (orig_start >= orig_inode->i_size) { 1077 orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
1078 if (orig_start >= orig_blocks) {
995 ext4_debug("ext4 move extent: start offset [%llu] " 1079 ext4_debug("ext4 move extent: start offset [%llu] "
996 "should be less than original file size " 1080 "should be less than original file blocks "
997 "[%lld] [inode:orig %lu, donor %lu]\n", 1081 "[%u] [ino:orig %lu, donor %lu]\n",
998 orig_start, orig_inode->i_size, 1082 orig_start, orig_blocks,
999 orig_inode->i_ino, donor_inode->i_ino); 1083 orig_inode->i_ino, donor_inode->i_ino);
1000 return -EINVAL; 1084 return -EINVAL;
1001 } 1085 }
1002 1086
1003 if (orig_start + *len > orig_inode->i_size) { 1087 if (orig_start + *len > orig_blocks) {
1004 ext4_debug("ext4 move extent: Adjust length " 1088 ext4_debug("ext4 move extent: Adjust length "
1005 "from %llu to %lld. Because it should be " 1089 "from %llu to %llu. Because it should be "
1006 "less than original file size " 1090 "less than original file blocks "
1007 "[ino:orig %lu, donor %lu]\n", 1091 "[ino:orig %lu, donor %lu]\n",
1008 *len, orig_inode->i_size - orig_start, 1092 *len, orig_blocks - orig_start,
1009 orig_inode->i_ino, donor_inode->i_ino); 1093 orig_inode->i_ino, donor_inode->i_ino);
1010 *len = orig_inode->i_size - orig_start; 1094 *len = orig_blocks - orig_start;
1011 } 1095 }
1012 } 1096 }
1013 1097
@@ -1027,18 +1111,23 @@ mext_check_arguments(struct inode *orig_inode,
1027 * @inode1: the inode structure 1111 * @inode1: the inode structure
1028 * @inode2: the inode structure 1112 * @inode2: the inode structure
1029 * 1113 *
1030 * Lock two inodes' i_mutex by i_ino order. This function is moved from 1114 * Lock two inodes' i_mutex by i_ino order.
1031 * fs/inode.c. 1115 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1032 */ 1116 */
1033static void 1117static int
1034mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1118mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1035{ 1119{
1036 if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { 1120 int ret = 0;
1037 if (inode1) 1121
1038 mutex_lock(&inode1->i_mutex); 1122 BUG_ON(inode1 == NULL && inode2 == NULL);
1039 else if (inode2) 1123
1040 mutex_lock(&inode2->i_mutex); 1124 ret = mext_check_null_inode(inode1, inode2, __func__);
1041 return; 1125 if (ret < 0)
1126 goto out;
1127
1128 if (inode1 == inode2) {
1129 mutex_lock(&inode1->i_mutex);
1130 goto out;
1042 } 1131 }
1043 1132
1044 if (inode1->i_ino < inode2->i_ino) { 1133 if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1137,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1048 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1137 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1049 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1138 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1050 } 1139 }
1140
1141out:
1142 return ret;
1051} 1143}
1052 1144
1053/** 1145/**
@@ -1056,17 +1148,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1056 * @inode1: the inode that is released first 1148 * @inode1: the inode that is released first
1057 * @inode2: the inode that is released second 1149 * @inode2: the inode that is released second
1058 * 1150 *
1059 * This function is moved from fs/inode.c. 1151 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1060 */ 1152 */
1061 1153
1062static void 1154static int
1063mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1155mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1064{ 1156{
1157 int ret = 0;
1158
1159 BUG_ON(inode1 == NULL && inode2 == NULL);
1160
1161 ret = mext_check_null_inode(inode1, inode2, __func__);
1162 if (ret < 0)
1163 goto out;
1164
1065 if (inode1) 1165 if (inode1)
1066 mutex_unlock(&inode1->i_mutex); 1166 mutex_unlock(&inode1->i_mutex);
1067 1167
1068 if (inode2 && inode2 != inode1) 1168 if (inode2 && inode2 != inode1)
1069 mutex_unlock(&inode2->i_mutex); 1169 mutex_unlock(&inode2->i_mutex);
1170
1171out:
1172 return ret;
1070} 1173}
1071 1174
1072/** 1175/**
@@ -1123,70 +1226,76 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1123 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; 1226 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
1124 ext4_lblk_t rest_blocks; 1227 ext4_lblk_t rest_blocks;
1125 pgoff_t orig_page_offset = 0, seq_end_page; 1228 pgoff_t orig_page_offset = 0, seq_end_page;
1126 int ret, depth, last_extent = 0; 1229 int ret1, ret2, depth, last_extent = 0;
1127 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 1230 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
1128 int data_offset_in_page; 1231 int data_offset_in_page;
1129 int block_len_in_page; 1232 int block_len_in_page;
1130 int uninit; 1233 int uninit;
1131 1234
1132 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1133 mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0)
1238 return ret1;
1134 1239
1135 mext_double_down_read(orig_inode, donor_inode); 1240 mext_double_down_read(orig_inode, donor_inode);
1136 /* Check the filesystem environment whether move_extent can be done */ 1241 /* Check the filesystem environment whether move_extent can be done */
1137 ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 1242 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
1138 donor_start, &len, *moved_len); 1243 donor_start, &len, *moved_len);
1139 mext_double_up_read(orig_inode, donor_inode); 1244 mext_double_up_read(orig_inode, donor_inode);
1140 if (ret) 1245 if (ret1)
1141 goto out2; 1246 goto out;
1142 1247
1143 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; 1248 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
1144 block_end = block_start + len - 1; 1249 block_end = block_start + len - 1;
1145 if (file_end < block_end) 1250 if (file_end < block_end)
1146 len -= block_end - file_end; 1251 len -= block_end - file_end;
1147 1252
1148 get_ext_path(orig_path, orig_inode, block_start, ret); 1253 ret1 = get_ext_path(orig_inode, block_start, &orig_path);
1149 if (orig_path == NULL) 1254 if (ret1)
1150 goto out2; 1255 goto out;
1151 1256
1152 /* Get path structure to check the hole */ 1257 /* Get path structure to check the hole */
1153 get_ext_path(holecheck_path, orig_inode, block_start, ret); 1258 ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
1154 if (holecheck_path == NULL) 1259 if (ret1)
1155 goto out; 1260 goto out;
1156 1261
1157 depth = ext_depth(orig_inode); 1262 depth = ext_depth(orig_inode);
1158 ext_cur = holecheck_path[depth].p_ext; 1263 ext_cur = holecheck_path[depth].p_ext;
1159 if (ext_cur == NULL) {
1160 ret = -EINVAL;
1161 goto out;
1162 }
1163 1264
1164 /* 1265 /*
1165 * Get proper extent whose ee_block is beyond block_start 1266 * Get proper starting location of block replacement if block_start was
1166 * if block_start was within the hole. 1267 * within the hole.
1167 */ 1268 */
1168 if (le32_to_cpu(ext_cur->ee_block) + 1269 if (le32_to_cpu(ext_cur->ee_block) +
1169 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { 1270 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
1271 /*
1272 * The hole exists between extents or the tail of
1273 * original file.
1274 */
1170 last_extent = mext_next_extent(orig_inode, 1275 last_extent = mext_next_extent(orig_inode,
1171 holecheck_path, &ext_cur); 1276 holecheck_path, &ext_cur);
1172 if (last_extent < 0) { 1277 if (last_extent < 0) {
1173 ret = last_extent; 1278 ret1 = last_extent;
1174 goto out; 1279 goto out;
1175 } 1280 }
1176 last_extent = mext_next_extent(orig_inode, orig_path, 1281 last_extent = mext_next_extent(orig_inode, orig_path,
1177 &ext_dummy); 1282 &ext_dummy);
1178 if (last_extent < 0) { 1283 if (last_extent < 0) {
1179 ret = last_extent; 1284 ret1 = last_extent;
1180 goto out; 1285 goto out;
1181 } 1286 }
1182 } 1287 seq_start = le32_to_cpu(ext_cur->ee_block);
1183 seq_start = block_start; 1288 } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
1289 /* The hole exists at the beginning of original file. */
1290 seq_start = le32_to_cpu(ext_cur->ee_block);
1291 else
1292 seq_start = block_start;
1184 1293
1185 /* No blocks within the specified range. */ 1294 /* No blocks within the specified range. */
1186 if (le32_to_cpu(ext_cur->ee_block) > block_end) { 1295 if (le32_to_cpu(ext_cur->ee_block) > block_end) {
1187 ext4_debug("ext4 move extent: The specified range of file " 1296 ext4_debug("ext4 move extent: The specified range of file "
1188 "may be the hole\n"); 1297 "may be the hole\n");
1189 ret = -EINVAL; 1298 ret1 = -EINVAL;
1190 goto out; 1299 goto out;
1191 } 1300 }
1192 1301
@@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1206 last_extent = mext_next_extent(orig_inode, holecheck_path, 1315 last_extent = mext_next_extent(orig_inode, holecheck_path,
1207 &ext_cur); 1316 &ext_cur);
1208 if (last_extent < 0) { 1317 if (last_extent < 0) {
1209 ret = last_extent; 1318 ret1 = last_extent;
1210 break; 1319 break;
1211 } 1320 }
1212 add_blocks = ext4_ext_get_actual_len(ext_cur); 1321 add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1258 while (orig_page_offset <= seq_end_page) { 1367 while (orig_page_offset <= seq_end_page) {
1259 1368
1260 /* Swap original branches with new branches */ 1369 /* Swap original branches with new branches */
1261 ret = move_extent_par_page(o_filp, donor_inode, 1370 ret1 = move_extent_per_page(o_filp, donor_inode,
1262 orig_page_offset, 1371 orig_page_offset,
1263 data_offset_in_page, 1372 data_offset_in_page,
1264 block_len_in_page, uninit); 1373 block_len_in_page, uninit);
1265 if (ret < 0) 1374 if (ret1 < 0)
1266 goto out; 1375 goto out;
1267 orig_page_offset++; 1376 orig_page_offset++;
1268 /* Count how many blocks we have exchanged */ 1377 /* Count how many blocks we have exchanged */
1269 *moved_len += block_len_in_page; 1378 *moved_len += block_len_in_page;
1270 BUG_ON(*moved_len > len); 1379 if (*moved_len > len) {
1380 ext4_error(orig_inode->i_sb, __func__,
1381 "We replaced blocks too much! "
1382 "sum of replaced: %llu requested: %llu",
1383 *moved_len, len);
1384 ret1 = -EIO;
1385 goto out;
1386 }
1271 1387
1272 data_offset_in_page = 0; 1388 data_offset_in_page = 0;
1273 rest_blocks -= block_len_in_page; 1389 rest_blocks -= block_len_in_page;
@@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1280 /* Decrease buffer counter */ 1396 /* Decrease buffer counter */
1281 if (holecheck_path) 1397 if (holecheck_path)
1282 ext4_ext_drop_refs(holecheck_path); 1398 ext4_ext_drop_refs(holecheck_path);
1283 get_ext_path(holecheck_path, orig_inode, 1399 ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
1284 seq_start, ret); 1400 if (ret1)
1285 if (holecheck_path == NULL)
1286 break; 1401 break;
1287 depth = holecheck_path->p_depth; 1402 depth = holecheck_path->p_depth;
1288 1403
1289 /* Decrease buffer counter */ 1404 /* Decrease buffer counter */
1290 if (orig_path) 1405 if (orig_path)
1291 ext4_ext_drop_refs(orig_path); 1406 ext4_ext_drop_refs(orig_path);
1292 get_ext_path(orig_path, orig_inode, seq_start, ret); 1407 ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
1293 if (orig_path == NULL) 1408 if (ret1)
1294 break; 1409 break;
1295 1410
1296 ext_cur = holecheck_path[depth].p_ext; 1411 ext_cur = holecheck_path[depth].p_ext;
@@ -1307,14 +1422,13 @@ out:
1307 ext4_ext_drop_refs(holecheck_path); 1422 ext4_ext_drop_refs(holecheck_path);
1308 kfree(holecheck_path); 1423 kfree(holecheck_path);
1309 } 1424 }
1310out2:
1311 mext_inode_double_unlock(orig_inode, donor_inode);
1312 1425
1313 if (ret) 1426 ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
1314 return ret;
1315 1427
1316 /* All of the specified blocks must be exchanged successfully */	1428 if (ret1)
1317 BUG_ON(*moved_len != len); 1429 return ret1;
1430 else if (ret2)
1431 return ret2;
1318 1432
1319 return 0; 1433 return 0;
1320} 1434}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013d16ff..42f81d285cd5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1518 return retval; 1518 return retval;
1519 1519
1520 if (blocks == 1 && !dx_fallback && 1520 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) 1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
1522 return make_indexed_dir(handle, dentry, inode, bh); 1522 retval = make_indexed_dir(handle, dentry, inode, bh);
1523 if (retval == -ENOSPC)
1524 brelse(bh);
1525 return retval;
1526 }
1523 brelse(bh); 1527 brelse(bh);
1524 } 1528 }
1525 bh = ext4_append(handle, dir, &block, &retval); 1529 bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1528 de = (struct ext4_dir_entry_2 *) bh->b_data; 1532 de = (struct ext4_dir_entry_2 *) bh->b_data;
1529 de->inode = 0; 1533 de->inode = 0;
1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1534 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1531 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1535 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1536 if (retval == -ENOSPC)
1537 brelse(bh);
1538 return retval;
1532} 1539}
1533 1540
1534/* 1541/*
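
Both ext4_add_entry() hunks plug the same leak: add_dirent_to_buf() (and make_indexed_dir() through it) consumes the buffer head on every outcome except -ENOSPC, so on that one error the caller still owns bh and must brelse() it. A tiny sketch of this conditional-ownership convention, with invented names:

    #include <errno.h>
    #include <stdlib.h>

    struct buf { int dummy; };

    static void buf_put(struct buf *b) { free(b); }   /* plays brelse() */

    /* Consumes b on success and on most errors, but NOT on -ENOSPC. */
    static int add_entry_to_buf(struct buf *b, int block_full)
    {
        if (block_full)
            return -ENOSPC;   /* caller keeps ownership of b */
        buf_put(b);           /* otherwise ownership transfers here */
        return 0;
    }

    int main(void)
    {
        struct buf *b = malloc(sizeof(*b));
        int ret = add_entry_to_buf(b, 1);

        if (ret == -ENOSPC)
            buf_put(b);       /* the one case we must release it ourselves */
        return ret == -ENOSPC ? 0 : 1;
    }
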
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1590 goto cleanup; 1597 goto cleanup;
1591 node2 = (struct dx_node *)(bh2->b_data); 1598 node2 = (struct dx_node *)(bh2->b_data);
1592 entries2 = node2->entries; 1599 entries2 = node2->entries;
1600 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1593 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, 1601 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1594 sb->s_blocksize); 1602 sb->s_blocksize);
1595 node2->fake.inode = 0;
1596 BUFFER_TRACE(frame->bh, "get_write_access"); 1603 BUFFER_TRACE(frame->bh, "get_write_access");
1597 err = ext4_journal_get_write_access(handle, frame->bh); 1604 err = ext4_journal_get_write_access(handle, frame->bh);
1598 if (err) 1605 if (err)
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1657 if (!de) 1664 if (!de)
1658 goto cleanup; 1665 goto cleanup;
1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1666 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1660 bh = NULL; 1667 if (err != -ENOSPC)
1668 bh = NULL;
1661 goto cleanup; 1669 goto cleanup;
1662 1670
1663journal_error: 1671journal_error:
@@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
2310 struct inode *inode = old_dentry->d_inode; 2318 struct inode *inode = old_dentry->d_inode;
2311 int err, retries = 0; 2319 int err, retries = 0;
2312 2320
2313 if (EXT4_DIR_LINK_MAX(inode)) 2321 if (inode->i_nlink >= EXT4_LINK_MAX)
2314 return -EMLINK; 2322 return -EMLINK;
2315 2323
2316 /* 2324 /*
@@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2413 goto end_rename; 2421 goto end_rename;
2414 retval = -EMLINK; 2422 retval = -EMLINK;
2415 if (!new_inode && new_dir != old_dir && 2423 if (!new_inode && new_dir != old_dir &&
2416 new_dir->i_nlink >= EXT4_LINK_MAX) 2424 EXT4_DIR_LINK_MAX(new_dir))
2417 goto end_rename; 2425 goto end_rename;
2418 } 2426 }
2419 if (!new_bh) { 2427 if (!new_bh) {
@@ -2536,7 +2544,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2536 .listxattr = ext4_listxattr, 2544 .listxattr = ext4_listxattr,
2537 .removexattr = generic_removexattr, 2545 .removexattr = generic_removexattr,
2538#endif 2546#endif
2539 .permission = ext4_permission, 2547 .check_acl = ext4_check_acl,
2540 .fiemap = ext4_fiemap, 2548 .fiemap = ext4_fiemap,
2541}; 2549};
2542 2550
@@ -2548,5 +2556,5 @@ const struct inode_operations ext4_special_inode_operations = {
2548 .listxattr = ext4_listxattr, 2556 .listxattr = ext4_listxattr,
2549 .removexattr = generic_removexattr, 2557 .removexattr = generic_removexattr,
2550#endif 2558#endif
2551 .permission = ext4_permission, 2559 .check_acl = ext4_check_acl,
2552}; 2560};
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
746 struct inode *inode = NULL; 746 struct inode *inode = NULL;
747 handle_t *handle; 747 handle_t *handle;
748 int gdb_off, gdb_num; 748 int gdb_off, gdb_num;
749 int num_grp_locked = 0;
750 int err, err2; 749 int err, err2;
751 750
752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 751 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
856 * using the new disk blocks. 855 * using the new disk blocks.
857 */ 856 */
858 857
859 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
860 /* Update group descriptor block for new group */ 858 /* Update group descriptor block for new group */
861 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 859 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
862 gdb_off * EXT4_DESC_SIZE(sb)); 860 gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
875 * descriptor 873 * descriptor
876 */ 874 */
877 err = ext4_mb_add_groupinfo(sb, input->group, gdp); 875 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
878 if (err) { 876 if (err)
879 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
880 goto exit_journal; 877 goto exit_journal;
881 }
882 878
883 /* 879 /*
884 * Make the new blocks and inodes valid next. We do this before 880 * Make the new blocks and inodes valid next. We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
920 916
921 /* Update the global fs size fields */ 917 /* Update the global fs size fields */
922 sbi->s_groups_count++; 918 sbi->s_groups_count++;
923 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
924 919
925 ext4_handle_dirty_metadata(handle, NULL, primary); 920 ext4_handle_dirty_metadata(handle, NULL, primary);
926 921
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..a6b1ab734728 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
45#include "ext4_jbd2.h" 45#include "ext4_jbd2.h"
46#include "xattr.h" 46#include "xattr.h"
47#include "acl.h" 47#include "acl.h"
48#include "mballoc.h"
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
50#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
@@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
344 errstr = "Out of memory"; 345 errstr = "Out of memory";
345 break; 346 break;
346 case -EROFS: 347 case -EROFS:
347 if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) 348 if (!sb || (EXT4_SB(sb)->s_journal &&
349 EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
348 errstr = "Journal has aborted"; 350 errstr = "Journal has aborted";
349 else 351 else
350 errstr = "Readonly filesystem"; 352 errstr = "Readonly filesystem";
@@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb,
1279 *journal_devnum = option; 1281 *journal_devnum = option;
1280 break; 1282 break;
1281 case Opt_journal_checksum: 1283 case Opt_journal_checksum:
1282 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1284 break; /* Kept for backwards compatibility */
1283 break;
1284 case Opt_journal_async_commit: 1285 case Opt_journal_async_commit:
1285 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1286 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1286 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1287 break; 1287 break;
1288 case Opt_noload: 1288 case Opt_noload:
1289 set_opt(sbi->s_mount_opt, NOLOAD); 1289 set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
1695 gdp = ext4_get_group_desc(sb, i, NULL); 1695 gdp = ext4_get_group_desc(sb, i, NULL);
1696 1696
1697 flex_group = ext4_flex_group(sbi, i); 1697 flex_group = ext4_flex_group(sbi, i);
1698 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, 1698 atomic_add(ext4_free_inodes_count(sb, gdp),
1699 ext4_free_inodes_count(sb, gdp)); 1699 &sbi->s_flex_groups[flex_group].free_inodes);
1700 atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, 1700 atomic_add(ext4_free_blks_count(sb, gdp),
1701 ext4_free_blks_count(sb, gdp)); 1701 &sbi->s_flex_groups[flex_group].free_blocks);
1702 atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, 1702 atomic_add(ext4_used_dirs_count(sb, gdp),
1703 ext4_used_dirs_count(sb, gdp)); 1703 &sbi->s_flex_groups[flex_group].used_dirs);
1704 } 1704 }
1705 1705
1706 return 1; 1706 return 1;
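
Several block groups fold into each flex group, so the per-flex-group counters have to accumulate across the loop; the previous atomic_set() discarded every group but the last one mapped to a slot. A sketch of the aggregation with C11 atomics (group counts and sizes invented):

    #include <stdatomic.h>
    #include <stdio.h>

    #define GROUPS     8
    #define FLEX_SHIFT 2      /* 2^2 = 4 block groups per flex group (assumed) */

    int main(void)
    {
        unsigned int free_inodes[GROUPS] = { 5, 7, 3, 1, 9, 2, 4, 6 };
        atomic_uint flex_free[GROUPS >> FLEX_SHIFT];

        for (int i = 0; i < GROUPS >> FLEX_SHIFT; i++)
            atomic_init(&flex_free[i], 0);

        for (int i = 0; i < GROUPS; i++) {
            int flex = i >> FLEX_SHIFT;
            /* add, don't set: each flex slot sums several groups */
            atomic_fetch_add(&flex_free[flex], free_inodes[i]);
        }
        printf("flex 0: %u, flex 1: %u\n",   /* 16 and 21 */
               atomic_load(&flex_free[0]), atomic_load(&flex_free[1]));
        return 0;
    }
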
@@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = {
2253 .release = ext4_sb_release, 2253 .release = ext4_sb_release,
2254}; 2254};
2255 2255
2256/*
2257 * Check whether this filesystem can be mounted based on
2258 * the features present and the RDONLY/RDWR mount requested.
2259 * Returns 1 if this filesystem can be mounted as requested,
2260 * 0 if it cannot be.
2261 */
2262static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2263{
2264 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
2265 ext4_msg(sb, KERN_ERR,
2266 "Couldn't mount because of "
2267 "unsupported optional features (%x)",
2268 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2269 ~EXT4_FEATURE_INCOMPAT_SUPP));
2270 return 0;
2271 }
2272
2273 if (readonly)
2274 return 1;
2275
2276 /* Check that feature set is OK for a read-write mount */
2277 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
2278 ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
2279 "unsupported optional features (%x)",
2280 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2281 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2282 return 0;
2283 }
2284 /*
2285 * Large file size enabled file system can only be mounted
2286 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
2287 */
2288 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2289 if (sizeof(blkcnt_t) < sizeof(u64)) {
2290 ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
2291 "cannot be mounted RDWR without "
2292 "CONFIG_LBDAF");
2293 return 0;
2294 }
2295 }
2296 return 1;
2297}
2298
2256static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2299static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2257 __releases(kernel_lock) 2300 __releases(kernel_lock)
2258 __acquires(kernel_lock) 2301 __acquires(kernel_lock)
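
ext4_feature_set_ok() collapses the duplicated mount and remount feature tests into one predicate: any unknown incompat bit refuses every mount, while unknown ro-compat bits only refuse read-write. A hedged sketch of the bitmask logic with made-up feature masks:

    #include <stdio.h>

    #define INCOMPAT_SUPP  0x01u   /* invented: bits this kernel implements */
    #define RO_COMPAT_SUPP 0x02u

    /* Return 1 if the on-disk feature words allow the requested mount. */
    static int feature_set_ok(unsigned incompat, unsigned ro_compat, int readonly)
    {
        if (incompat & ~INCOMPAT_SUPP)
            return 0;          /* unknown incompat bit: refuse outright */
        if (readonly)
            return 1;          /* unknown ro-compat bits are fine read-only */
        return !(ro_compat & ~RO_COMPAT_SUPP);
    }

    int main(void)
    {
        printf("ro: %d, rw: %d\n",
               feature_set_ok(0x0, 0x4, 1),    /* 1: mountable read-only */
               feature_set_ok(0x0, 0x4, 0));   /* 0: refuse read-write */
        return 0;
    }
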
@@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2274 unsigned int db_count; 2317 unsigned int db_count;
2275 unsigned int i; 2318 unsigned int i;
2276 int needs_recovery, has_huge_files; 2319 int needs_recovery, has_huge_files;
2277 int features;
2278 __u64 blocks_count; 2320 __u64 blocks_count;
2279 int err; 2321 int err;
2280 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 2322 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2401 * previously didn't change the revision level when setting the flags, 2443 * previously didn't change the revision level when setting the flags,
2402 * so there is a chance incompat flags are set on a rev 0 filesystem. 2444 * so there is a chance incompat flags are set on a rev 0 filesystem.
2403 */ 2445 */
2404 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2446 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
2405 if (features) {
2406 ext4_msg(sb, KERN_ERR,
2407 "Couldn't mount because of "
2408 "unsupported optional features (%x)",
2409 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2410 ~EXT4_FEATURE_INCOMPAT_SUPP));
2411 goto failed_mount;
2412 }
2413 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2414 if (!(sb->s_flags & MS_RDONLY) && features) {
2415 ext4_msg(sb, KERN_ERR,
2416 "Couldn't mount RDWR because of "
2417 "unsupported optional features (%x)",
2418 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2419 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2420 goto failed_mount; 2447 goto failed_mount;
2421 } 2448
2422 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2423 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2424 if (has_huge_files) {
2425 /*
2426 * Large file size enabled file system can only be
2427 * mount if kernel is build with CONFIG_LBDAF
2428 */
2429 if (sizeof(root->i_blocks) < sizeof(u64) &&
2430 !(sb->s_flags & MS_RDONLY)) {
2431 ext4_msg(sb, KERN_ERR, "Filesystem with huge "
2432 "files cannot be mounted read-write "
2433 "without CONFIG_LBDAF");
2434 goto failed_mount;
2435 }
2436 }
2437 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 2449 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
2438 2450
2439 if (blocksize < EXT4_MIN_BLOCK_SIZE || 2451 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2469 } 2481 }
2470 } 2482 }
2471 2483
2484 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2485 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2472 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, 2486 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
2473 has_huge_files); 2487 has_huge_files);
2474 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); 2488 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2549 goto failed_mount; 2563 goto failed_mount;
2550 } 2564 }
2551 2565
2552 if (ext4_blocks_count(es) > 2566 /*
2553 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 2567 * Test whether we have more sectors than will fit in sector_t,
2568 * and whether the max offset is addressable by the page cache.
2569 */
2570 if ((ext4_blocks_count(es) >
2571 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
2572 (ext4_blocks_count(es) >
2573 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2554 ext4_msg(sb, KERN_ERR, "filesystem" 2574 ext4_msg(sb, KERN_ERR, "filesystem"
2555 " too large to mount safely"); 2575 " too large to mount safely on this system");
2556 if (sizeof(sector_t) < 8) 2576 if (sizeof(sector_t) < 8)
2557 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 2577 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2578 ret = -EFBIG;
2558 goto failed_mount; 2579 goto failed_mount;
2559 } 2580 }
2560 2581
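
The enlarged size check tests two independent limits: the block count expressed in 512-byte sectors must fit in sector_t, and the index of the last page-cache page must fit in pgoff_t. With 1 KiB blocks, 4 KiB pages, and 32-bit types (no CONFIG_LBDAF), the page-cache bound (about 2^30 blocks, 1 TiB) is the tighter of the two. A small arithmetic sketch mirroring the quoted expressions, parameters assumed:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned blocksize_bits = 10;           /* 1 KiB blocks (assumed) */
        unsigned page_shift     = 12;           /* 4 KiB pages (assumed) */
        uint64_t max_sector     = UINT32_MAX;   /* 32-bit sector_t, no LBDAF */
        uint64_t max_pgoff      = UINT32_MAX;   /* 32-bit pgoff_t */

        /* block count as 512-byte sectors must fit in sector_t */
        uint64_t by_sectors = max_sector >> (blocksize_bits - 9);
        /* the last block's page index must fit in pgoff_t */
        uint64_t by_pgcache = max_pgoff >> (page_shift - blocksize_bits);

        printf("sector_t bound:   %llu blocks (~2 TiB)\n",
               (unsigned long long)by_sectors);
        printf("page cache bound: %llu blocks (~1 TiB, the tighter one)\n",
               (unsigned long long)by_pgcache);
        return 0;
    }
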
@@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2595 goto failed_mount; 2616 goto failed_mount;
2596 } 2617 }
2597 sbi->s_groups_count = blocks_count; 2618 sbi->s_groups_count = blocks_count;
2619 sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
2620 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
2598 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2621 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2599 EXT4_DESC_PER_BLOCK(sb); 2622 EXT4_DESC_PER_BLOCK(sb);
2600 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 2623 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2729 goto failed_mount4; 2752 goto failed_mount4;
2730 } 2753 }
2731 2754
2732 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2755 jbd2_journal_set_features(sbi->s_journal,
2733 jbd2_journal_set_features(sbi->s_journal, 2756 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2734 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 2757 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
2758 jbd2_journal_set_features(sbi->s_journal, 0, 0,
2735 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2759 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2736 } else if (test_opt(sb, JOURNAL_CHECKSUM)) { 2760 else
2737 jbd2_journal_set_features(sbi->s_journal,
2738 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2739 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 2761 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2740 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2762 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2741 } else {
2742 jbd2_journal_clear_features(sbi->s_journal,
2743 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2744 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2745 }
2746 2763
2747 /* We have now updated the journal if required, so we can 2764 /* We have now updated the journal if required, so we can
2748 * validate the data journaling mode. */ 2765 * validate the data journaling mode. */
@@ -3208,7 +3225,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3208 clear_buffer_write_io_error(sbh); 3225 clear_buffer_write_io_error(sbh);
3209 set_buffer_uptodate(sbh); 3226 set_buffer_uptodate(sbh);
3210 } 3227 }
3211 es->s_wtime = cpu_to_le32(get_seconds()); 3228 /*
3229 * If the file system is mounted read-only, don't update the
3230 * superblock write time. This avoids updating the superblock
3231 * write time when we are mounting the root file system
3232 * read/only but we need to replay the journal; at that point,
3233 * for people who are east of GMT and who make their clock
3234 * tick in localtime for Windows bug-for-bug compatibility,
3235 * the clock is set in the future, and this will cause e2fsck
3236 * to complain and force a full file system check.
3237 */
3238 if (!(sb->s_flags & MS_RDONLY))
3239 es->s_wtime = cpu_to_le32(get_seconds());
3212 es->s_kbytes_written = 3240 es->s_kbytes_written =
3213 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3241 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3214 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3242 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3477 if (sbi->s_journal) 3505 if (sbi->s_journal)
3478 ext4_mark_recovery_complete(sb, es); 3506 ext4_mark_recovery_complete(sb, es);
3479 } else { 3507 } else {
3480 int ret; 3508 /* Make sure we can mount this feature set readwrite */
3481 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3509 if (!ext4_feature_set_ok(sb, 0)) {
3482 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3483 ext4_msg(sb, KERN_WARNING, "couldn't "
3484 "remount RDWR because of unsupported "
3485 "optional features (%x)",
3486 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3487 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3488 err = -EROFS; 3510 err = -EROFS;
3489 goto restore_opts; 3511 goto restore_opts;
3490 } 3512 }
3491
3492 /* 3513 /*
3493 * Make sure the group descriptor checksums 3514 * Make sure the group descriptor checksums
3494 * are sane. If they aren't, refuse to remount r/w. 3515 * are sane. If they aren't, refuse to remount r/w.
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..fed5b01d7a8d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
810 get_bh(new_bh); 810 get_bh(new_bh);
811 } else { 811 } else {
812 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
813 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal, block;
814
815 goal = ext4_group_first_block_no(sb,
814 EXT4_I(inode)->i_block_group); 816 EXT4_I(inode)->i_block_group);
815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, 817
818 /* non-extent files can't have physical blocks past 2^32 */
819 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
820 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
821
822 block = ext4_new_meta_blocks(handle, inode,
816 goal, NULL, &error); 823 goal, NULL, &error);
817 if (error) 824 if (error)
818 goto cleanup; 825 goto cleanup;
826
827 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
828 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
829
819 ea_idebug(inode, "creating block %d", block); 830 ea_idebug(inode, "creating block %d", block);
820 831
821 new_bh = sb_getblk(sb, block); 832 new_bh = sb_getblk(sb, block);
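
Indirect-mapped (non-extent) files store block pointers as 32-bit numbers, so the xattr block's allocation goal is masked with EXT4_MAX_BLOCK_FILE_PHYS to keep the allocator below 2^32 for such inodes. A tiny sketch of the clamp, assuming the constant is 0xFFFFFFFF:

    #include <stdint.h>
    #include <stdio.h>

    /* assumed value of EXT4_MAX_BLOCK_FILE_PHYS: the largest 32-bit block */
    #define MAX_BLOCK_FILE_PHYS 0xFFFFFFFFULL

    int main(void)
    {
        uint64_t goal = 0x123456789ULL;   /* a goal above 2^32 */
        int extent_based = 0;             /* indirect-mapped file */

        if (!extent_based)
            goal &= MAX_BLOCK_FILE_PHYS;  /* clamp into the 32-bit range */
        printf("goal = 0x%llx\n", (unsigned long long)goal); /* 0x23456789 */
        return 0;
    }
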
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f042b965c95c..e8c159de236b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
176 176
177 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 177 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
178 mark_inode_dirty(inode); 178 mark_inode_dirty(inode);
179 if (IS_SYNC(inode)) 179 if (IS_SYNC(inode)) {
180 err = sync_page_range_nolock(inode, mapping, start, count); 180 int err2;
181
182 /*
183 * Opencode syncing since we don't have a file open to use
184 * standard fsync path.
185 */
186 err = filemap_fdatawrite_range(mapping, start,
187 start + count - 1);
188 err2 = sync_mapping_buffers(mapping);
189 if (!err)
190 err = err2;
191 err2 = write_inode_now(inode, 1);
192 if (!err)
193 err = err2;
194 if (!err) {
195 err = filemap_fdatawait_range(mapping, start,
196 start + count - 1);
197 }
198 }
181out: 199out:
182 return err; 200 return err;
183} 201}
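The open-coded sequence above, restated as a standalone helper to show the ordering and error precedence (a sketch, not part of the patch): write the data range, flush the mapping's metadata buffers, write the inode synchronously, then wait on the range; the first error wins.

static int fat_sync_range(struct inode *inode, loff_t start, loff_t count)
{
	struct address_space *mapping = inode->i_mapping;
	int err, err2;

	err = filemap_fdatawrite_range(mapping, start, start + count - 1);
	err2 = sync_mapping_buffers(mapping);	/* metadata buffers */
	if (!err)
		err = err2;
	err2 = write_inode_now(inode, 1);	/* the inode itself */
	if (!err)
		err = err2;
	if (!err)
		err = filemap_fdatawait_range(mapping, start,
					      start + count - 1);
	return err;
}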
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c20473dfd7..4e35be873e09 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
119 MSDOS_I(inode)->i_start = new_dclus; 119 MSDOS_I(inode)->i_start = new_dclus;
120 MSDOS_I(inode)->i_logstart = new_dclus; 120 MSDOS_I(inode)->i_logstart = new_dclus;
121 /* 121 /*
122 * Since generic_osync_inode() synchronize later if 122 * Since generic_write_sync() synchronizes regular files later,
 123 * this is not directory, we don't here. 123 * we only sync directories here.
124 */ 124 */
125 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { 125 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
126 ret = fat_sync_inode(inode); 126 ret = fat_sync_inode(inode);
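The surrounding logic in fat_chain_add (the else branch is assumed from context) makes the comment concrete: a dirsync directory is synced on the spot, while a regular file is merely marked dirty and left to generic_write_sync() on the write path.

	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
		ret = fat_sync_inode(inode);	/* directories: sync now */
		if (ret)
			return ret;
	} else
		mark_inode_dirty(inode);	/* regular files: synced later */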
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..8e1e5e19d21e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,245 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/kthread.h>
23#include <linux/freezer.h>
22#include <linux/writeback.h> 24#include <linux/writeback.h>
23#include <linux/blkdev.h> 25#include <linux/blkdev.h>
24#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
25#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
26#include "internal.h" 28#include "internal.h"
27 29
30#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
28 31
29/** 32/*
 30 * writeback_acquire - attempt to get exclusive writeback access to a device 33 * We don't actually have pdflush, but this one is exported through /proc...
31 * @bdi: the device's backing_dev_info structure 34 */
32 * 35int nr_pdflush_threads;
33 * It is a waste of resources to have more than one pdflush thread blocked on 36
34 * a single request queue. Exclusion at the request_queue level is obtained 37/*
35 * via a flag in the request_queue's backing_dev_info.state. 38 * Passed into wb_writeback(), essentially a subset of writeback_control
36 * 39 */
37 * Non-request_queue-backed address_spaces will share default_backing_dev_info, 40struct wb_writeback_args {
38 * unless they implement their own. Which is somewhat inefficient, as this 41 long nr_pages;
39 * may prevent concurrent writeback against multiple devices. 42 struct super_block *sb;
43 enum writeback_sync_modes sync_mode;
44 int for_kupdate;
45 int range_cyclic;
46};
47
48/*
49 * Work items for the bdi_writeback threads
40 */ 50 */
41static int writeback_acquire(struct backing_dev_info *bdi) 51struct bdi_work {
52 struct list_head list; /* pending work list */
53 struct rcu_head rcu_head; /* for RCU free/clear of work */
54
55 unsigned long seen; /* threads that have seen this work */
56 atomic_t pending; /* number of threads still to do work */
57
58 struct wb_writeback_args args; /* writeback arguments */
59
60 unsigned long state; /* flag bits, see WS_* */
61};
62
63enum {
64 WS_USED_B = 0,
65 WS_ONSTACK_B,
66};
67
68#define WS_USED (1 << WS_USED_B)
69#define WS_ONSTACK (1 << WS_ONSTACK_B)
70
71static inline bool bdi_work_on_stack(struct bdi_work *work)
72{
73 return test_bit(WS_ONSTACK_B, &work->state);
74}
75
76static inline void bdi_work_init(struct bdi_work *work,
77 struct wb_writeback_args *args)
42{ 78{
43 return !test_and_set_bit(BDI_pdflush, &bdi->state); 79 INIT_RCU_HEAD(&work->rcu_head);
80 work->args = *args;
81 work->state = WS_USED;
44} 82}
45 83
46/** 84/**
47 * writeback_in_progress - determine whether there is writeback in progress 85 * writeback_in_progress - determine whether there is writeback in progress
48 * @bdi: the device's backing_dev_info structure. 86 * @bdi: the device's backing_dev_info structure.
49 * 87 *
50 * Determine whether there is writeback in progress against a backing device. 88 * Determine whether there is writeback waiting to be handled against a
89 * backing device.
51 */ 90 */
52int writeback_in_progress(struct backing_dev_info *bdi) 91int writeback_in_progress(struct backing_dev_info *bdi)
53{ 92{
54 return test_bit(BDI_pdflush, &bdi->state); 93 return !list_empty(&bdi->work_list);
55} 94}
56 95
57/** 96static void bdi_work_clear(struct bdi_work *work)
58 * writeback_release - relinquish exclusive writeback access against a device.
59 * @bdi: the device's backing_dev_info structure
60 */
61static void writeback_release(struct backing_dev_info *bdi)
62{ 97{
63 BUG_ON(!writeback_in_progress(bdi)); 98 clear_bit(WS_USED_B, &work->state);
64 clear_bit(BDI_pdflush, &bdi->state); 99 smp_mb__after_clear_bit();
100 /*
101 * work can have disappeared at this point. bit waitq functions
102 * should be able to tolerate this, provided bdi_sched_wait does
 103 * not dereference its pointer argument.
104 */
105 wake_up_bit(&work->state, WS_USED_B);
65} 106}
66 107
67static noinline void block_dump___mark_inode_dirty(struct inode *inode) 108static void bdi_work_free(struct rcu_head *head)
68{ 109{
69 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 110 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
70 struct dentry *dentry;
71 const char *name = "?";
72 111
73 dentry = d_find_alias(inode); 112 if (!bdi_work_on_stack(work))
74 if (dentry) { 113 kfree(work);
75 spin_lock(&dentry->d_lock); 114 else
76 name = (const char *) dentry->d_name.name; 115 bdi_work_clear(work);
77 }
78 printk(KERN_DEBUG
79 "%s(%d): dirtied inode %lu (%s) on %s\n",
80 current->comm, task_pid_nr(current), inode->i_ino,
81 name, inode->i_sb->s_id);
82 if (dentry) {
83 spin_unlock(&dentry->d_lock);
84 dput(dentry);
85 }
86 }
87} 116}
88 117
89/** 118static void wb_work_complete(struct bdi_work *work)
90 * __mark_inode_dirty - internal function
91 * @inode: inode to mark
92 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
93 * Mark an inode as dirty. Callers should use mark_inode_dirty or
94 * mark_inode_dirty_sync.
95 *
96 * Put the inode on the super block's dirty list.
97 *
98 * CAREFUL! We mark it dirty unconditionally, but move it onto the
99 * dirty list only if it is hashed or if it refers to a blockdev.
100 * If it was not hashed, it will never be added to the dirty list
101 * even if it is later hashed, as it will have been marked dirty already.
102 *
103 * In short, make sure you hash any inodes _before_ you start marking
104 * them dirty.
105 *
106 * This function *must* be atomic for the I_DIRTY_PAGES case -
107 * set_page_dirty() is called under spinlock in several places.
108 *
109 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
110 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
111 * the kernel-internal blockdev inode represents the dirtying time of the
112 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
113 * page->mapping->host, so the page-dirtying time is recorded in the internal
114 * blockdev inode.
115 */
116void __mark_inode_dirty(struct inode *inode, int flags)
117{ 119{
118 struct super_block *sb = inode->i_sb; 120 const enum writeback_sync_modes sync_mode = work->args.sync_mode;
121 int onstack = bdi_work_on_stack(work);
119 122
120 /* 123 /*
121 * Don't do this for I_DIRTY_PAGES - that doesn't actually 124 * For allocated work, we can clear the done/seen bit right here.
122 * dirty the inode itself 125 * For on-stack work, we need to postpone both the clear and free
126 * to after the RCU grace period, since the stack could be invalidated
127 * as soon as bdi_work_clear() has done the wakeup.
123 */ 128 */
124 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 129 if (!onstack)
125 if (sb->s_op->dirty_inode) 130 bdi_work_clear(work);
126 sb->s_op->dirty_inode(inode); 131 if (sync_mode == WB_SYNC_NONE || onstack)
127 } 132 call_rcu(&work->rcu_head, bdi_work_free);
133}
128 134
135static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
136{
129 /* 137 /*
130 * make sure that changes are seen by all cpus before we test i_state 138 * The caller has retrieved the work arguments from this work,
131 * -- mikulas 139 * drop our reference. If this is the last ref, delete and free it
132 */ 140 */
133 smp_mb(); 141 if (atomic_dec_and_test(&work->pending)) {
142 struct backing_dev_info *bdi = wb->bdi;
134 143
135 /* avoid the locking if we can */ 144 spin_lock(&bdi->wb_lock);
136 if ((inode->i_state & flags) == flags) 145 list_del_rcu(&work->list);
137 return; 146 spin_unlock(&bdi->wb_lock);
138 147
139 if (unlikely(block_dump)) 148 wb_work_complete(work);
140 block_dump___mark_inode_dirty(inode); 149 }
150}
141 151
142 spin_lock(&inode_lock); 152static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
143 if ((inode->i_state & flags) != flags) { 153{
144 const int was_dirty = inode->i_state & I_DIRTY; 154 work->seen = bdi->wb_mask;
155 BUG_ON(!work->seen);
156 atomic_set(&work->pending, bdi->wb_cnt);
157 BUG_ON(!bdi->wb_cnt);
145 158
146 inode->i_state |= flags; 159 /*
160 * list_add_tail_rcu() contains the necessary barriers to
161 * make sure the above stores are seen before the item is
162 * noticed on the list
163 */
164 spin_lock(&bdi->wb_lock);
165 list_add_tail_rcu(&work->list, &bdi->work_list);
166 spin_unlock(&bdi->wb_lock);
147 167
148 /* 168 /*
149 * If the inode is being synced, just update its dirty state. 169 * If the default thread isn't there, make sure we add it. When
150 * The unlocker will place the inode on the appropriate 170 * it gets created and wakes up, we'll run this work.
151 * superblock list, based upon its state. 171 */
152 */ 172 if (unlikely(list_empty_careful(&bdi->wb_list)))
153 if (inode->i_state & I_SYNC) 173 wake_up_process(default_backing_dev_info.wb.task);
154 goto out; 174 else {
175 struct bdi_writeback *wb = &bdi->wb;
155 176
156 /* 177 if (wb->task)
157 * Only add valid (hashed) inodes to the superblock's 178 wake_up_process(wb->task);
158 * dirty list. Add blockdev inodes as well. 179 }
159 */ 180}
160 if (!S_ISBLK(inode->i_mode)) {
161 if (hlist_unhashed(&inode->i_hash))
162 goto out;
163 }
164 if (inode->i_state & (I_FREEING|I_CLEAR))
165 goto out;
166 181
167 /* 182/*
168 * If the inode was already on s_dirty/s_io/s_more_io, don't 183 * Used for on-stack allocated work items. The caller needs to wait until
169 * reposition it (that would break s_dirty time-ordering). 184 * the wb threads have acked the work before it's safe to continue.
170 */ 185 */
171 if (!was_dirty) { 186static void bdi_wait_on_work_clear(struct bdi_work *work)
172 inode->dirtied_when = jiffies; 187{
173 list_move(&inode->i_list, &sb->s_dirty); 188 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
174 } 189 TASK_UNINTERRUPTIBLE);
190}
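Both halves of the on-stack work handshake side by side (all names from this patch): the submitter sleeps on the WS_USED_B bit; the flusher clears it with a barrier and wakes the bit waitqueue, so the waiter cannot observe a stale flag.

	/* submitter side, e.g. bdi_sync_writeback() */
	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
		    TASK_UNINTERRUPTIBLE);

	/* flusher side, in bdi_work_clear() */
	clear_bit(WS_USED_B, &work->state);
	smp_mb__after_clear_bit();
	wake_up_bit(&work->state, WS_USED_B);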
191
192static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
193 struct wb_writeback_args *args)
194{
195 struct bdi_work *work;
196
197 /*
198 * This is WB_SYNC_NONE writeback, so if allocation fails just
199 * wakeup the thread for old dirty data writeback
200 */
201 work = kmalloc(sizeof(*work), GFP_ATOMIC);
202 if (work) {
203 bdi_work_init(work, args);
204 bdi_queue_work(bdi, work);
205 } else {
206 struct bdi_writeback *wb = &bdi->wb;
207
208 if (wb->task)
209 wake_up_process(wb->task);
175 } 210 }
176out:
177 spin_unlock(&inode_lock);
178} 211}
179 212
180EXPORT_SYMBOL(__mark_inode_dirty); 213/**
214 * bdi_sync_writeback - start and wait for writeback
215 * @bdi: the backing device to write from
216 * @sb: write inodes from this super_block
217 *
218 * Description:
219 * This does WB_SYNC_ALL data integrity writeback and waits for the
220 * IO to complete. Callers must hold the sb s_umount semaphore for
221 * reading, to avoid having the super disappear before we are done.
222 */
223static void bdi_sync_writeback(struct backing_dev_info *bdi,
224 struct super_block *sb)
225{
226 struct wb_writeback_args args = {
227 .sb = sb,
228 .sync_mode = WB_SYNC_ALL,
229 .nr_pages = LONG_MAX,
230 .range_cyclic = 0,
231 };
232 struct bdi_work work;
181 233
182static int write_inode(struct inode *inode, int sync) 234 bdi_work_init(&work, &args);
235 work.state |= WS_ONSTACK;
236
237 bdi_queue_work(bdi, &work);
238 bdi_wait_on_work_clear(&work);
239}
240
241/**
242 * bdi_start_writeback - start writeback
243 * @bdi: the backing device to write from
244 * @nr_pages: the number of pages to write
245 *
246 * Description:
247 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
 248 * started when this function returns; we make no guarantees on
 249 * completion. The caller need not hold the sb s_umount semaphore.
250 *
251 */
252void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
183{ 253{
184 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 254 struct wb_writeback_args args = {
185 return inode->i_sb->s_op->write_inode(inode, sync); 255 .sync_mode = WB_SYNC_NONE,
186 return 0; 256 .nr_pages = nr_pages,
257 .range_cyclic = 1,
258 };
259
260 bdi_alloc_queue_work(bdi, &args);
187} 261}
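A sketch of assumed call sites contrasting the two entry points; bdi_sync_writeback() is static here, so this is illustrative only:

	bdi_start_writeback(bdi, nr_pages);	/* WB_SYNC_NONE: queue and return */
	bdi_sync_writeback(bdi, sb);		/* WB_SYNC_ALL: blocks until done;
						 * caller holds sb->s_umount */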
188 262
189/* 263/*
@@ -191,31 +265,32 @@ static int write_inode(struct inode *inode, int sync)
191 * furthest end of its superblock's dirty-inode list. 265 * furthest end of its superblock's dirty-inode list.
192 * 266 *
193 * Before stamping the inode's ->dirtied_when, we check to see whether it is 267 * Before stamping the inode's ->dirtied_when, we check to see whether it is
194 * already the most-recently-dirtied inode on the s_dirty list. If that is 268 * already the most-recently-dirtied inode on the b_dirty list. If that is
195 * the case then the inode must have been redirtied while it was being written 269 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 270 * out and we don't reset its dirtied_when.
197 */ 271 */
198static void redirty_tail(struct inode *inode) 272static void redirty_tail(struct inode *inode)
199{ 273{
200 struct super_block *sb = inode->i_sb; 274 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
201 275
202 if (!list_empty(&sb->s_dirty)) { 276 if (!list_empty(&wb->b_dirty)) {
203 struct inode *tail_inode; 277 struct inode *tail;
204 278
205 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 279 tail = list_entry(wb->b_dirty.next, struct inode, i_list);
206 if (time_before(inode->dirtied_when, 280 if (time_before(inode->dirtied_when, tail->dirtied_when))
207 tail_inode->dirtied_when))
208 inode->dirtied_when = jiffies; 281 inode->dirtied_when = jiffies;
209 } 282 }
210 list_move(&inode->i_list, &sb->s_dirty); 283 list_move(&inode->i_list, &wb->b_dirty);
211} 284}
212 285
213/* 286/*
214 * requeue inode for re-scanning after sb->s_io list is exhausted. 287 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 288 */
216static void requeue_io(struct inode *inode) 289static void requeue_io(struct inode *inode)
217{ 290{
218 list_move(&inode->i_list, &inode->i_sb->s_more_io); 291 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
292
293 list_move(&inode->i_list, &wb->b_more_io);
219} 294}
220 295
221static void inode_sync_complete(struct inode *inode) 296static void inode_sync_complete(struct inode *inode)
@@ -262,20 +337,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
262/* 337/*
263 * Queue all expired dirty inodes for io, eldest first. 338 * Queue all expired dirty inodes for io, eldest first.
264 */ 339 */
265static void queue_io(struct super_block *sb, 340static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
266 unsigned long *older_than_this)
267{ 341{
268 list_splice_init(&sb->s_more_io, sb->s_io.prev); 342 list_splice_init(&wb->b_more_io, wb->b_io.prev);
269 move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); 343 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
270} 344}
271 345
272int sb_has_dirty_inodes(struct super_block *sb) 346static int write_inode(struct inode *inode, int sync)
273{ 347{
274 return !list_empty(&sb->s_dirty) || 348 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
275 !list_empty(&sb->s_io) || 349 return inode->i_sb->s_op->write_inode(inode, sync);
276 !list_empty(&sb->s_more_io); 350 return 0;
277} 351}
278EXPORT_SYMBOL(sb_has_dirty_inodes);
279 352
280/* 353/*
281 * Wait for writeback on an inode to complete. 354 * Wait for writeback on an inode to complete.
@@ -322,11 +395,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
322 if (inode->i_state & I_SYNC) { 395 if (inode->i_state & I_SYNC) {
323 /* 396 /*
324 * If this inode is locked for writeback and we are not doing 397 * If this inode is locked for writeback and we are not doing
325 * writeback-for-data-integrity, move it to s_more_io so that 398 * writeback-for-data-integrity, move it to b_more_io so that
 326 * writeback can proceed with the other inodes on s_io. 399 * writeback can proceed with the other inodes on b_io.
327 * 400 *
328 * We'll have another go at writing back this inode when we 401 * We'll have another go at writing back this inode when we
329 * completed a full scan of s_io. 402 * completed a full scan of b_io.
330 */ 403 */
331 if (!wait) { 404 if (!wait) {
332 requeue_io(inode); 405 requeue_io(inode);
@@ -371,11 +444,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
371 /* 444 /*
372 * We didn't write back all the pages. nfs_writepages() 445 * We didn't write back all the pages. nfs_writepages()
373 * sometimes bales out without doing anything. Redirty 446 * sometimes bales out without doing anything. Redirty
374 * the inode; Move it from s_io onto s_more_io/s_dirty. 447 * the inode; Move it from b_io onto b_more_io/b_dirty.
375 */ 448 */
376 /* 449 /*
377 * akpm: if the caller was the kupdate function we put 450 * akpm: if the caller was the kupdate function we put
378 * this inode at the head of s_dirty so it gets first 451 * this inode at the head of b_dirty so it gets first
379 * consideration. Otherwise, move it to the tail, for 452 * consideration. Otherwise, move it to the tail, for
380 * the reasons described there. I'm not really sure 453 * the reasons described there. I'm not really sure
381 * how much sense this makes. Presumably I had a good 454 * how much sense this makes. Presumably I had a good
@@ -385,7 +458,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
385 if (wbc->for_kupdate) { 458 if (wbc->for_kupdate) {
386 /* 459 /*
387 * For the kupdate function we move the inode 460 * For the kupdate function we move the inode
388 * to s_more_io so it will get more writeout as 461 * to b_more_io so it will get more writeout as
389 * soon as the queue becomes uncongested. 462 * soon as the queue becomes uncongested.
390 */ 463 */
391 inode->i_state |= I_DIRTY_PAGES; 464 inode->i_state |= I_DIRTY_PAGES;
@@ -434,50 +507,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
434} 507}
435 508
436/* 509/*
437 * Write out a superblock's list of dirty inodes. A wait will be performed 510 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
438 * upon no inodes, all inodes or the final one, depending upon sync_mode. 511 * before calling writeback. So make sure that we do pin it, so it doesn't
439 * 512 * go away while we are writing inodes from it.
440 * If older_than_this is non-NULL, then only write out inodes which
441 * had their first dirtying at a time earlier than *older_than_this.
442 *
443 * If we're a pdflush thread, then implement pdflush collision avoidance
444 * against the entire list.
445 * 513 *
446 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 514 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
447 * This function assumes that the blockdev superblock's inodes are backed by 515 * 1 if we failed.
448 * a variety of queues, so all inodes are searched. For other superblocks,
449 * assume that all inodes are backed by the same queue.
450 *
451 * FIXME: this linear search could get expensive with many fileystems. But
452 * how to fix? We need to go from an address_space to all inodes which share
453 * a queue with that address_space. (Easy: have a global "dirty superblocks"
454 * list).
455 *
456 * The inodes to be written are parked on sb->s_io. They are moved back onto
457 * sb->s_dirty as they are selected for writing. This way, none can be missed
458 * on the writer throttling path, and we get decent balancing between many
459 * throttled threads: we don't want them all piling up on inode_sync_wait.
460 */ 516 */
461void generic_sync_sb_inodes(struct super_block *sb, 517static int pin_sb_for_writeback(struct writeback_control *wbc,
518 struct inode *inode)
519{
520 struct super_block *sb = inode->i_sb;
521
522 /*
523 * Caller must already hold the ref for this
524 */
525 if (wbc->sync_mode == WB_SYNC_ALL) {
526 WARN_ON(!rwsem_is_locked(&sb->s_umount));
527 return 0;
528 }
529
530 spin_lock(&sb_lock);
531 sb->s_count++;
532 if (down_read_trylock(&sb->s_umount)) {
533 if (sb->s_root) {
534 spin_unlock(&sb_lock);
535 return 0;
536 }
537 /*
538 * umounted, drop rwsem again and fall through to failure
539 */
540 up_read(&sb->s_umount);
541 }
542
543 sb->s_count--;
544 spin_unlock(&sb_lock);
545 return 1;
546}
547
548static void unpin_sb_for_writeback(struct writeback_control *wbc,
549 struct inode *inode)
550{
551 struct super_block *sb = inode->i_sb;
552
553 if (wbc->sync_mode == WB_SYNC_ALL)
554 return;
555
556 up_read(&sb->s_umount);
557 put_super(sb);
558}
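Usage pattern, mirroring the loop in writeback_inodes_wb() below: a failed pin means the superblock is being unmounted, so the inode is requeued rather than written.

	if (pin_sb_for_writeback(wbc, inode)) {
		requeue_io(inode);	/* umount in progress, retry later */
		continue;
	}
	writeback_single_inode(inode, wbc);
	unpin_sb_for_writeback(wbc, inode);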
559
560static void writeback_inodes_wb(struct bdi_writeback *wb,
462 struct writeback_control *wbc) 561 struct writeback_control *wbc)
463{ 562{
563 struct super_block *sb = wbc->sb;
564 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
464 const unsigned long start = jiffies; /* livelock avoidance */ 565 const unsigned long start = jiffies; /* livelock avoidance */
465 int sync = wbc->sync_mode == WB_SYNC_ALL;
466 566
467 spin_lock(&inode_lock); 567 spin_lock(&inode_lock);
468 if (!wbc->for_kupdate || list_empty(&sb->s_io))
469 queue_io(sb, wbc->older_than_this);
470 568
471 while (!list_empty(&sb->s_io)) { 569 if (!wbc->for_kupdate || list_empty(&wb->b_io))
472 struct inode *inode = list_entry(sb->s_io.prev, 570 queue_io(wb, wbc->older_than_this);
571
572 while (!list_empty(&wb->b_io)) {
573 struct inode *inode = list_entry(wb->b_io.prev,
473 struct inode, i_list); 574 struct inode, i_list);
474 struct address_space *mapping = inode->i_mapping;
475 struct backing_dev_info *bdi = mapping->backing_dev_info;
476 long pages_skipped; 575 long pages_skipped;
477 576
478 if (!bdi_cap_writeback_dirty(bdi)) { 577 /*
578 * super block given and doesn't match, skip this inode
579 */
580 if (sb && sb != inode->i_sb) {
581 redirty_tail(inode);
582 continue;
583 }
584
585 if (!bdi_cap_writeback_dirty(wb->bdi)) {
479 redirty_tail(inode); 586 redirty_tail(inode);
480 if (sb_is_blkdev_sb(sb)) { 587 if (is_blkdev_sb) {
481 /* 588 /*
482 * Dirty memory-backed blockdev: the ramdisk 589 * Dirty memory-backed blockdev: the ramdisk
483 * driver does this. Skip just this inode 590 * driver does this. Skip just this inode
@@ -497,21 +604,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
497 continue; 604 continue;
498 } 605 }
499 606
500 if (wbc->nonblocking && bdi_write_congested(bdi)) { 607 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
501 wbc->encountered_congestion = 1; 608 wbc->encountered_congestion = 1;
502 if (!sb_is_blkdev_sb(sb)) 609 if (!is_blkdev_sb)
503 break; /* Skip a congested fs */ 610 break; /* Skip a congested fs */
504 requeue_io(inode); 611 requeue_io(inode);
505 continue; /* Skip a congested blockdev */ 612 continue; /* Skip a congested blockdev */
506 } 613 }
507 614
508 if (wbc->bdi && bdi != wbc->bdi) {
509 if (!sb_is_blkdev_sb(sb))
510 break; /* fs has the wrong queue */
511 requeue_io(inode);
512 continue; /* blockdev has wrong queue */
513 }
514
515 /* 615 /*
516 * Was this inode dirtied after sync_sb_inodes was called? 616 * Was this inode dirtied after sync_sb_inodes was called?
517 * This keeps sync from extra jobs and livelock. 617 * This keeps sync from extra jobs and livelock.
@@ -519,16 +619,16 @@ void generic_sync_sb_inodes(struct super_block *sb,
519 if (inode_dirtied_after(inode, start)) 619 if (inode_dirtied_after(inode, start))
520 break; 620 break;
521 621
522 /* Is another pdflush already flushing this queue? */ 622 if (pin_sb_for_writeback(wbc, inode)) {
523 if (current_is_pdflush() && !writeback_acquire(bdi)) 623 requeue_io(inode);
524 break; 624 continue;
625 }
525 626
526 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 627 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
527 __iget(inode); 628 __iget(inode);
528 pages_skipped = wbc->pages_skipped; 629 pages_skipped = wbc->pages_skipped;
529 writeback_single_inode(inode, wbc); 630 writeback_single_inode(inode, wbc);
530 if (current_is_pdflush()) 631 unpin_sb_for_writeback(wbc, inode);
531 writeback_release(bdi);
532 if (wbc->pages_skipped != pages_skipped) { 632 if (wbc->pages_skipped != pages_skipped) {
533 /* 633 /*
534 * writeback is not making progress due to locked 634 * writeback is not making progress due to locked
@@ -544,144 +644,520 @@ void generic_sync_sb_inodes(struct super_block *sb,
544 wbc->more_io = 1; 644 wbc->more_io = 1;
545 break; 645 break;
546 } 646 }
547 if (!list_empty(&sb->s_more_io)) 647 if (!list_empty(&wb->b_more_io))
548 wbc->more_io = 1; 648 wbc->more_io = 1;
549 } 649 }
550 650
551 if (sync) { 651 spin_unlock(&inode_lock);
552 struct inode *inode, *old_inode = NULL; 652 /* Leave any unwritten inodes on b_io */
653}
654
655void writeback_inodes_wbc(struct writeback_control *wbc)
656{
657 struct backing_dev_info *bdi = wbc->bdi;
553 658
659 writeback_inodes_wb(&bdi->wb, wbc);
660}
661
662/*
663 * The maximum number of pages to writeout in a single bdi flush/kupdate
664 * operation. We do this so we don't hold I_SYNC against an inode for
665 * enormous amounts of time, which would block a userspace task which has
666 * been forced to throttle against that inode. Also, the code reevaluates
 667 * the dirty thresholds each time it has written this many pages.
668 */
669#define MAX_WRITEBACK_PAGES 1024
670
671static inline bool over_bground_thresh(void)
672{
673 unsigned long background_thresh, dirty_thresh;
674
675 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
676
677 return (global_page_state(NR_FILE_DIRTY) +
678 global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
679}
680
681/*
682 * Explicit flushing or periodic writeback of "old" data.
683 *
684 * Define "old": the first time one of an inode's pages is dirtied, we mark the
685 * dirtying-time in the inode's address_space. So this periodic writeback code
686 * just walks the superblock inode list, writing back any inodes which are
687 * older than a specific point in time.
688 *
689 * Try to run once per dirty_writeback_interval. But if a writeback event
 690 * takes longer than a dirty_writeback_interval, then leave a
691 * one-second gap.
692 *
693 * older_than_this takes precedence over nr_to_write. So we'll only write back
694 * all dirty pages if they are all attached to "old" mappings.
695 */
696static long wb_writeback(struct bdi_writeback *wb,
697 struct wb_writeback_args *args)
698{
699 struct writeback_control wbc = {
700 .bdi = wb->bdi,
701 .sb = args->sb,
702 .sync_mode = args->sync_mode,
703 .older_than_this = NULL,
704 .for_kupdate = args->for_kupdate,
705 .range_cyclic = args->range_cyclic,
706 };
707 unsigned long oldest_jif;
708 long wrote = 0;
709
710 if (wbc.for_kupdate) {
711 wbc.older_than_this = &oldest_jif;
712 oldest_jif = jiffies -
713 msecs_to_jiffies(dirty_expire_interval * 10);
714 }
715 if (!wbc.range_cyclic) {
716 wbc.range_start = 0;
717 wbc.range_end = LLONG_MAX;
718 }
719
720 for (;;) {
554 /* 721 /*
555 * Data integrity sync. Must wait for all pages under writeback, 722 * Don't flush anything for non-integrity writeback where
556 * because there may have been pages dirtied before our sync 723 * no nr_pages was given
557 * call, but which had writeout started before we write it out.
558 * In which case, the inode may not be on the dirty list, but
559 * we still have to wait for that writeout.
560 */ 724 */
561 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 725 if (!args->for_kupdate && args->nr_pages <= 0 &&
562 struct address_space *mapping; 726 args->sync_mode == WB_SYNC_NONE)
727 break;
563 728
564 if (inode->i_state & 729 /*
565 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 730 * If no specific pages were given and this is just a
566 continue; 731 * periodic background writeout and we are below the
567 mapping = inode->i_mapping; 732 * background dirty threshold, don't do anything
568 if (mapping->nrpages == 0) 733 */
734 if (args->for_kupdate && args->nr_pages <= 0 &&
735 !over_bground_thresh())
736 break;
737
738 wbc.more_io = 0;
739 wbc.encountered_congestion = 0;
740 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
741 wbc.pages_skipped = 0;
742 writeback_inodes_wb(wb, &wbc);
743 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
744 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
745
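	/*
	 * (Editorial worked example, not part of the patch.) With
	 * MAX_WRITEBACK_PAGES == 1024, a writeback_inodes_wb() pass that
	 * ends with wbc.nr_to_write == 200 wrote 1024 - 200 = 824 pages:
	 * wrote += 824 and args->nr_pages -= 824. A pass that leaves
	 * nr_to_write > 0 found nothing more to write and exits the loop
	 * below unless wbc.more_io asks for another round.
	 */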
746 /*
747 * If we ran out of stuff to write, bail unless more_io got set
748 */
749 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
750 if (wbc.more_io && !wbc.for_kupdate)
569 continue; 751 continue;
570 __iget(inode); 752 break;
571 spin_unlock(&inode_lock); 753 }
572 /* 754 }
573 * We hold a reference to 'inode' so it couldn't have 755
574 * been removed from s_inodes list while we dropped the 756 return wrote;
575 * inode_lock. We cannot iput the inode now as we can 757}
576 * be holding the last reference and we cannot iput it 758
577 * under inode_lock. So we keep the reference and iput 759/*
578 * it later. 760 * Return the next bdi_work struct that hasn't been processed by this
579 */ 761 * wb thread yet. ->seen is initially set for each thread that exists
580 iput(old_inode); 762 * for this device, when a thread first notices a piece of work it
581 old_inode = inode; 763 * clears its bit. Depending on writeback type, the thread will notify
764 * completion on either receiving the work (WB_SYNC_NONE) or after
765 * it is done (WB_SYNC_ALL).
766 */
767static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
768 struct bdi_writeback *wb)
769{
770 struct bdi_work *work, *ret = NULL;
771
772 rcu_read_lock();
773
774 list_for_each_entry_rcu(work, &bdi->work_list, list) {
775 if (!test_bit(wb->nr, &work->seen))
776 continue;
777 clear_bit(wb->nr, &work->seen);
778
779 ret = work;
780 break;
781 }
782
783 rcu_read_unlock();
784 return ret;
785}
786
787static long wb_check_old_data_flush(struct bdi_writeback *wb)
788{
789 unsigned long expired;
790 long nr_pages;
791
792 expired = wb->last_old_flush +
793 msecs_to_jiffies(dirty_writeback_interval * 10);
794 if (time_before(jiffies, expired))
795 return 0;
796
797 wb->last_old_flush = jiffies;
798 nr_pages = global_page_state(NR_FILE_DIRTY) +
799 global_page_state(NR_UNSTABLE_NFS) +
800 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
801
802 if (nr_pages) {
803 struct wb_writeback_args args = {
804 .nr_pages = nr_pages,
805 .sync_mode = WB_SYNC_NONE,
806 .for_kupdate = 1,
807 .range_cyclic = 1,
808 };
809
810 return wb_writeback(wb, &args);
811 }
812
813 return 0;
814}
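The "* 10" above is a unit conversion, not a fudge factor: dirty_writeback_interval is stored in centiseconds (exposed as the vm.dirty_writeback_centisecs sysctl), so multiplying by 10 yields milliseconds for msecs_to_jiffies():

	/* 500 centisecs (the default) -> 5000 ms -> jiffies */
	unsigned long interval = msecs_to_jiffies(dirty_writeback_interval * 10);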
815
816/*
817 * Retrieve work items and do the writeback they describe
818 */
819long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
820{
821 struct backing_dev_info *bdi = wb->bdi;
822 struct bdi_work *work;
823 long wrote = 0;
582 824
583 filemap_fdatawait(mapping); 825 while ((work = get_next_work_item(bdi, wb)) != NULL) {
826 struct wb_writeback_args args = work->args;
584 827
585 cond_resched(); 828 /*
829 * Override sync mode, in case we must wait for completion
830 */
831 if (force_wait)
832 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
586 833
587 spin_lock(&inode_lock); 834 /*
835 * If this isn't a data integrity operation, just notify
836 * that we have seen this work and we are now starting it.
837 */
838 if (args.sync_mode == WB_SYNC_NONE)
839 wb_clear_pending(wb, work);
840
841 wrote += wb_writeback(wb, &args);
842
843 /*
844 * This is a data integrity writeback, so only do the
845 * notification when we have completed the work.
846 */
847 if (args.sync_mode == WB_SYNC_ALL)
848 wb_clear_pending(wb, work);
849 }
850
851 /*
852 * Check for periodic writeback, kupdated() style
853 */
854 wrote += wb_check_old_data_flush(wb);
855
856 return wrote;
857}
858
859/*
860 * Handle writeback of dirty data for the device backed by this bdi. Also
861 * wakes up periodically and does kupdated style flushing.
862 */
863int bdi_writeback_task(struct bdi_writeback *wb)
864{
865 unsigned long last_active = jiffies;
866 unsigned long wait_jiffies = -1UL;
867 long pages_written;
868
869 while (!kthread_should_stop()) {
870 pages_written = wb_do_writeback(wb, 0);
871
872 if (pages_written)
873 last_active = jiffies;
874 else if (wait_jiffies != -1UL) {
875 unsigned long max_idle;
876
877 /*
878 * Longest period of inactivity that we tolerate. If we
879 * see dirty data again later, the task will get
880 * recreated automatically.
881 */
882 max_idle = max(5UL * 60 * HZ, wait_jiffies);
883 if (time_after(jiffies, max_idle + last_active))
884 break;
588 } 885 }
589 spin_unlock(&inode_lock);
590 iput(old_inode);
591 } else
592 spin_unlock(&inode_lock);
593 886
594 return; /* Leave any unwritten inodes on s_io */ 887 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
888 schedule_timeout_interruptible(wait_jiffies);
889 try_to_freeze();
890 }
891
892 return 0;
595} 893}
596EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
597 894
598static void sync_sb_inodes(struct super_block *sb, 895/*
599 struct writeback_control *wbc) 896 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
897 * writeback, for integrity writeback see bdi_sync_writeback().
898 */
899static void bdi_writeback_all(struct super_block *sb, long nr_pages)
600{ 900{
601 generic_sync_sb_inodes(sb, wbc); 901 struct wb_writeback_args args = {
902 .sb = sb,
903 .nr_pages = nr_pages,
904 .sync_mode = WB_SYNC_NONE,
905 };
906 struct backing_dev_info *bdi;
907
908 rcu_read_lock();
909
910 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
911 if (!bdi_has_dirty_io(bdi))
912 continue;
913
914 bdi_alloc_queue_work(bdi, &args);
915 }
916
917 rcu_read_unlock();
602} 918}
603 919
604/* 920/*
605 * Start writeback of dirty pagecache data against all unlocked inodes. 921 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
922 * the whole world.
923 */
924void wakeup_flusher_threads(long nr_pages)
925{
926 if (nr_pages == 0)
927 nr_pages = global_page_state(NR_FILE_DIRTY) +
928 global_page_state(NR_UNSTABLE_NFS);
929 bdi_writeback_all(NULL, nr_pages);
930}
931
932static noinline void block_dump___mark_inode_dirty(struct inode *inode)
933{
934 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
935 struct dentry *dentry;
936 const char *name = "?";
937
938 dentry = d_find_alias(inode);
939 if (dentry) {
940 spin_lock(&dentry->d_lock);
941 name = (const char *) dentry->d_name.name;
942 }
943 printk(KERN_DEBUG
944 "%s(%d): dirtied inode %lu (%s) on %s\n",
945 current->comm, task_pid_nr(current), inode->i_ino,
946 name, inode->i_sb->s_id);
947 if (dentry) {
948 spin_unlock(&dentry->d_lock);
949 dput(dentry);
950 }
951 }
952}
953
954/**
955 * __mark_inode_dirty - internal function
956 * @inode: inode to mark
957 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
958 * Mark an inode as dirty. Callers should use mark_inode_dirty or
959 * mark_inode_dirty_sync.
960 *
961 * Put the inode on the super block's dirty list.
606 * 962 *
607 * Note: 963 * CAREFUL! We mark it dirty unconditionally, but move it onto the
608 * We don't need to grab a reference to superblock here. If it has non-empty 964 * dirty list only if it is hashed or if it refers to a blockdev.
 609 * ->s_dirty it hadn't been killed yet and kill_super() won't proceed 965 * If it was not hashed, it will never be added to the dirty list
610 * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all 966 * even if it is later hashed, as it will have been marked dirty already.
611 * empty. Since __sync_single_inode() regains inode_lock before it finally moves
612 * inode from superblock lists we are OK.
613 * 967 *
614 * If `older_than_this' is non-zero then only flush inodes which have a 968 * In short, make sure you hash any inodes _before_ you start marking
615 * flushtime older than *older_than_this. 969 * them dirty.
616 * 970 *
617 * If `bdi' is non-zero then we will scan the first inode against each 971 * This function *must* be atomic for the I_DIRTY_PAGES case -
618 * superblock until we find the matching ones. One group will be the dirty 972 * set_page_dirty() is called under spinlock in several places.
619 * inodes against a filesystem. Then when we hit the dummy blockdev superblock, 973 *
 620 * sync_sb_inodes will seek out the blockdev which matches `bdi'. Maybe not 974 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
621 * super-efficient but we're about to do a ton of I/O... 975 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
976 * the kernel-internal blockdev inode represents the dirtying time of the
977 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
978 * page->mapping->host, so the page-dirtying time is recorded in the internal
979 * blockdev inode.
622 */ 980 */
623void 981void __mark_inode_dirty(struct inode *inode, int flags)
624writeback_inodes(struct writeback_control *wbc)
625{ 982{
626 struct super_block *sb; 983 struct super_block *sb = inode->i_sb;
627 984
628 might_sleep(); 985 /*
629 spin_lock(&sb_lock); 986 * Don't do this for I_DIRTY_PAGES - that doesn't actually
630restart: 987 * dirty the inode itself
631 list_for_each_entry_reverse(sb, &super_blocks, s_list) { 988 */
632 if (sb_has_dirty_inodes(sb)) { 989 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
633 /* we're making our own get_super here */ 990 if (sb->s_op->dirty_inode)
634 sb->s_count++; 991 sb->s_op->dirty_inode(inode);
635 spin_unlock(&sb_lock); 992 }
636 /* 993
637 * If we can't get the readlock, there's no sense in 994 /*
638 * waiting around, most of the time the FS is going to 995 * make sure that changes are seen by all cpus before we test i_state
639 * be unmounted by the time it is released. 996 * -- mikulas
640 */ 997 */
641 if (down_read_trylock(&sb->s_umount)) { 998 smp_mb();
642 if (sb->s_root) 999
643 sync_sb_inodes(sb, wbc); 1000 /* avoid the locking if we can */
644 up_read(&sb->s_umount); 1001 if ((inode->i_state & flags) == flags)
1002 return;
1003
1004 if (unlikely(block_dump))
1005 block_dump___mark_inode_dirty(inode);
1006
1007 spin_lock(&inode_lock);
1008 if ((inode->i_state & flags) != flags) {
1009 const int was_dirty = inode->i_state & I_DIRTY;
1010
1011 inode->i_state |= flags;
1012
1013 /*
1014 * If the inode is being synced, just update its dirty state.
1015 * The unlocker will place the inode on the appropriate
1016 * superblock list, based upon its state.
1017 */
1018 if (inode->i_state & I_SYNC)
1019 goto out;
1020
1021 /*
1022 * Only add valid (hashed) inodes to the superblock's
1023 * dirty list. Add blockdev inodes as well.
1024 */
1025 if (!S_ISBLK(inode->i_mode)) {
1026 if (hlist_unhashed(&inode->i_hash))
1027 goto out;
1028 }
1029 if (inode->i_state & (I_FREEING|I_CLEAR))
1030 goto out;
1031
1032 /*
1033 * If the inode was already on b_dirty/b_io/b_more_io, don't
1034 * reposition it (that would break b_dirty time-ordering).
1035 */
1036 if (!was_dirty) {
1037 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1038 struct backing_dev_info *bdi = wb->bdi;
1039
1040 if (bdi_cap_writeback_dirty(bdi) &&
1041 !test_bit(BDI_registered, &bdi->state)) {
1042 WARN_ON(1);
1043 printk(KERN_ERR "bdi-%s not registered\n",
1044 bdi->name);
645 } 1045 }
646 spin_lock(&sb_lock); 1046
647 if (__put_super_and_need_restart(sb)) 1047 inode->dirtied_when = jiffies;
648 goto restart; 1048 list_move(&inode->i_list, &wb->b_dirty);
649 } 1049 }
650 if (wbc->nr_to_write <= 0)
651 break;
652 } 1050 }
653 spin_unlock(&sb_lock); 1051out:
1052 spin_unlock(&inode_lock);
654} 1053}
1054EXPORT_SYMBOL(__mark_inode_dirty);
655 1055
656/* 1056/*
657 * writeback and wait upon the filesystem's dirty inodes. The caller will 1057 * Write out a superblock's list of dirty inodes. A wait will be performed
658 * do this in two passes - one to write, and one to wait. 1058 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1059 *
1060 * If older_than_this is non-NULL, then only write out inodes which
1061 * had their first dirtying at a time earlier than *older_than_this.
1062 *
 1063 * If we're a pdflush thread, then implement pdflush collision avoidance
1064 * against the entire list.
659 * 1065 *
660 * A finite limit is set on the number of pages which will be written. 1066 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
661 * To prevent infinite livelock of sys_sync(). 1067 * This function assumes that the blockdev superblock's inodes are backed by
1068 * a variety of queues, so all inodes are searched. For other superblocks,
1069 * assume that all inodes are backed by the same queue.
662 * 1070 *
663 * We add in the number of potentially dirty inodes, because each inode write 1071 * The inodes to be written are parked on bdi->b_io. They are moved back onto
664 * can dirty pagecache in the underlying blockdev. 1072 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1073 * on the writer throttling path, and we get decent balancing between many
1074 * throttled threads: we don't want them all piling up on inode_sync_wait.
665 */ 1075 */
666void sync_inodes_sb(struct super_block *sb, int wait) 1076static void wait_sb_inodes(struct super_block *sb)
667{ 1077{
668 struct writeback_control wbc = { 1078 struct inode *inode, *old_inode = NULL;
669 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 1079
670 .range_start = 0, 1080 /*
671 .range_end = LLONG_MAX, 1081 * We need to be protected against the filesystem going from
672 }; 1082 * r/o to r/w or vice versa.
1083 */
1084 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1085
1086 spin_lock(&inode_lock);
1087
1088 /*
1089 * Data integrity sync. Must wait for all pages under writeback,
1090 * because there may have been pages dirtied before our sync
1091 * call, but which had writeout started before we write it out.
1092 * In which case, the inode may not be on the dirty list, but
1093 * we still have to wait for that writeout.
1094 */
1095 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1096 struct address_space *mapping;
1097
1098 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
1099 continue;
1100 mapping = inode->i_mapping;
1101 if (mapping->nrpages == 0)
1102 continue;
1103 __iget(inode);
1104 spin_unlock(&inode_lock);
1105 /*
1106 * We hold a reference to 'inode' so it couldn't have
1107 * been removed from s_inodes list while we dropped the
1108 * inode_lock. We cannot iput the inode now as we can
1109 * be holding the last reference and we cannot iput it
1110 * under inode_lock. So we keep the reference and iput
1111 * it later.
1112 */
1113 iput(old_inode);
1114 old_inode = inode;
1115
1116 filemap_fdatawait(mapping);
1117
1118 cond_resched();
673 1119
674 if (!wait) { 1120 spin_lock(&inode_lock);
675 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1121 }
676 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1122 spin_unlock(&inode_lock);
1123 iput(old_inode);
1124}
677 1125
678 wbc.nr_to_write = nr_dirty + nr_unstable + 1126/**
1127 * writeback_inodes_sb - writeback dirty inodes from given super_block
1128 * @sb: the superblock
1129 *
1130 * Start writeback on some inodes on this super_block. No guarantees are made
1131 * on how many (if any) will be written, and this function does not wait
 1132 * for IO completion of submitted IO. The function returns nothing;
 1133 * it only queues the writeback work.
1134 */
1135void writeback_inodes_sb(struct super_block *sb)
1136{
1137 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1138 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1139 long nr_to_write;
1140
1141 nr_to_write = nr_dirty + nr_unstable +
679 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1142 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
680 } else
681 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
682 1143
683 sync_sb_inodes(sb, &wbc); 1144 bdi_writeback_all(sb, nr_to_write);
1145}
1146EXPORT_SYMBOL(writeback_inodes_sb);
1147
1148/**
1149 * sync_inodes_sb - sync sb inode pages
1150 * @sb: the superblock
1151 *
1152 * This function writes and waits on any dirty inode belonging to this
 1153 * super_block, and does not return until that IO has completed.
1154 */
1155void sync_inodes_sb(struct super_block *sb)
1156{
1157 bdi_sync_writeback(sb->s_bdi, sb);
1158 wait_sb_inodes(sb);
684} 1159}
1160EXPORT_SYMBOL(sync_inodes_sb);
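An assumed caller of the integrity path, for illustration; wait_sb_inodes() warns unless s_umount is held for reading, so the sequence is:

	down_read(&sb->s_umount);
	sync_inodes_sb(sb);	/* push all dirty inodes, then wait on pages */
	up_read(&sb->s_umount);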
685 1161
686/** 1162/**
687 * write_inode_now - write an inode to disk 1163 * write_inode_now - write an inode to disk
@@ -737,57 +1213,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
737 return ret; 1213 return ret;
738} 1214}
739EXPORT_SYMBOL(sync_inode); 1215EXPORT_SYMBOL(sync_inode);
740
741/**
742 * generic_osync_inode - flush all dirty data for a given inode to disk
743 * @inode: inode to write
744 * @mapping: the address_space that should be flushed
745 * @what: what to write and wait upon
746 *
747 * This can be called by file_write functions for files which have the
748 * O_SYNC flag set, to flush dirty writes to disk.
749 *
750 * @what is a bitmask, specifying which part of the inode's data should be
751 * written and waited upon.
752 *
753 * OSYNC_DATA: i_mapping's dirty data
754 * OSYNC_METADATA: the buffers at i_mapping->private_list
755 * OSYNC_INODE: the inode itself
756 */
757
758int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
759{
760 int err = 0;
761 int need_write_inode_now = 0;
762 int err2;
763
764 if (what & OSYNC_DATA)
765 err = filemap_fdatawrite(mapping);
766 if (what & (OSYNC_METADATA|OSYNC_DATA)) {
767 err2 = sync_mapping_buffers(mapping);
768 if (!err)
769 err = err2;
770 }
771 if (what & OSYNC_DATA) {
772 err2 = filemap_fdatawait(mapping);
773 if (!err)
774 err = err2;
775 }
776
777 spin_lock(&inode_lock);
778 if ((inode->i_state & I_DIRTY) &&
779 ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
780 need_write_inode_now = 1;
781 spin_unlock(&inode_lock);
782
783 if (need_write_inode_now) {
784 err2 = write_inode_now(inode, 1);
785 if (!err)
786 err = err2;
787 }
788 else
789 inode_sync_wait(inode);
790
791 return err;
792}
793EXPORT_SYMBOL(generic_osync_inode);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb0373..3773fd63d2f9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
61 return simple_read_from_buffer(buf, len, ppos, tmp, size); 61 return simple_read_from_buffer(buf, len, ppos, tmp, size);
62} 62}
63 63
64static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
65 size_t len, loff_t *ppos, unsigned val)
66{
67 char tmp[32];
68 size_t size = sprintf(tmp, "%u\n", val);
69
70 return simple_read_from_buffer(buf, len, ppos, tmp, size);
71}
72
73static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
74 size_t count, loff_t *ppos, unsigned *val,
75 unsigned global_limit)
76{
77 unsigned long t;
78 char tmp[32];
79 unsigned limit = (1 << 16) - 1;
80 int err;
81
82 if (*ppos || count >= sizeof(tmp) - 1)
83 return -EINVAL;
84
85 if (copy_from_user(tmp, buf, count))
86 return -EINVAL;
87
88 tmp[count] = '\0';
89
90 err = strict_strtoul(tmp, 0, &t);
91 if (err)
92 return err;
93
94 if (!capable(CAP_SYS_ADMIN))
95 limit = min(limit, global_limit);
96
97 if (t > limit)
98 return -EINVAL;
99
100 *val = t;
101
102 return count;
103}
104
105static ssize_t fuse_conn_max_background_read(struct file *file,
106 char __user *buf, size_t len,
107 loff_t *ppos)
108{
109 struct fuse_conn *fc;
110 unsigned val;
111
112 fc = fuse_ctl_file_conn_get(file);
113 if (!fc)
114 return 0;
115
116 val = fc->max_background;
117 fuse_conn_put(fc);
118
119 return fuse_conn_limit_read(file, buf, len, ppos, val);
120}
121
122static ssize_t fuse_conn_max_background_write(struct file *file,
123 const char __user *buf,
124 size_t count, loff_t *ppos)
125{
126 unsigned val;
127 ssize_t ret;
128
129 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
130 max_user_bgreq);
131 if (ret > 0) {
132 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
133 if (fc) {
134 fc->max_background = val;
135 fuse_conn_put(fc);
136 }
137 }
138
139 return ret;
140}
141
142static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
143 char __user *buf, size_t len,
144 loff_t *ppos)
145{
146 struct fuse_conn *fc;
147 unsigned val;
148
149 fc = fuse_ctl_file_conn_get(file);
150 if (!fc)
151 return 0;
152
153 val = fc->congestion_threshold;
154 fuse_conn_put(fc);
155
156 return fuse_conn_limit_read(file, buf, len, ppos, val);
157}
158
159static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
160 const char __user *buf,
161 size_t count, loff_t *ppos)
162{
163 unsigned val;
164 ssize_t ret;
165
166 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
167 max_user_congthresh);
168 if (ret > 0) {
169 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
170 if (fc) {
171 fc->congestion_threshold = val;
172 fuse_conn_put(fc);
173 }
174 }
175
176 return ret;
177}
178
64static const struct file_operations fuse_ctl_abort_ops = { 179static const struct file_operations fuse_ctl_abort_ops = {
65 .open = nonseekable_open, 180 .open = nonseekable_open,
66 .write = fuse_conn_abort_write, 181 .write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
71 .read = fuse_conn_waiting_read, 186 .read = fuse_conn_waiting_read,
72}; 187};
73 188
189static const struct file_operations fuse_conn_max_background_ops = {
190 .open = nonseekable_open,
191 .read = fuse_conn_max_background_read,
192 .write = fuse_conn_max_background_write,
193};
194
195static const struct file_operations fuse_conn_congestion_threshold_ops = {
196 .open = nonseekable_open,
197 .read = fuse_conn_congestion_threshold_read,
198 .write = fuse_conn_congestion_threshold_write,
199};
200
74static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, 201static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
75 struct fuse_conn *fc, 202 struct fuse_conn *fc,
76 const char *name, 203 const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
127 goto err; 254 goto err;
128 255
129 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, 256 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
130 NULL, &fuse_ctl_waiting_ops) || 257 NULL, &fuse_ctl_waiting_ops) ||
131 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, 258 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
132 NULL, &fuse_ctl_abort_ops)) 259 NULL, &fuse_ctl_abort_ops) ||
260 !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
261 1, NULL, &fuse_conn_max_background_ops) ||
262 !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
263 S_IFREG | 0600, 1, NULL,
264 &fuse_conn_congestion_threshold_ops))
133 goto err; 265 goto err;
134 266
135 return 0; 267 return 0;
@@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
156 d_drop(dentry); 288 d_drop(dentry);
157 dput(dentry); 289 dput(dentry);
158 } 290 }
159 fuse_control_sb->s_root->d_inode->i_nlink--; 291 drop_nlink(fuse_control_sb->s_root->d_inode);
160} 292}
161 293
162static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) 294static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6484eb75acd6..51d9e33d634f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
250 250
251static void flush_bg_queue(struct fuse_conn *fc) 251static void flush_bg_queue(struct fuse_conn *fc)
252{ 252{
253 while (fc->active_background < FUSE_MAX_BACKGROUND && 253 while (fc->active_background < fc->max_background &&
254 !list_empty(&fc->bg_queue)) { 254 !list_empty(&fc->bg_queue)) {
255 struct fuse_req *req; 255 struct fuse_req *req;
256 256
@@ -280,11 +280,11 @@ __releases(&fc->lock)
280 list_del(&req->intr_entry); 280 list_del(&req->intr_entry);
281 req->state = FUSE_REQ_FINISHED; 281 req->state = FUSE_REQ_FINISHED;
282 if (req->background) { 282 if (req->background) {
283 if (fc->num_background == FUSE_MAX_BACKGROUND) { 283 if (fc->num_background == fc->max_background) {
284 fc->blocked = 0; 284 fc->blocked = 0;
285 wake_up_all(&fc->blocked_waitq); 285 wake_up_all(&fc->blocked_waitq);
286 } 286 }
287 if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 287 if (fc->num_background == fc->congestion_threshold &&
288 fc->connected && fc->bdi_initialized) { 288 fc->connected && fc->bdi_initialized) {
289 clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); 289 clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
290 clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); 290 clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
410{ 410{
411 req->background = 1; 411 req->background = 1;
412 fc->num_background++; 412 fc->num_background++;
413 if (fc->num_background == FUSE_MAX_BACKGROUND) 413 if (fc->num_background == fc->max_background)
414 fc->blocked = 1; 414 fc->blocked = 1;
415 if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 415 if (fc->num_background == fc->congestion_threshold &&
416 fc->bdi_initialized) { 416 fc->bdi_initialized) {
417 set_bdi_congested(&fc->bdi, BLK_RW_SYNC); 417 set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
418 set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); 418 set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..fc9c79feb5f7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
25/** Max number of pages that can be used in a single read request */ 25/** Max number of pages that can be used in a single read request */
26#define FUSE_MAX_PAGES_PER_REQ 32 26#define FUSE_MAX_PAGES_PER_REQ 32
27 27
28/** Maximum number of outstanding background requests */
29#define FUSE_MAX_BACKGROUND 12
30
31/** Congestion starts at 75% of maximum */
32#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
33
34/** Bias for fi->writectr, meaning new writepages must not be sent */ 28/** Bias for fi->writectr, meaning new writepages must not be sent */
35#define FUSE_NOWRITE INT_MIN 29#define FUSE_NOWRITE INT_MIN
36 30
@@ -38,7 +32,7 @@
38#define FUSE_NAME_MAX 1024 32#define FUSE_NAME_MAX 1024
39 33
40/** Number of dentries for each connection in the control filesystem */ 34/** Number of dentries for each connection in the control filesystem */
41#define FUSE_CTL_NUM_DENTRIES 3 35#define FUSE_CTL_NUM_DENTRIES 5
42 36
43/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem 37/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
44 module will check permissions based on the file mode. Otherwise no 38 module will check permissions based on the file mode. Otherwise no
@@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list;
55/** Global mutex protecting fuse_conn_list and the control filesystem */ 49/** Global mutex protecting fuse_conn_list and the control filesystem */
56extern struct mutex fuse_mutex; 50extern struct mutex fuse_mutex;
57 51
52/** Module parameters */
53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh;
55
58/** FUSE inode */ 56/** FUSE inode */
59struct fuse_inode { 57struct fuse_inode {
60 /** Inode data */ 58 /** Inode data */
@@ -349,6 +347,12 @@ struct fuse_conn {
349 /** rbtree of fuse_files waiting for poll events indexed by ph */ 347 /** rbtree of fuse_files waiting for poll events indexed by ph */
350 struct rb_root polled_files; 348 struct rb_root polled_files;
351 349
350 /** Maximum number of outstanding background requests */
351 unsigned max_background;
352
353 /** Number of background requests at which congestion starts */
354 unsigned congestion_threshold;
355
352 /** Number of requests currently in the background */ 356 /** Number of requests currently in the background */
353 unsigned num_background; 357 unsigned num_background;
354 358
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..6da947daabda 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/moduleparam.h>
17#include <linux/parser.h> 18#include <linux/parser.h>
18#include <linux/statfs.h> 19#include <linux/statfs.h>
19#include <linux/random.h> 20#include <linux/random.h>
@@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep;
28struct list_head fuse_conn_list; 29struct list_head fuse_conn_list;
29DEFINE_MUTEX(fuse_mutex); 30DEFINE_MUTEX(fuse_mutex);
30 31
32static int set_global_limit(const char *val, struct kernel_param *kp);
33
34unsigned max_user_bgreq;
35module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
36 &max_user_bgreq, 0644);
37__MODULE_PARM_TYPE(max_user_bgreq, "uint");
38MODULE_PARM_DESC(max_user_bgreq,
39 "Global limit for the maximum number of backgrounded requests an "
40 "unprivileged user can set");
41
42unsigned max_user_congthresh;
43module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
44 &max_user_congthresh, 0644);
45__MODULE_PARM_TYPE(max_user_congthresh, "uint");
46MODULE_PARM_DESC(max_user_congthresh,
47 "Global limit for the maximum congestion threshold an "
48 "unprivileged user can set");
49
31#define FUSE_SUPER_MAGIC 0x65735546 50#define FUSE_SUPER_MAGIC 0x65735546
32 51
33#define FUSE_DEFAULT_BLKSIZE 512 52#define FUSE_DEFAULT_BLKSIZE 512
34 53
54/** Maximum number of outstanding background requests */
55#define FUSE_DEFAULT_MAX_BACKGROUND 12
56
57/** Congestion starts at 75% of maximum */
58#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
59
35struct fuse_mount_data { 60struct fuse_mount_data {
36 int fd; 61 int fd;
37 unsigned rootmode; 62 unsigned rootmode;
@@ -517,6 +542,8 @@ void fuse_conn_init(struct fuse_conn *fc)
517 INIT_LIST_HEAD(&fc->bg_queue); 542 INIT_LIST_HEAD(&fc->bg_queue);
518 INIT_LIST_HEAD(&fc->entry); 543 INIT_LIST_HEAD(&fc->entry);
519 atomic_set(&fc->num_waiting, 0); 544 atomic_set(&fc->num_waiting, 0);
545 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
546 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
520 fc->khctr = 0; 547 fc->khctr = 0;
521 fc->polled_files = RB_ROOT; 548 fc->polled_files = RB_ROOT;
522 fc->reqctr = 0; 549 fc->reqctr = 0;
@@ -727,6 +754,54 @@ static const struct super_operations fuse_super_operations = {
727 .show_options = fuse_show_options, 754 .show_options = fuse_show_options,
728}; 755};
729 756
757static void sanitize_global_limit(unsigned *limit)
758{
759 if (*limit == 0)
760 *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
761 sizeof(struct fuse_req);
762
763 if (*limit >= 1 << 16)
764 *limit = (1 << 16) - 1;
765}
766
767static int set_global_limit(const char *val, struct kernel_param *kp)
768{
769 int rv;
770
771 rv = param_set_uint(val, kp);
772 if (rv)
773 return rv;
774
775 sanitize_global_limit((unsigned *)kp->arg);
776
777 return 0;
778}
779
780static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
781{
782 int cap_sys_admin = capable(CAP_SYS_ADMIN);
783
784 if (arg->minor < 13)
785 return;
786
787 sanitize_global_limit(&max_user_bgreq);
788 sanitize_global_limit(&max_user_congthresh);
789
790 if (arg->max_background) {
791 fc->max_background = arg->max_background;
792
793 if (!cap_sys_admin && fc->max_background > max_user_bgreq)
794 fc->max_background = max_user_bgreq;
795 }
796 if (arg->congestion_threshold) {
797 fc->congestion_threshold = arg->congestion_threshold;
798
799 if (!cap_sys_admin &&
800 fc->congestion_threshold > max_user_congthresh)
801 fc->congestion_threshold = max_user_congthresh;
802 }
803}
804
730static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 805static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
731{ 806{
732 struct fuse_init_out *arg = &req->misc.init_out; 807 struct fuse_init_out *arg = &req->misc.init_out;
@@ -736,6 +811,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
736 else { 811 else {
737 unsigned long ra_pages; 812 unsigned long ra_pages;
738 813
814 process_init_limits(fc, arg);
815
739 if (arg->minor >= 6) { 816 if (arg->minor >= 6) {
740 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; 817 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
741 if (arg->flags & FUSE_ASYNC_READ) 818 if (arg->flags & FUSE_ASYNC_READ)
@@ -801,6 +878,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
801{ 878{
802 int err; 879 int err;
803 880
881 fc->bdi.name = "fuse";
804 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 882 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
805 fc->bdi.unplug_io_fn = default_unplug_io_fn; 883 fc->bdi.unplug_io_fn = default_unplug_io_fn;
806 /* fuse does its own writeback accounting */ 884 /* fuse does its own writeback accounting */
@@ -893,6 +971,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
893 if (err) 971 if (err)
894 goto err_put_conn; 972 goto err_put_conn;
895 973
974 sb->s_bdi = &fc->bdi;
975
896 /* Handle umasking inside the fuse code */ 976 /* Handle umasking inside the fuse code */
897 if (sb->s_flags & MS_POSIXACL) 977 if (sb->s_flags & MS_POSIXACL)
898 fc->dont_mask = 1; 978 fc->dont_mask = 1;
@@ -1147,6 +1227,9 @@ static int __init fuse_init(void)
1147 if (res) 1227 if (res)
1148 goto err_sysfs_cleanup; 1228 goto err_sysfs_cleanup;
1149 1229
1230 sanitize_global_limit(&max_user_bgreq);
1231 sanitize_global_limit(&max_user_congthresh);
1232
1150 return 0; 1233 return 0;
1151 1234
1152 err_sysfs_cleanup: 1235 err_sysfs_cleanup:
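
fs/fuse/inode.c now exposes max_user_bgreq and max_user_congthresh as writable module parameters and derives their defaults from available memory in sanitize_global_limit(). A userspace sketch of that sizing rule follows; the 392-byte request size is an assumed stand-in for sizeof(struct fuse_req), which depends on kernel configuration.

#include <stdio.h>

#define ASSUMED_FUSE_REQ_SIZE 392UL  /* stand-in for sizeof(struct fuse_req) */

/* An unset limit defaults to (RAM >> 13) / request size, i.e. request
 * bookkeeping may consume at most about 1/8192nd of RAM; the result is
 * capped below 2^16 so it fits the protocol's 16-bit fields. */
static unsigned long sanitize_global_limit(unsigned long limit,
					   unsigned long ram_bytes)
{
	if (limit == 0)
		limit = (ram_bytes >> 13) / ASSUMED_FUSE_REQ_SIZE;
	if (limit >= 1UL << 16)
		limit = (1UL << 16) - 1;
	return limit;
}

int main(void)
{
	unsigned long ram = 2UL << 30;  /* 2 GiB */

	printf("default limit for 2 GiB: %lu\n",
	       sanitize_global_limit(0, ram));
	return 0;
}

The same helper runs at module init, whenever the parameter is written, and again in process_init_limits() before the userspace-supplied values from FUSE_INIT (protocol minor 13 and later) are accepted.
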
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 3da2f1f4f738..21f7e46da4c0 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS := -I$(src) 1EXTRA_CFLAGS := -I$(src)
2obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
3gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
4 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
5 aops.o dentry.o export.o file.o \ 5 aops.o dentry.o export.o file.o \
6 ops_fstype.o ops_inode.o quota.o \ 6 ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index fa881bdc3d85..3fc4e3ac7d84 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,8 +19,7 @@
19#include "gfs2.h" 19#include "gfs2.h"
20#include "incore.h" 20#include "incore.h"
21#include "acl.h" 21#include "acl.h"
22#include "eaops.h" 22#include "xattr.h"
23#include "eattr.h"
24#include "glock.h" 23#include "glock.h"
25#include "inode.h" 24#include "inode.h"
26#include "meta_io.h" 25#include "meta_io.h"
@@ -31,8 +30,7 @@
31#define ACL_DEFAULT 0 30#define ACL_DEFAULT 0
32 31
33int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, 32int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
34 struct gfs2_ea_request *er, 33 struct gfs2_ea_request *er, int *remove, mode_t *mode)
35 int *remove, mode_t *mode)
36{ 34{
37 struct posix_acl *acl; 35 struct posix_acl *acl;
38 int error; 36 int error;
@@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
83 return 0; 81 return 0;
84} 82}
85 83
86static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, 84static int acl_get(struct gfs2_inode *ip, const char *name,
87 struct gfs2_ea_location *el, char **data, unsigned int *len) 85 struct posix_acl **acl, struct gfs2_ea_location *el,
86 char **datap, unsigned int *lenp)
88{ 87{
89 struct gfs2_ea_request er; 88 char *data;
90 struct gfs2_ea_location el_this; 89 unsigned int len;
91 int error; 90 int error;
92 91
92 el->el_bh = NULL;
93
93 if (!ip->i_eattr) 94 if (!ip->i_eattr)
94 return 0; 95 return 0;
95 96
96 memset(&er, 0, sizeof(struct gfs2_ea_request)); 97 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
97 if (access) {
98 er.er_name = GFS2_POSIX_ACL_ACCESS;
99 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
100 } else {
101 er.er_name = GFS2_POSIX_ACL_DEFAULT;
102 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
103 }
104 er.er_type = GFS2_EATYPE_SYS;
105
106 if (!el)
107 el = &el_this;
108
109 error = gfs2_ea_find(ip, &er, el);
110 if (error) 98 if (error)
111 return error; 99 return error;
112 if (!el->el_ea) 100 if (!el->el_ea)
@@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
114 if (!GFS2_EA_DATA_LEN(el->el_ea)) 102 if (!GFS2_EA_DATA_LEN(el->el_ea))
115 goto out; 103 goto out;
116 104
117 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); 105 len = GFS2_EA_DATA_LEN(el->el_ea);
118 er.er_data = kmalloc(er.er_data_len, GFP_NOFS); 106 data = kmalloc(len, GFP_NOFS);
119 error = -ENOMEM; 107 error = -ENOMEM;
120 if (!er.er_data) 108 if (!data)
121 goto out; 109 goto out;
122 110
123 error = gfs2_ea_get_copy(ip, el, er.er_data); 111 error = gfs2_ea_get_copy(ip, el, data, len);
124 if (error) 112 if (error < 0)
125 goto out_kfree; 113 goto out_kfree;
114 error = 0;
126 115
127 if (acl) { 116 if (acl) {
128 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); 117 *acl = posix_acl_from_xattr(data, len);
129 if (IS_ERR(*acl)) 118 if (IS_ERR(*acl))
130 error = PTR_ERR(*acl); 119 error = PTR_ERR(*acl);
131 } 120 }
132 121
133out_kfree: 122out_kfree:
134 if (error || !data) 123 if (error || !datap) {
135 kfree(er.er_data); 124 kfree(data);
136 else { 125 } else {
137 *data = er.er_data; 126 *datap = data;
138 *len = er.er_data_len; 127 *lenp = len;
139 } 128 }
140out: 129out:
141 if (error || el == &el_this)
142 brelse(el->el_bh);
143 return error; 130 return error;
144} 131}
145 132
@@ -153,10 +140,12 @@ out:
153 140
154int gfs2_check_acl(struct inode *inode, int mask) 141int gfs2_check_acl(struct inode *inode, int mask)
155{ 142{
143 struct gfs2_ea_location el;
156 struct posix_acl *acl = NULL; 144 struct posix_acl *acl = NULL;
157 int error; 145 int error;
158 146
159 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); 147 error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
148 brelse(el.el_bh);
160 if (error) 149 if (error)
161 return error; 150 return error;
162 151
@@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
196 185
197int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) 186int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
198{ 187{
188 struct gfs2_ea_location el;
199 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 189 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
200 struct posix_acl *acl = NULL, *clone; 190 struct posix_acl *acl = NULL, *clone;
201 struct gfs2_ea_request er;
202 mode_t mode = ip->i_inode.i_mode; 191 mode_t mode = ip->i_inode.i_mode;
192 char *data = NULL;
193 unsigned int len;
203 int error; 194 int error;
204 195
205 if (!sdp->sd_args.ar_posix_acl) 196 if (!sdp->sd_args.ar_posix_acl)
@@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
207 if (S_ISLNK(ip->i_inode.i_mode)) 198 if (S_ISLNK(ip->i_inode.i_mode))
208 return 0; 199 return 0;
209 200
210 memset(&er, 0, sizeof(struct gfs2_ea_request)); 201 error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
211 er.er_type = GFS2_EATYPE_SYS; 202 brelse(el.el_bh);
212
213 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
214 &er.er_data, &er.er_data_len);
215 if (error) 203 if (error)
216 return error; 204 return error;
217 if (!acl) { 205 if (!acl) {
@@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
229 acl = clone; 217 acl = clone;
230 218
231 if (S_ISDIR(ip->i_inode.i_mode)) { 219 if (S_ISDIR(ip->i_inode.i_mode)) {
232 er.er_name = GFS2_POSIX_ACL_DEFAULT; 220 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
233 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; 221 GFS2_POSIX_ACL_DEFAULT, data, len, 0);
234 error = gfs2_system_eaops.eo_set(ip, &er);
235 if (error) 222 if (error)
236 goto out; 223 goto out;
237 } 224 }
@@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
239 error = posix_acl_create_masq(acl, &mode); 226 error = posix_acl_create_masq(acl, &mode);
240 if (error < 0) 227 if (error < 0)
241 goto out; 228 goto out;
242 if (error > 0) { 229 if (error == 0)
243 er.er_name = GFS2_POSIX_ACL_ACCESS; 230 goto munge;
244 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
245 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
246 er.er_mode = mode;
247 er.er_flags = GFS2_ERF_MODE;
248 error = gfs2_system_eaops.eo_set(ip, &er);
249 if (error)
250 goto out;
251 } else
252 munge_mode(ip, mode);
253 231
232 posix_acl_to_xattr(acl, data, len);
233 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
234 GFS2_POSIX_ACL_ACCESS, data, len, 0);
235 if (error)
236 goto out;
237munge:
238 error = munge_mode(ip, mode);
254out: 239out:
255 posix_acl_release(acl); 240 posix_acl_release(acl);
256 kfree(er.er_data); 241 kfree(data);
257 return error; 242 return error;
258} 243}
259 244
@@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
265 unsigned int len; 250 unsigned int len;
266 int error; 251 int error;
267 252
268 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); 253 error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
269 if (error) 254 if (error)
270 return error; 255 goto out_brelse;
271 if (!acl) 256 if (!acl)
272 return gfs2_setattr_simple(ip, attr); 257 return gfs2_setattr_simple(ip, attr);
273 258
@@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
286 271
287out: 272out:
288 posix_acl_release(acl); 273 posix_acl_release(acl);
289 brelse(el.el_bh);
290 kfree(data); 274 kfree(data);
275out_brelse:
276 brelse(el.el_bh);
291 return error; 277 return error;
292} 278}
293 279
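
With the eaops layer gone, acl_get() takes the xattr name directly and always initializes the caller-supplied gfs2_ea_location, so every caller releases el.el_bh itself; the gfs2_acl_chmod() hunk above reorders its exit path so the buffer is also released on error. A small userspace mock of that ownership rule follows; buffer_head, brelse() and acl_get() are stubbed with malloc/free for illustration.

#include <stdlib.h>

struct buffer_head { int data; };
struct ea_location { struct buffer_head *el_bh; };

/* Like brelse(), free(NULL) is a harmless no-op. */
static void brelse_mock(struct buffer_head *bh)
{
	free(bh);
}

/* Mirrors the new acl_get(): el->el_bh is set to NULL before anything
 * can fail, so the caller may release it on every exit path. */
static int acl_get_mock(struct ea_location *el, int found)
{
	el->el_bh = NULL;
	if (!found)
		return 0;
	el->el_bh = malloc(sizeof(*el->el_bh));
	return el->el_bh ? 0 : -12;  /* -ENOMEM */
}

int main(void)
{
	struct ea_location el;

	acl_get_mock(&el, 0);
	brelse_mock(el.el_bh);  /* safe on the not-found path */

	acl_get_mock(&el, 1);
	brelse_mock(el.el_bh);
	return 0;
}
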
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 03ebb439ace0..7ebae9a4ecc0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -624,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
624{ 624{
625 struct gfs2_inode *ip = GFS2_I(mapping->host); 625 struct gfs2_inode *ip = GFS2_I(mapping->host);
626 struct gfs2_sbd *sdp = GFS2_SB(mapping->host); 626 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
627 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
627 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 628 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
628 int alloc_required; 629 int alloc_required;
629 int error = 0; 630 int error = 0;
@@ -637,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
637 error = gfs2_glock_nq(&ip->i_gh); 638 error = gfs2_glock_nq(&ip->i_gh);
638 if (unlikely(error)) 639 if (unlikely(error))
639 goto out_uninit; 640 goto out_uninit;
641 if (&ip->i_inode == sdp->sd_rindex) {
642 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
643 GL_NOCACHE, &m_ip->i_gh);
644 if (unlikely(error)) {
645 gfs2_glock_dq(&ip->i_gh);
646 goto out_uninit;
647 }
648 }
640 649
641 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 650 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
642 if (error) 651 if (error)
@@ -667,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
667 rblocks += data_blocks ? data_blocks : 1; 676 rblocks += data_blocks ? data_blocks : 1;
668 if (ind_blocks || data_blocks) 677 if (ind_blocks || data_blocks)
669 rblocks += RES_STATFS + RES_QUOTA; 678 rblocks += RES_STATFS + RES_QUOTA;
679 if (&ip->i_inode == sdp->sd_rindex)
680 rblocks += 2 * RES_STATFS;
670 681
671 error = gfs2_trans_begin(sdp, rblocks, 682 error = gfs2_trans_begin(sdp, rblocks,
672 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 683 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -712,6 +723,10 @@ out_alloc_put:
712 gfs2_alloc_put(ip); 723 gfs2_alloc_put(ip);
713 } 724 }
714out_unlock: 725out_unlock:
726 if (&ip->i_inode == sdp->sd_rindex) {
727 gfs2_glock_dq(&m_ip->i_gh);
728 gfs2_holder_uninit(&m_ip->i_gh);
729 }
715 gfs2_glock_dq(&ip->i_gh); 730 gfs2_glock_dq(&ip->i_gh);
716out_uninit: 731out_uninit:
717 gfs2_holder_uninit(&ip->i_gh); 732 gfs2_holder_uninit(&ip->i_gh);
@@ -725,14 +740,21 @@ out_uninit:
725static void adjust_fs_space(struct inode *inode) 740static void adjust_fs_space(struct inode *inode)
726{ 741{
727 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; 742 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
743 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
744 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
728 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; 745 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
729 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; 746 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
747 struct buffer_head *m_bh, *l_bh;
730 u64 fs_total, new_free; 748 u64 fs_total, new_free;
731 749
732 /* Total up the file system space, according to the latest rindex. */ 750 /* Total up the file system space, according to the latest rindex. */
733 fs_total = gfs2_ri_total(sdp); 751 fs_total = gfs2_ri_total(sdp);
752 if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
753 return;
734 754
735 spin_lock(&sdp->sd_statfs_spin); 755 spin_lock(&sdp->sd_statfs_spin);
756 gfs2_statfs_change_in(m_sc, m_bh->b_data +
757 sizeof(struct gfs2_dinode));
736 if (fs_total > (m_sc->sc_total + l_sc->sc_total)) 758 if (fs_total > (m_sc->sc_total + l_sc->sc_total))
737 new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); 759 new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
738 else 760 else
@@ -741,6 +763,13 @@ static void adjust_fs_space(struct inode *inode)
741 fs_warn(sdp, "File system extended by %llu blocks.\n", 763 fs_warn(sdp, "File system extended by %llu blocks.\n",
742 (unsigned long long)new_free); 764 (unsigned long long)new_free);
743 gfs2_statfs_change(sdp, new_free, new_free, 0); 765 gfs2_statfs_change(sdp, new_free, new_free, 0);
766
767 if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
768 goto out;
769 update_statfs(sdp, m_bh, l_bh);
770 brelse(l_bh);
771out:
772 brelse(m_bh);
744} 773}
745 774
746/** 775/**
@@ -763,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
763{ 792{
764 struct gfs2_inode *ip = GFS2_I(inode); 793 struct gfs2_inode *ip = GFS2_I(inode);
765 struct gfs2_sbd *sdp = GFS2_SB(inode); 794 struct gfs2_sbd *sdp = GFS2_SB(inode);
795 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
766 u64 to = pos + copied; 796 u64 to = pos + copied;
767 void *kaddr; 797 void *kaddr;
768 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); 798 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
@@ -794,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
794 824
795 brelse(dibh); 825 brelse(dibh);
796 gfs2_trans_end(sdp); 826 gfs2_trans_end(sdp);
827 if (inode == sdp->sd_rindex) {
828 gfs2_glock_dq(&m_ip->i_gh);
829 gfs2_holder_uninit(&m_ip->i_gh);
830 }
797 gfs2_glock_dq(&ip->i_gh); 831 gfs2_glock_dq(&ip->i_gh);
798 gfs2_holder_uninit(&ip->i_gh); 832 gfs2_holder_uninit(&ip->i_gh);
799 return copied; 833 return copied;
@@ -823,6 +857,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
823 struct inode *inode = page->mapping->host; 857 struct inode *inode = page->mapping->host;
824 struct gfs2_inode *ip = GFS2_I(inode); 858 struct gfs2_inode *ip = GFS2_I(inode);
825 struct gfs2_sbd *sdp = GFS2_SB(inode); 859 struct gfs2_sbd *sdp = GFS2_SB(inode);
860 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
826 struct buffer_head *dibh; 861 struct buffer_head *dibh;
827 struct gfs2_alloc *al = ip->i_alloc; 862 struct gfs2_alloc *al = ip->i_alloc;
828 unsigned int from = pos & (PAGE_CACHE_SIZE - 1); 863 unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
@@ -865,6 +900,10 @@ failed:
865 gfs2_quota_unlock(ip); 900 gfs2_quota_unlock(ip);
866 gfs2_alloc_put(ip); 901 gfs2_alloc_put(ip);
867 } 902 }
903 if (inode == sdp->sd_rindex) {
904 gfs2_glock_dq(&m_ip->i_gh);
905 gfs2_holder_uninit(&m_ip->i_gh);
906 }
868 gfs2_glock_dq(&ip->i_gh); 907 gfs2_glock_dq(&ip->i_gh);
869 gfs2_holder_uninit(&ip->i_gh); 908 gfs2_holder_uninit(&ip->i_gh);
870 return ret; 909 return ret;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..91beddadd388 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
107 return 0; 107 return 0;
108} 108}
109 109
110static int gfs2_dentry_delete(struct dentry *dentry)
111{
112 struct gfs2_inode *ginode;
113
114 if (!dentry->d_inode)
115 return 0;
116
117 ginode = GFS2_I(dentry->d_inode);
118 if (!ginode->i_iopen_gh.gh_gl)
119 return 0;
120
121 if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
122 return 1;
123
124 return 0;
125}
126
110const struct dentry_operations gfs2_dops = { 127const struct dentry_operations gfs2_dops = {
111 .d_revalidate = gfs2_drevalidate, 128 .d_revalidate = gfs2_drevalidate,
112 .d_hash = gfs2_dhash, 129 .d_hash = gfs2_dhash,
130 .d_delete = gfs2_dentry_delete,
113}; 131};
114 132
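
gfs2_dentry_delete() hooks the dentry_operations .d_delete method: returning 1 tells the VFS to drop the dentry at dput() time instead of caching it. The trigger here is a pending demote (GLF_DEMOTE) on the inode's iopen glock, so cached dentries do not pin an inode that another node is trying to delete. A userspace model of the decision follows; the glock and inode structures are mocked, illustrative only.

#include <stdio.h>

#define GLF_DEMOTE 0x1UL  /* mock of the glock demote-pending flag */

struct glock  { unsigned long gl_flags; };
struct ginode { struct glock *iopen_gl; };

/* Mirrors gfs2_dentry_delete(): 1 means "drop this dentry at dput()
 * time"; anything without an inode or an iopen glock is kept. */
static int dentry_should_be_deleted(const struct ginode *gi)
{
	if (!gi || !gi->iopen_gl)
		return 0;
	return (gi->iopen_gl->gl_flags & GLF_DEMOTE) ? 1 : 0;
}

int main(void)
{
	struct glock gl = { .gl_flags = GLF_DEMOTE };
	struct ginode gi = { .iopen_gl = &gl };

	printf("delete=%d\n", dentry_should_be_deleted(&gi));  /* 1 */
	gl.gl_flags = 0;
	printf("delete=%d\n", dentry_should_be_deleted(&gi));  /* 0 */
	return 0;
}
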
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
deleted file mode 100644
index dee9b03e5b37..000000000000
--- a/fs/gfs2/eaops.c
+++ /dev/null
@@ -1,157 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/capability.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "acl.h"
22#include "eaops.h"
23#include "eattr.h"
24#include "util.h"
25
26/**
27 * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
28 * @namep: ea name, possibly with type appended
29 *
30 * Returns: GFS2_EATYPE_XXX
31 */
32
33unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
34{
35 unsigned int type;
36
37 if (strncmp(name, "system.", 7) == 0) {
38 type = GFS2_EATYPE_SYS;
39 if (truncated_name)
40 *truncated_name = name + sizeof("system.") - 1;
41 } else if (strncmp(name, "user.", 5) == 0) {
42 type = GFS2_EATYPE_USR;
43 if (truncated_name)
44 *truncated_name = name + sizeof("user.") - 1;
45 } else if (strncmp(name, "security.", 9) == 0) {
46 type = GFS2_EATYPE_SECURITY;
47 if (truncated_name)
48 *truncated_name = name + sizeof("security.") - 1;
49 } else {
50 type = GFS2_EATYPE_UNUSED;
51 if (truncated_name)
52 *truncated_name = NULL;
53 }
54
55 return type;
56}
57
58static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
59{
60 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
61 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
62 !capable(CAP_SYS_ADMIN))
63 return -EPERM;
64
65 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
66 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
67 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
68 return -EOPNOTSUPP;
69
70 return gfs2_ea_get_i(ip, er);
71}
72
73static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
74{
75 int remove = 0;
76 int error;
77
78 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
79 if (!(er->er_flags & GFS2_ERF_MODE)) {
80 er->er_mode = ip->i_inode.i_mode;
81 er->er_flags |= GFS2_ERF_MODE;
82 }
83 error = gfs2_acl_validate_set(ip, 1, er,
84 &remove, &er->er_mode);
85 if (error)
86 return error;
87 error = gfs2_ea_set_i(ip, er);
88 if (error)
89 return error;
90 if (remove)
91 gfs2_ea_remove_i(ip, er);
92 return 0;
93
94 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
95 error = gfs2_acl_validate_set(ip, 0, er,
96 &remove, NULL);
97 if (error)
98 return error;
99 if (!remove)
100 error = gfs2_ea_set_i(ip, er);
101 else {
102 error = gfs2_ea_remove_i(ip, er);
103 if (error == -ENODATA)
104 error = 0;
105 }
106 return error;
107 }
108
109 return -EPERM;
110}
111
112static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
113{
114 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
115 int error = gfs2_acl_validate_remove(ip, 1);
116 if (error)
117 return error;
118
119 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
120 int error = gfs2_acl_validate_remove(ip, 0);
121 if (error)
122 return error;
123
124 } else
125 return -EPERM;
126
127 return gfs2_ea_remove_i(ip, er);
128}
129
130static const struct gfs2_eattr_operations gfs2_user_eaops = {
131 .eo_get = gfs2_ea_get_i,
132 .eo_set = gfs2_ea_set_i,
133 .eo_remove = gfs2_ea_remove_i,
134 .eo_name = "user",
135};
136
137const struct gfs2_eattr_operations gfs2_system_eaops = {
138 .eo_get = system_eo_get,
139 .eo_set = system_eo_set,
140 .eo_remove = system_eo_remove,
141 .eo_name = "system",
142};
143
144static const struct gfs2_eattr_operations gfs2_security_eaops = {
145 .eo_get = gfs2_ea_get_i,
146 .eo_set = gfs2_ea_set_i,
147 .eo_remove = gfs2_ea_remove_i,
148 .eo_name = "security",
149};
150
151const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
152 NULL,
153 &gfs2_user_eaops,
154 &gfs2_system_eaops,
155 &gfs2_security_eaops,
156};
157
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
deleted file mode 100644
index da2f7fbbb40d..000000000000
--- a/fs/gfs2/eaops.h
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14struct gfs2_inode;
15
16struct gfs2_eattr_operations {
17 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
20 char *eo_name;
21};
22
23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
24
25extern const struct gfs2_eattr_operations gfs2_system_eaops;
26
27extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
28
29#endif /* __EAOPS_DOT_H__ */
30
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9200ef221716..d15876e9aa26 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
143} 143}
144 144
145static struct dentry *gfs2_get_dentry(struct super_block *sb, 145static struct dentry *gfs2_get_dentry(struct super_block *sb,
146 struct gfs2_inum_host *inum) 146 struct gfs2_inum_host *inum)
147{ 147{
148 struct gfs2_sbd *sdp = sb->s_fs_info; 148 struct gfs2_sbd *sdp = sb->s_fs_info;
149 struct gfs2_holder i_gh, ri_gh, rgd_gh; 149 struct gfs2_holder i_gh;
150 struct gfs2_rgrpd *rgd;
151 struct inode *inode; 150 struct inode *inode;
152 struct dentry *dentry; 151 struct dentry *dentry;
153 int error; 152 int error;
154 153
155 /* System files? */
156
157 inode = gfs2_ilookup(sb, inum->no_addr); 154 inode = gfs2_ilookup(sb, inum->no_addr);
158 if (inode) { 155 if (inode) {
159 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { 156 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
168 if (error) 165 if (error)
169 return ERR_PTR(error); 166 return ERR_PTR(error);
170 167
171 error = gfs2_rindex_hold(sdp, &ri_gh); 168 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
172 if (error) 169 if (error)
173 goto fail; 170 goto fail;
174 171
175 error = -EINVAL; 172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
176 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
177 if (!rgd)
178 goto fail_rindex;
179
180 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
181 if (error)
182 goto fail_rindex;
183
184 error = -ESTALE;
185 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
186 goto fail_rgd;
187
188 gfs2_glock_dq_uninit(&rgd_gh);
189 gfs2_glock_dq_uninit(&ri_gh);
190
191 inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
192 inum->no_addr,
193 0, 0);
194 if (IS_ERR(inode)) { 173 if (IS_ERR(inode)) {
195 error = PTR_ERR(inode); 174 error = PTR_ERR(inode);
196 goto fail; 175 goto fail;
@@ -224,13 +203,6 @@ out_inode:
224 if (!IS_ERR(dentry)) 203 if (!IS_ERR(dentry))
225 dentry->d_op = &gfs2_dops; 204 dentry->d_op = &gfs2_dops;
226 return dentry; 205 return dentry;
227
228fail_rgd:
229 gfs2_glock_dq_uninit(&rgd_gh);
230
231fail_rindex:
232 gfs2_glock_dq_uninit(&ri_gh);
233
234fail: 206fail:
235 gfs2_glock_dq_uninit(&i_gh); 207 gfs2_glock_dq_uninit(&i_gh);
236 return ERR_PTR(error); 208 return ERR_PTR(error);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73318a3ce6f1..166f38fbd246 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -38,7 +38,6 @@
38#include "rgrp.h" 38#include "rgrp.h"
39#include "trans.h" 39#include "trans.h"
40#include "util.h" 40#include "util.h"
41#include "eaops.h"
42 41
43/** 42/**
44 * gfs2_llseek - seek to a location in a file 43 * gfs2_llseek - seek to a location in a file
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 297421c0427a..8b674b1f3a55 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
63static DECLARE_RWSEM(gfs2_umount_flush_sem); 63static DECLARE_RWSEM(gfs2_umount_flush_sem);
64static struct dentry *gfs2_root; 64static struct dentry *gfs2_root;
65static struct workqueue_struct *glock_workqueue; 65static struct workqueue_struct *glock_workqueue;
66struct workqueue_struct *gfs2_delete_workqueue;
66static LIST_HEAD(lru_list); 67static LIST_HEAD(lru_list);
67static atomic_t lru_count = ATOMIC_INIT(0); 68static atomic_t lru_count = ATOMIC_INIT(0);
68static DEFINE_SPINLOCK(lru_lock); 69static DEFINE_SPINLOCK(lru_lock);
@@ -167,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl)
167 * 168 *
168 */ 169 */
169 170
170static void gfs2_glock_hold(struct gfs2_glock *gl) 171void gfs2_glock_hold(struct gfs2_glock *gl)
171{ 172{
172 GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); 173 GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
173 atomic_inc(&gl->gl_ref); 174 atomic_inc(&gl->gl_ref);
174} 175}
175 176
176/** 177/**
178 * demote_ok - Check to see if it's ok to unlock a glock
179 * @gl: the glock
180 *
181 * Returns: 1 if it's ok
182 */
183
184static int demote_ok(const struct gfs2_glock *gl)
185{
186 const struct gfs2_glock_operations *glops = gl->gl_ops;
187
188 if (gl->gl_state == LM_ST_UNLOCKED)
189 return 0;
190 if (!list_empty(&gl->gl_holders))
191 return 0;
192 if (glops->go_demote_ok)
193 return glops->go_demote_ok(gl);
194 return 1;
195}
196
197/**
177 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list 198 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
178 * @gl: the glock 199 * @gl: the glock
179 * 200 *
@@ -181,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
181 202
182static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) 203static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
183{ 204{
205 int may_reclaim;
206 may_reclaim = (demote_ok(gl) &&
207 (atomic_read(&gl->gl_ref) == 1 ||
208 (gl->gl_name.ln_type == LM_TYPE_INODE &&
209 atomic_read(&gl->gl_ref) <= 2)));
184 spin_lock(&lru_lock); 210 spin_lock(&lru_lock);
185 if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { 211 if (list_empty(&gl->gl_lru) && may_reclaim) {
186 list_add_tail(&gl->gl_lru, &lru_list); 212 list_add_tail(&gl->gl_lru, &lru_list);
187 atomic_inc(&lru_count); 213 atomic_inc(&lru_count);
188 } 214 }
@@ -190,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
190} 216}
191 217
192/** 218/**
219 * gfs2_glock_put_nolock() - Decrement reference count on glock
220 * @gl: The glock to put
221 *
222 * This function should only be used if the caller has its own reference
223 * to the glock, in addition to the one it is dropping.
224 */
225
226void gfs2_glock_put_nolock(struct gfs2_glock *gl)
227{
228 if (atomic_dec_and_test(&gl->gl_ref))
229 GLOCK_BUG_ON(gl, 1);
230 gfs2_glock_schedule_for_reclaim(gl);
231}
232
233/**
193 * gfs2_glock_put() - Decrement reference count on glock 234 * gfs2_glock_put() - Decrement reference count on glock
194 * @gl: The glock to put 235 * @gl: The glock to put
195 * 236 *
@@ -214,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl)
214 rv = 1; 255 rv = 1;
215 goto out; 256 goto out;
216 } 257 }
217 /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ 258 spin_lock(&gl->gl_spin);
218 if (atomic_read(&gl->gl_ref) == 2) 259 gfs2_glock_schedule_for_reclaim(gl);
219 gfs2_glock_schedule_for_reclaim(gl); 260 spin_unlock(&gl->gl_spin);
220 write_unlock(gl_lock_addr(gl->gl_hash)); 261 write_unlock(gl_lock_addr(gl->gl_hash));
221out: 262out:
222 return rv; 263 return rv;
@@ -398,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
398 if (held2) 439 if (held2)
399 gfs2_glock_hold(gl); 440 gfs2_glock_hold(gl);
400 else 441 else
401 gfs2_glock_put(gl); 442 gfs2_glock_put_nolock(gl);
402 } 443 }
403 444
404 gl->gl_state = new_state; 445 gl->gl_state = new_state;
@@ -633,12 +674,35 @@ out:
633out_sched: 674out_sched:
634 gfs2_glock_hold(gl); 675 gfs2_glock_hold(gl);
635 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 676 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
636 gfs2_glock_put(gl); 677 gfs2_glock_put_nolock(gl);
637out_unlock: 678out_unlock:
638 clear_bit(GLF_LOCK, &gl->gl_flags); 679 clear_bit(GLF_LOCK, &gl->gl_flags);
639 goto out; 680 goto out;
640} 681}
641 682
683static void delete_work_func(struct work_struct *work)
684{
685 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
686 struct gfs2_sbd *sdp = gl->gl_sbd;
687 struct gfs2_inode *ip = NULL;
688 struct inode *inode;
689 u64 no_addr = 0;
690
691 spin_lock(&gl->gl_spin);
692 ip = (struct gfs2_inode *)gl->gl_object;
693 if (ip)
694 no_addr = ip->i_no_addr;
695 spin_unlock(&gl->gl_spin);
696 if (ip) {
697 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
698 if (inode) {
699 d_prune_aliases(inode);
700 iput(inode);
701 }
702 }
703 gfs2_glock_put(gl);
704}
705
642static void glock_work_func(struct work_struct *work) 706static void glock_work_func(struct work_struct *work)
643{ 707{
644 unsigned long delay = 0; 708 unsigned long delay = 0;
@@ -717,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
717 gl->gl_sbd = sdp; 781 gl->gl_sbd = sdp;
718 gl->gl_aspace = NULL; 782 gl->gl_aspace = NULL;
719 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 783 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
784 INIT_WORK(&gl->gl_delete, delete_work_func);
720 785
721 /* If this glock protects actual on-disk data or metadata blocks, 786 /* If this glock protects actual on-disk data or metadata blocks,
722 create a VFS inode to manage the pages/buffers holding them. */ 787 create a VFS inode to manage the pages/buffers holding them. */
@@ -858,6 +923,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
858 gl->gl_demote_state != state) { 923 gl->gl_demote_state != state) {
859 gl->gl_demote_state = LM_ST_UNLOCKED; 924 gl->gl_demote_state = LM_ST_UNLOCKED;
860 } 925 }
926 if (gl->gl_ops->go_callback)
927 gl->gl_ops->go_callback(gl);
861 trace_gfs2_demote_rq(gl); 928 trace_gfs2_demote_rq(gl);
862} 929}
863 930
@@ -1274,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1274 gfs2_glock_put(gl); 1341 gfs2_glock_put(gl);
1275} 1342}
1276 1343
1277/**
1278 * demote_ok - Check to see if it's ok to unlock a glock
1279 * @gl: the glock
1280 *
1281 * Returns: 1 if it's ok
1282 */
1283
1284static int demote_ok(const struct gfs2_glock *gl)
1285{
1286 const struct gfs2_glock_operations *glops = gl->gl_ops;
1287
1288 if (gl->gl_state == LM_ST_UNLOCKED)
1289 return 0;
1290 if (!list_empty(&gl->gl_holders))
1291 return 0;
1292 if (glops->go_demote_ok)
1293 return glops->go_demote_ok(gl);
1294 return 1;
1295}
1296
1297 1344
1298static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) 1345static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1299{ 1346{
1300 struct gfs2_glock *gl; 1347 struct gfs2_glock *gl;
1301 int may_demote; 1348 int may_demote;
1302 int nr_skipped = 0; 1349 int nr_skipped = 0;
1303 int got_ref = 0;
1304 LIST_HEAD(skipped); 1350 LIST_HEAD(skipped);
1305 1351
1306 if (nr == 0) 1352 if (nr == 0)
@@ -1315,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1315 list_del_init(&gl->gl_lru); 1361 list_del_init(&gl->gl_lru);
1316 atomic_dec(&lru_count); 1362 atomic_dec(&lru_count);
1317 1363
1364 /* Check if glock is about to be freed */
1365 if (atomic_read(&gl->gl_ref) == 0)
1366 continue;
1367
1318 /* Test for being demotable */ 1368 /* Test for being demotable */
1319 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 1369 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
1320 gfs2_glock_hold(gl); 1370 gfs2_glock_hold(gl);
1321 got_ref = 1;
1322 spin_unlock(&lru_lock); 1371 spin_unlock(&lru_lock);
1323 spin_lock(&gl->gl_spin); 1372 spin_lock(&gl->gl_spin);
1324 may_demote = demote_ok(gl); 1373 may_demote = demote_ok(gl);
1325 spin_unlock(&gl->gl_spin);
1326 clear_bit(GLF_LOCK, &gl->gl_flags);
1327 if (may_demote) { 1374 if (may_demote) {
1328 handle_callback(gl, LM_ST_UNLOCKED, 0); 1375 handle_callback(gl, LM_ST_UNLOCKED, 0);
1329 nr--; 1376 nr--;
1330 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1331 gfs2_glock_put(gl);
1332 got_ref = 0;
1333 } 1377 }
1378 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1379 gfs2_glock_put_nolock(gl);
1380 spin_unlock(&gl->gl_spin);
1381 clear_bit(GLF_LOCK, &gl->gl_flags);
1334 spin_lock(&lru_lock); 1382 spin_lock(&lru_lock);
1335 if (may_demote) 1383 continue;
1336 continue;
1337 }
1338 if (list_empty(&gl->gl_lru) &&
1339 (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
1340 nr_skipped++;
1341 list_add(&gl->gl_lru, &skipped);
1342 }
1343 if (got_ref) {
1344 spin_unlock(&lru_lock);
1345 gfs2_glock_put(gl);
1346 spin_lock(&lru_lock);
1347 got_ref = 0;
1348 } 1384 }
1385 nr_skipped++;
1386 list_add(&gl->gl_lru, &skipped);
1349 } 1387 }
1350 list_splice(&skipped, &lru_list); 1388 list_splice(&skipped, &lru_list);
1351 atomic_add(nr_skipped, &lru_count); 1389 atomic_add(nr_skipped, &lru_count);
@@ -1727,6 +1765,11 @@ int __init gfs2_glock_init(void)
1727 glock_workqueue = create_workqueue("glock_workqueue"); 1765 glock_workqueue = create_workqueue("glock_workqueue");
1728 if (IS_ERR(glock_workqueue)) 1766 if (IS_ERR(glock_workqueue))
1729 return PTR_ERR(glock_workqueue); 1767 return PTR_ERR(glock_workqueue);
1768 gfs2_delete_workqueue = create_workqueue("delete_workqueue");
1769 if (IS_ERR(gfs2_delete_workqueue)) {
1770 destroy_workqueue(glock_workqueue);
1771 return PTR_ERR(gfs2_delete_workqueue);
1772 }
1730 1773
1731 register_shrinker(&glock_shrinker); 1774 register_shrinker(&glock_shrinker);
1732 1775
@@ -1737,6 +1780,7 @@ void gfs2_glock_exit(void)
1737{ 1780{
1738 unregister_shrinker(&glock_shrinker); 1781 unregister_shrinker(&glock_shrinker);
1739 destroy_workqueue(glock_workqueue); 1782 destroy_workqueue(glock_workqueue);
1783 destroy_workqueue(gfs2_delete_workqueue);
1740} 1784}
1741 1785
1742static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1786static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
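
In glock.c above, demote_ok() moves ahead of gfs2_glock_schedule_for_reclaim() so the LRU is only fed glocks that are actually reclaimable: demotable and, apart from the VFS inode's extra reference on inode glocks, unreferenced. gfs2_glock_put() now takes gl_spin and calls the scheduler unconditionally, and gfs2_glock_put_nolock() covers callers that already hold a reference. A userspace model of the new may_reclaim test follows; the structures are mocked, illustrative only.

#include <stdio.h>

enum glock_type { LM_TYPE_INODE = 1, LM_TYPE_RGRP = 2 };

struct glock {
	enum glock_type type;
	int refcount;   /* stands in for atomic_read(&gl->gl_ref) */
	int demotable;  /* stands in for demote_ok(gl) */
};

/* Mirrors the new test in gfs2_glock_schedule_for_reclaim(): inode
 * glocks may carry one extra reference for their VFS inode. */
static int may_reclaim(const struct glock *gl)
{
	if (!gl->demotable)
		return 0;
	if (gl->refcount == 1)
		return 1;
	return gl->type == LM_TYPE_INODE && gl->refcount <= 2;
}

int main(void)
{
	struct glock inode_gl = { LM_TYPE_INODE, 2, 1 };
	struct glock rgrp_gl  = { LM_TYPE_RGRP,  2, 1 };

	printf("inode: %d, rgrp: %d\n",
	       may_reclaim(&inode_gl), may_reclaim(&rgrp_gl));  /* 1, 0 */
	return 0;
}
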
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index a602a28f6f08..c609894ec0d0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -143,6 +143,7 @@ struct lm_lockops {
143 143
144#define GLR_TRYFAILED 13 144#define GLR_TRYFAILED 13
145 145
146extern struct workqueue_struct *gfs2_delete_workqueue;
146static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 147static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
147{ 148{
148 struct gfs2_holder *gh; 149 struct gfs2_holder *gh;
@@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
191int gfs2_glock_get(struct gfs2_sbd *sdp, 192int gfs2_glock_get(struct gfs2_sbd *sdp,
192 u64 number, const struct gfs2_glock_operations *glops, 193 u64 number, const struct gfs2_glock_operations *glops,
193 int create, struct gfs2_glock **glp); 194 int create, struct gfs2_glock **glp);
195void gfs2_glock_hold(struct gfs2_glock *gl);
196void gfs2_glock_put_nolock(struct gfs2_glock *gl);
194int gfs2_glock_put(struct gfs2_glock *gl); 197int gfs2_glock_put(struct gfs2_glock *gl);
195void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 198void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
196 struct gfs2_holder *gh); 199 struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d5e4ab155ca0..6985eef06c39 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -323,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
323 323
324 if (gl->gl_state != LM_ST_UNLOCKED && 324 if (gl->gl_state != LM_ST_UNLOCKED &&
325 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 325 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
326 flush_workqueue(gfs2_delete_workqueue);
326 gfs2_meta_syncfs(sdp); 327 gfs2_meta_syncfs(sdp);
327 gfs2_log_shutdown(sdp); 328 gfs2_log_shutdown(sdp);
328 } 329 }
@@ -372,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
372 return 0; 373 return 0;
373} 374}
374 375
376/**
377 * iopen_go_callback - schedule the dcache entry for the inode to be deleted
378 * @gl: the glock
379 *
380 * gl_spin lock is held while calling this
381 */
382static void iopen_go_callback(struct gfs2_glock *gl)
383{
384 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
385
386 if (gl->gl_demote_state == LM_ST_UNLOCKED &&
387 gl->gl_state == LM_ST_SHARED &&
388 ip && test_bit(GIF_USER, &ip->i_flags)) {
389 gfs2_glock_hold(gl);
390 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
391 gfs2_glock_put_nolock(gl);
392 }
393}
394
375const struct gfs2_glock_operations gfs2_meta_glops = { 395const struct gfs2_glock_operations gfs2_meta_glops = {
376 .go_type = LM_TYPE_META, 396 .go_type = LM_TYPE_META,
377}; 397};
@@ -406,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = {
406 426
407const struct gfs2_glock_operations gfs2_iopen_glops = { 427const struct gfs2_glock_operations gfs2_iopen_glops = {
408 .go_type = LM_TYPE_IOPEN, 428 .go_type = LM_TYPE_IOPEN,
429 .go_callback = iopen_go_callback,
409}; 430};
410 431
411const struct gfs2_glock_operations gfs2_flock_glops = { 432const struct gfs2_glock_operations gfs2_flock_glops = {
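
iopen_go_callback() runs with gl_spin held, so it cannot touch the dcache directly; instead it takes a glock reference and queues gl_delete on gfs2_delete_workqueue, where delete_work_func() looks the inode up and prunes its dentry aliases. If queue_work() returns 0 the work item was already pending and the reference is dropped again with gfs2_glock_put_nolock(). A userspace sketch of that hold-then-queue idiom follows; the queue and refcount are mocked, illustrative only.

#include <stdio.h>

struct glock { int ref; int delete_pending; };

static void glock_hold(struct glock *gl)       { gl->ref++; }
static void glock_put_nolock(struct glock *gl) { gl->ref--; }

/* Mirrors queue_work(): nonzero if newly queued, 0 if already pending. */
static int queue_delete_work(struct glock *gl)
{
	if (gl->delete_pending)
		return 0;
	gl->delete_pending = 1;
	return 1;
}

/* Mirrors iopen_go_callback(): the reference taken here is owned by
 * the work item and released by delete_work_func() when it runs; if
 * the work was already queued, give the reference straight back. */
static void schedule_delete(struct glock *gl)
{
	glock_hold(gl);
	if (queue_delete_work(gl) == 0)
		glock_put_nolock(gl);
}

int main(void)
{
	struct glock gl = { .ref = 1 };

	schedule_delete(&gl);
	schedule_delete(&gl);  /* second call is a no-op */
	printf("ref=%d pending=%d\n", gl.ref, gl.delete_pending);  /* 2 1 */
	return 0;
}
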
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 225347fbff3c..6edb423f90b3 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -159,6 +159,7 @@ struct gfs2_glock_operations {
159 int (*go_lock) (struct gfs2_holder *gh); 159 int (*go_lock) (struct gfs2_holder *gh);
160 void (*go_unlock) (struct gfs2_holder *gh); 160 void (*go_unlock) (struct gfs2_holder *gh);
161 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 161 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
162 void (*go_callback) (struct gfs2_glock *gl);
162 const int go_type; 163 const int go_type;
163 const unsigned long go_min_hold_time; 164 const unsigned long go_min_hold_time;
164}; 165};
@@ -228,6 +229,7 @@ struct gfs2_glock {
228 struct list_head gl_ail_list; 229 struct list_head gl_ail_list;
229 atomic_t gl_ail_count; 230 atomic_t gl_ail_count;
230 struct delayed_work gl_work; 231 struct delayed_work gl_work;
232 struct work_struct gl_delete;
231}; 233};
232 234
233#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ 235#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -404,6 +406,12 @@ struct gfs2_statfs_change_host {
404#define GFS2_DATA_WRITEBACK 1 406#define GFS2_DATA_WRITEBACK 1
405#define GFS2_DATA_ORDERED 2 407#define GFS2_DATA_ORDERED 2
406 408
409#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW
410#define GFS2_ERRORS_WITHDRAW 0
411#define GFS2_ERRORS_CONTINUE 1 /* placeholder for future feature */
412#define GFS2_ERRORS_RO 2 /* placeholder for future feature */
413#define GFS2_ERRORS_PANIC 3
414
407struct gfs2_args { 415struct gfs2_args {
408 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ 416 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
409 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 417 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
@@ -420,6 +428,7 @@ struct gfs2_args {
420 unsigned int ar_data:2; /* ordered/writeback */ 428 unsigned int ar_data:2; /* ordered/writeback */
421 unsigned int ar_meta:1; /* mount metafs */ 429 unsigned int ar_meta:1; /* mount metafs */
422 unsigned int ar_discard:1; /* discard requests */ 430 unsigned int ar_discard:1; /* discard requests */
431 unsigned int ar_errors:2; /* errors=withdraw | panic */
423 int ar_commit; /* Commit interval */ 432 int ar_commit; /* Commit interval */
424}; 433};
425 434
@@ -487,7 +496,6 @@ struct gfs2_sb_host {
487 */ 496 */
488 497
489struct lm_lockstruct { 498struct lm_lockstruct {
490 u32 ls_id;
491 unsigned int ls_jid; 499 unsigned int ls_jid;
492 unsigned int ls_first; 500 unsigned int ls_first;
493 unsigned int ls_first_done; 501 unsigned int ls_first_done;
@@ -539,18 +547,12 @@ struct gfs2_sbd {
539 struct dentry *sd_root_dir; 547 struct dentry *sd_root_dir;
540 548
541 struct inode *sd_jindex; 549 struct inode *sd_jindex;
542 struct inode *sd_inum_inode;
543 struct inode *sd_statfs_inode; 550 struct inode *sd_statfs_inode;
544 struct inode *sd_ir_inode;
545 struct inode *sd_sc_inode; 551 struct inode *sd_sc_inode;
546 struct inode *sd_qc_inode; 552 struct inode *sd_qc_inode;
547 struct inode *sd_rindex; 553 struct inode *sd_rindex;
548 struct inode *sd_quota_inode; 554 struct inode *sd_quota_inode;
549 555
550 /* Inum stuff */
551
552 struct mutex sd_inum_mutex;
553
554 /* StatFS stuff */ 556 /* StatFS stuff */
555 557
556 spinlock_t sd_statfs_spin; 558 spinlock_t sd_statfs_spin;
@@ -578,7 +580,6 @@ struct gfs2_sbd {
578 struct gfs2_holder sd_journal_gh; 580 struct gfs2_holder sd_journal_gh;
579 struct gfs2_holder sd_jinode_gh; 581 struct gfs2_holder sd_jinode_gh;
580 582
581 struct gfs2_holder sd_ir_gh;
582 struct gfs2_holder sd_sc_gh; 583 struct gfs2_holder sd_sc_gh;
583 struct gfs2_holder sd_qc_gh; 584 struct gfs2_holder sd_qc_gh;
584 585
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2f94bd723698..fb15d3b1f409 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -24,7 +24,7 @@
24#include "acl.h" 24#include "acl.h"
25#include "bmap.h" 25#include "bmap.h"
26#include "dir.h" 26#include "dir.h"
27#include "eattr.h" 27#include "xattr.h"
28#include "glock.h" 28#include "glock.h"
29#include "glops.h" 29#include "glops.h"
30#include "inode.h" 30#include "inode.h"
@@ -519,139 +519,6 @@ out:
519 return inode ? inode : ERR_PTR(error); 519 return inode ? inode : ERR_PTR(error);
520} 520}
521 521
522static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
523{
524 const struct gfs2_inum_range *str = buf;
525
526 ir->ir_start = be64_to_cpu(str->ir_start);
527 ir->ir_length = be64_to_cpu(str->ir_length);
528}
529
530static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
531{
532 struct gfs2_inum_range *str = buf;
533
534 str->ir_start = cpu_to_be64(ir->ir_start);
535 str->ir_length = cpu_to_be64(ir->ir_length);
536}
537
538static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
539{
540 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
541 struct buffer_head *bh;
542 struct gfs2_inum_range_host ir;
543 int error;
544
545 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
546 if (error)
547 return error;
548 mutex_lock(&sdp->sd_inum_mutex);
549
550 error = gfs2_meta_inode_buffer(ip, &bh);
551 if (error) {
552 mutex_unlock(&sdp->sd_inum_mutex);
553 gfs2_trans_end(sdp);
554 return error;
555 }
556
557 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
558
559 if (ir.ir_length) {
560 *formal_ino = ir.ir_start++;
561 ir.ir_length--;
562 gfs2_trans_add_bh(ip->i_gl, bh, 1);
563 gfs2_inum_range_out(&ir,
564 bh->b_data + sizeof(struct gfs2_dinode));
565 brelse(bh);
566 mutex_unlock(&sdp->sd_inum_mutex);
567 gfs2_trans_end(sdp);
568 return 0;
569 }
570
571 brelse(bh);
572
573 mutex_unlock(&sdp->sd_inum_mutex);
574 gfs2_trans_end(sdp);
575
576 return 1;
577}
578
579static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
580{
581 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
582 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
583 struct gfs2_holder gh;
584 struct buffer_head *bh;
585 struct gfs2_inum_range_host ir;
586 int error;
587
588 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
589 if (error)
590 return error;
591
592 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
593 if (error)
594 goto out;
595 mutex_lock(&sdp->sd_inum_mutex);
596
597 error = gfs2_meta_inode_buffer(ip, &bh);
598 if (error)
599 goto out_end_trans;
600
601 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
602
603 if (!ir.ir_length) {
604 struct buffer_head *m_bh;
605 u64 x, y;
606 __be64 z;
607
608 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
609 if (error)
610 goto out_brelse;
611
612 z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
613 x = y = be64_to_cpu(z);
614 ir.ir_start = x;
615 ir.ir_length = GFS2_INUM_QUANTUM;
616 x += GFS2_INUM_QUANTUM;
617 if (x < y)
618 gfs2_consist_inode(m_ip);
619 z = cpu_to_be64(x);
620 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
621 *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
622
623 brelse(m_bh);
624 }
625
626 *formal_ino = ir.ir_start++;
627 ir.ir_length--;
628
629 gfs2_trans_add_bh(ip->i_gl, bh, 1);
630 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
631
632out_brelse:
633 brelse(bh);
634out_end_trans:
635 mutex_unlock(&sdp->sd_inum_mutex);
636 gfs2_trans_end(sdp);
637out:
638 gfs2_glock_dq_uninit(&gh);
639 return error;
640}
641
642static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
643{
644 int error;
645
646 error = pick_formal_ino_1(sdp, inum);
647 if (error <= 0)
648 return error;
649
650 error = pick_formal_ino_2(sdp, inum);
651
652 return error;
653}
654
655/** 522/**
656 * create_ok - OK to create a new on-disk inode here? 523 * create_ok - OK to create a new on-disk inode here?
657 * @dip: Directory in which dinode is to be created 524 * @dip: Directory in which dinode is to be created
@@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
731 if (error) 598 if (error)
732 goto out_ipreserv; 599 goto out_ipreserv;
733 600
734 *no_addr = gfs2_alloc_di(dip, generation); 601 error = gfs2_alloc_di(dip, no_addr, generation);
735 602
736 gfs2_trans_end(sdp); 603 gfs2_trans_end(sdp);
737 604
@@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
924 size_t len; 791 size_t len;
925 void *value; 792 void *value;
926 char *name; 793 char *name;
927 struct gfs2_ea_request er;
928 794
929 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, 795 err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
930 &name, &value, &len); 796 &name, &value, &len);
@@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
935 return err; 801 return err;
936 } 802 }
937 803
938 memset(&er, 0, sizeof(struct gfs2_ea_request)); 804 err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
939
940 er.er_type = GFS2_EATYPE_SECURITY;
941 er.er_name = name;
942 er.er_data = value;
943 er.er_name_len = strlen(name);
944 er.er_data_len = len;
945
946 err = gfs2_ea_set_i(ip, &er);
947
948 kfree(value); 805 kfree(value);
949 kfree(name); 806 kfree(name);
950 807
@@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
991 if (error) 848 if (error)
992 goto fail_gunlock; 849 goto fail_gunlock;
993 850
994 error = pick_formal_ino(sdp, &inum.no_formal_ino);
995 if (error)
996 goto fail_gunlock;
997
998 error = alloc_dinode(dip, &inum.no_addr, &generation); 851 error = alloc_dinode(dip, &inum.no_addr, &generation);
999 if (error) 852 if (error)
1000 goto fail_gunlock; 853 goto fail_gunlock;
854 inum.no_formal_ino = generation;
1001 855
1002 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, 856 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
1003 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); 857 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
@@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1008 if (error) 862 if (error)
1009 goto fail_gunlock2; 863 goto fail_gunlock2;
1010 864
1011 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), 865 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
1012 inum.no_addr, 866 inum.no_formal_ino, 0);
1013 inum.no_formal_ino, 0);
1014 if (IS_ERR(inode)) 867 if (IS_ERR(inode))
1015 goto fail_gunlock2; 868 goto fail_gunlock2;
1016 869
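
The inode.c changes above delete the two-stage pick_formal_ino() range allocator along with the per-node "inum_range" files (see the ops_fstype.c hunks below); alloc_dinode() now returns the block address through a pointer and the dinode generation number doubles as the formal inode number. A userspace sketch of the simplified flow follows; alloc_dinode() is mocked, illustrative only.

#include <stdio.h>

struct inum { unsigned long long no_addr, no_formal_ino; };

/* Mock allocator: the real alloc_dinode() picks a free block and bumps
 * the dinode generation inside a transaction. */
static int alloc_dinode_mock(unsigned long long *no_addr,
			     unsigned long long *generation)
{
	static unsigned long long next_blk = 1000, next_gen = 1;

	*no_addr = next_blk++;
	*generation = next_gen++;
	return 0;
}

int main(void)
{
	struct inum inum;
	unsigned long long generation;

	if (alloc_dinode_mock(&inum.no_addr, &generation))
		return 1;
	inum.no_formal_ino = generation;  /* replaces pick_formal_ino() */
	printf("no_addr=%llu no_formal_ino=%llu\n",
	       inum.no_addr, inum.no_formal_ino);
	return 0;
}
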
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7bc3c45cd676..52fb6c048981 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
84 84
85 gfs2_tune_init(&sdp->sd_tune); 85 gfs2_tune_init(&sdp->sd_tune);
86 86
87 mutex_init(&sdp->sd_inum_mutex);
88 spin_lock_init(&sdp->sd_statfs_spin); 87 spin_lock_init(&sdp->sd_statfs_spin);
89 88
90 spin_lock_init(&sdp->sd_rindex_spin); 89 spin_lock_init(&sdp->sd_rindex_spin);
@@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
833 if (error) 832 if (error)
834 goto fail; 833 goto fail;
835 834
836 /* Read in the master inode number inode */
837 sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
838 if (IS_ERR(sdp->sd_inum_inode)) {
839 error = PTR_ERR(sdp->sd_inum_inode);
840 fs_err(sdp, "can't read in inum inode: %d\n", error);
841 goto fail_journal;
842 }
843
844
845 /* Read in the master statfs inode */ 835 /* Read in the master statfs inode */
846 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); 836 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
847 if (IS_ERR(sdp->sd_statfs_inode)) { 837 if (IS_ERR(sdp->sd_statfs_inode)) {
848 error = PTR_ERR(sdp->sd_statfs_inode); 838 error = PTR_ERR(sdp->sd_statfs_inode);
849 fs_err(sdp, "can't read in statfs inode: %d\n", error); 839 fs_err(sdp, "can't read in statfs inode: %d\n", error);
850 goto fail_inum; 840 goto fail_journal;
851 } 841 }
852 842
853 /* Read in the resource index inode */ 843 /* Read in the resource index inode */
@@ -876,8 +866,6 @@ fail_rindex:
876 iput(sdp->sd_rindex); 866 iput(sdp->sd_rindex);
877fail_statfs: 867fail_statfs:
878 iput(sdp->sd_statfs_inode); 868 iput(sdp->sd_statfs_inode);
879fail_inum:
880 iput(sdp->sd_inum_inode);
881fail_journal: 869fail_journal:
882 init_journal(sdp, UNDO); 870 init_journal(sdp, UNDO);
883fail: 871fail:
@@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
905 return error; 893 return error;
906 } 894 }
907 895
908 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
909 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
910 if (IS_ERR(sdp->sd_ir_inode)) {
911 error = PTR_ERR(sdp->sd_ir_inode);
912 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
913 goto fail;
914 }
915
916 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); 896 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
917 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); 897 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
918 if (IS_ERR(sdp->sd_sc_inode)) { 898 if (IS_ERR(sdp->sd_sc_inode)) {
919 error = PTR_ERR(sdp->sd_sc_inode); 899 error = PTR_ERR(sdp->sd_sc_inode);
920 fs_err(sdp, "can't find local \"sc\" file: %d\n", error); 900 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
921 goto fail_ir_i; 901 goto fail;
922 } 902 }
923 903
924 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); 904 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
@@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
 	iput(pn);
 	pn = NULL;
 
-	ip = GFS2_I(sdp->sd_ir_inode);
-	error = gfs2_glock_nq_init(ip->i_gl,
-				   LM_ST_EXCLUSIVE, 0,
-				   &sdp->sd_ir_gh);
-	if (error) {
-		fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
-		goto fail_qc_i;
-	}
-
 	ip = GFS2_I(sdp->sd_sc_inode);
-	error = gfs2_glock_nq_init(ip->i_gl,
-				   LM_ST_EXCLUSIVE, 0,
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
 				   &sdp->sd_sc_gh);
 	if (error) {
 		fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
-		goto fail_ir_gh;
+		goto fail_qc_i;
 	}
 
 	ip = GFS2_I(sdp->sd_qc_inode);
-	error = gfs2_glock_nq_init(ip->i_gl,
-				   LM_ST_EXCLUSIVE, 0,
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
 				   &sdp->sd_qc_gh);
 	if (error) {
 		fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -965,14 +934,10 @@ fail_qc_gh:
 	gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
 fail_ut_gh:
 	gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-fail_ir_gh:
-	gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
 fail_qc_i:
 	iput(sdp->sd_qc_inode);
 fail_ut_i:
 	iput(sdp->sd_sc_inode);
-fail_ir_i:
-	iput(sdp->sd_ir_inode);
 fail:
 	if (pn)
 		iput(pn);
@@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 
 	ls->ls_ops = lm;
 	ls->ls_first = 1;
-	ls->ls_id = 0;
 
 	for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
 		substring_t tmp[MAX_OPT_ARGS];
@@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 			ls->ls_jid = option;
 			break;
 		case Opt_id:
-			ret = match_int(&tmp[0], &option);
-			if (ret)
-				goto hostdata_error;
-			ls->ls_id = option;
+			/* Obsolete, but left for backward compat purposes */
 			break;
 		case Opt_first:
 			ret = match_int(&tmp[0], &option);
@@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
 		lm->lm_unmount(sdp);
 }
 
+void gfs2_online_uevent(struct gfs2_sbd *sdp)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	char ro[20];
+	char spectator[20];
+	char *envp[] = { ro, spectator, NULL };
+	sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
+}
+
 /**
  * fill_super - Read in superblock
  * @sb: The VFS superblock
@@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
 	sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
 	sdp->sd_args.ar_commit = 60;
+	sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
 
 	error = gfs2_mount_args(sdp, &sdp->sd_args, data);
 	if (error) {
@@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = GFS2_MAGIC;
 	sb->s_op = &gfs2_super_ops;
 	sb->s_export_op = &gfs2_export_ops;
+	sb->s_xattr = gfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
@@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	gfs2_glock_dq_uninit(&mount_gh);
-
+	gfs2_online_uevent(sdp);
 	return 0;
 
 fail_threads:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f8bd20baf99c..c3ac18054057 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -26,8 +26,7 @@
 #include "acl.h"
 #include "bmap.h"
 #include "dir.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
 #include "glock.h"
 #include "inode.h"
 #include "meta_io.h"
@@ -349,7 +348,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 
 	error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
 	if (error)
-		goto out_rgrp;
+		goto out_gunlock;
 
 	error = gfs2_dir_del(dip, &dentry->d_name);
 	if (error)
@@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
 			 const void *data, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = gfs2_ea_name2type(name, &er.er_name);
-	if (er.er_type == GFS2_EATYPE_UNUSED)
-		return -EOPNOTSUPP;
-	er.er_data = (char *)data;
-	er.er_name_len = strlen(er.er_name);
-	er.er_data_len = size;
-	er.er_flags = flags;
-
-	gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
 
-	return gfs2_ea_set(GFS2_I(inode), &er);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_setxattr(dentry, name, data, size, flags);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
 }
 
 static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
 			     void *data, size_t size)
 {
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = gfs2_ea_name2type(name, &er.er_name);
-	if (er.er_type == GFS2_EATYPE_UNUSED)
-		return -EOPNOTSUPP;
-	er.er_data = data;
-	er.er_name_len = strlen(er.er_name);
-	er.er_data_len = size;
-
-	return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
-}
-
-static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_data = (size) ? buffer : NULL;
-	er.er_data_len = size;
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
 
-	return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_getxattr(dentry, name, data, size);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
 }
 
 static int gfs2_removexattr(struct dentry *dentry, const char *name)
 {
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = gfs2_ea_name2type(name, &er.er_name);
-	if (er.er_type == GFS2_EATYPE_UNUSED)
-		return -EOPNOTSUPP;
-	er.er_name_len = strlen(er.er_name);
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
 
-	return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_removexattr(dentry, name);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
 }
 
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index daa4ae341a29..28c590b7c9da 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 	}
 
 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
-	if (count[1] + count[2] != tmp) {
+	if (count[1] != tmp) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "used data mismatch: %u != %u\n",
 			       count[1], tmp);
 		return;
 	}
 
-	if (count[3] != rgd->rd_dinodes) {
+	if (count[2] + count[3] != rgd->rd_dinodes) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "used metadata mismatch: %u != %u\n",
-			       count[3], rgd->rd_dinodes);
+			       count[2] + count[3], rgd->rd_dinodes);
 		return;
 	}
-
-	if (count[2] > count[3]) {
-		if (gfs2_consist_rgrpd(rgd))
-			fs_err(sdp, "unlinked inodes > inodes: %u\n",
-			       count[2]);
-		return;
-	}
-
 }
 
 static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
@@ -865,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				goto start_new_extent;
 			if ((start + nr_sects) != blk) {
 				rv = blkdev_issue_discard(bdev, start,
-							  nr_sects, GFP_NOFS);
+							  nr_sects, GFP_NOFS,
+							  DISCARD_FL_BARRIER);
 				if (rv)
 					goto fail;
 				nr_sects = 0;
@@ -879,7 +872,8 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
+					  DISCARD_FL_BARRIER);
 		if (rv)
 			goto fail;
 	}
@@ -961,7 +955,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
  * Returns: The inode, if one has been found
  */
 
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
+static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+				     u64 skip)
 {
 	struct inode *inode;
 	u32 goal = 0, block;
@@ -985,6 +980,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 			goal++;
 		if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
 			continue;
+		if (no_addr == skip)
+			continue;
 		*last_unlinked = no_addr;
 		inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
 					  no_addr, -1, 1);
@@ -1104,7 +1101,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		if (try_rgrp_fit(rgd, al))
 			goto out;
 		if (rgd->rd_flags & GFS2_RDF_CHECK)
-			inode = try_rgrp_unlink(rgd, last_unlinked);
+			inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 		if (!rg_locked)
 			gfs2_glock_dq_uninit(&al->al_rgd_gh);
 		if (inode)
@@ -1138,7 +1135,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		if (try_rgrp_fit(rgd, al))
 			goto out;
 		if (rgd->rd_flags & GFS2_RDF_CHECK)
-			inode = try_rgrp_unlink(rgd, last_unlinked);
+			inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 		if (!rg_locked)
 			gfs2_glock_dq_uninit(&al->al_rgd_gh);
 		if (inode)
@@ -1261,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
  * Returns: The block type (GFS2_BLKST_*)
  */
 
-unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
 {
 	struct gfs2_bitmap *bi = NULL;
 	u32 length, rgrp_block, buf_block;
@@ -1464,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	return 0;
 }
 
+static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+		(unsigned long long)rgd->rd_addr);
+	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+	gfs2_rgrp_dump(NULL, rgd->rd_gl);
+	rgd->rd_flags |= GFS2_RDF_ERROR;
+}
+
 /**
  * gfs2_alloc_block - Allocate one or more blocks
  * @ip: the inode to allocate the block for
@@ -1525,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	return 0;
 
 rgrp_error:
-	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
-		(unsigned long long)rgd->rd_addr);
-	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
-	gfs2_rgrp_dump(NULL, rgd->rd_gl);
-	rgd->rd_flags |= GFS2_RDF_ERROR;
+	gfs2_rgrp_error(rgd);
 	return -EIO;
 }
 
 /**
  * gfs2_alloc_di - Allocate a dinode
  * @dip: the directory that the inode is going in
+ * @bn: the block number which is allocated
+ * @generation: the generation number of the inode
  *
- * Returns: the block allocated
+ * Returns: 0 on success or error
  */
 
-u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
+int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_alloc *al = dip->i_alloc;
@@ -1551,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 
 	blk = rgblk_search(rgd, rgd->rd_last_alloc,
 			   GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
-	BUG_ON(blk == BFITNOENT);
 
-	rgd->rd_last_alloc = blk;
+	/* Since all blocks are reserved in advance, this shouldn't happen */
+	if (blk == BFITNOENT)
+		goto rgrp_error;
 
+	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
+	if (rgd->rd_free == 0)
+		goto rgrp_error;
 
-	gfs2_assert_withdraw(sdp, rgd->rd_free);
 	rgd->rd_free--;
 	rgd->rd_dinodes++;
 	*generation = rgd->rd_igeneration++;
+	if (*generation == 0)
+		*generation = rgd->rd_igeneration++;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
@@ -1573,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 	rgd->rd_free_clone--;
 	spin_unlock(&sdp->sd_rindex_spin);
 	trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
-	return block;
+	*bn = block;
+	return 0;
+
+rgrp_error:
+	gfs2_rgrp_error(rgd);
+	return -EIO;
 }
 
 /**
@@ -1681,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 }
 
 /**
+ * gfs2_check_blk_type - Check the type of a block
+ * @sdp: The superblock
+ * @no_addr: The block number to check
+ * @type: The block type we are looking for
+ *
+ * Returns: 0 if the block type matches the expected type
+ *          -ESTALE if it doesn't match
+ *          or -ve errno if something went wrong while checking
+ */
+
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
+{
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder ri_gh, rgd_gh;
+	int error;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto fail;
+
+	error = -EINVAL;
+	rgd = gfs2_blk2rgrpd(sdp, no_addr);
+	if (!rgd)
+		goto fail_rindex;
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+	if (error)
+		goto fail_rindex;
+
+	if (gfs2_get_block_type(rgd, no_addr) != type)
+		error = -ESTALE;
+
+	gfs2_glock_dq_uninit(&rgd_gh);
+fail_rindex:
+	gfs2_glock_dq_uninit(&ri_gh);
+fail:
+	return error;
+}
+
+/**
  * gfs2_rlist_add - add a RG to a list of RGs
  * @sdp: the filesystem
  * @rlist: the list of resource groups
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 1e76ff0f3e00..b4106ddaaa98 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
-extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
-
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
-extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
 extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
 extern void gfs2_unlink_di(struct inode *inode);
+extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+			       unsigned int type);
 
 struct gfs2_rgrp_list {
 	unsigned int rl_rgrps;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0a6801336470..0ec3ec672de1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -38,7 +38,7 @@
 #include "trans.h"
 #include "util.h"
 #include "sys.h"
-#include "eattr.h"
+#include "xattr.h"
 
 #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
 
@@ -68,6 +68,8 @@ enum {
 	Opt_discard,
 	Opt_nodiscard,
 	Opt_commit,
+	Opt_err_withdraw,
+	Opt_err_panic,
 	Opt_error,
 };
 
73 75
@@ -97,6 +99,8 @@ static const match_table_t tokens = {
97 {Opt_discard, "discard"}, 99 {Opt_discard, "discard"},
98 {Opt_nodiscard, "nodiscard"}, 100 {Opt_nodiscard, "nodiscard"},
99 {Opt_commit, "commit=%d"}, 101 {Opt_commit, "commit=%d"},
102 {Opt_err_withdraw, "errors=withdraw"},
103 {Opt_err_panic, "errors=panic"},
100 {Opt_error, NULL} 104 {Opt_error, NULL}
101}; 105};
102 106
@@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 			args->ar_localcaching = 1;
 			break;
 		case Opt_debug:
+			if (args->ar_errors == GFS2_ERRORS_PANIC) {
+				fs_info(sdp, "-o debug and -o errors=panic "
+					"are mutually exclusive.\n");
+				return -EINVAL;
+			}
 			args->ar_debug = 1;
 			break;
 		case Opt_nodebug:
@@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 				return rv ? rv : -EINVAL;
 			}
 			break;
+		case Opt_err_withdraw:
+			args->ar_errors = GFS2_ERRORS_WITHDRAW;
+			break;
+		case Opt_err_panic:
+			if (args->ar_debug) {
+				fs_info(sdp, "-o debug and -o errors=panic "
+					"are mutually exclusive.\n");
+				return -EINVAL;
+			}
+			args->ar_errors = GFS2_ERRORS_PANIC;
+			break;
 		case Opt_error:
 		default:
 			fs_info(sdp, "invalid mount option: %s\n", o);
@@ -353,7 +373,7 @@ fail:
 	return error;
 }
 
-static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
 {
 	const struct gfs2_statfs_change *str = buf;
 
@@ -441,6 +461,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 	brelse(l_bh);
 }
 
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+		   struct buffer_head *l_bh)
+{
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+
+	spin_lock(&sdp->sd_statfs_spin);
+	m_sc->sc_total += l_sc->sc_total;
+	m_sc->sc_free += l_sc->sc_free;
+	m_sc->sc_dinodes += l_sc->sc_dinodes;
+	memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
+	memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+	       0, sizeof(struct gfs2_statfs_change));
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+}
+
 int gfs2_statfs_sync(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -477,19 +520,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
 	if (error)
 		goto out_bh2;
 
-	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
-
-	spin_lock(&sdp->sd_statfs_spin);
-	m_sc->sc_total += l_sc->sc_total;
-	m_sc->sc_free += l_sc->sc_free;
-	m_sc->sc_dinodes += l_sc->sc_dinodes;
-	memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
-	memset(l_bh->b_data + sizeof(struct gfs2_dinode),
-	       0, sizeof(struct gfs2_statfs_change));
-	spin_unlock(&sdp->sd_statfs_spin);
-
-	gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
-	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+	update_statfs(sdp, m_bh, l_bh);
 
 	gfs2_trans_end(sdp);
 
@@ -680,6 +711,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	struct gfs2_holder t_gh;
 	int error;
 
+	flush_workqueue(gfs2_delete_workqueue);
 	gfs2_quota_sync(sdp);
 	gfs2_statfs_sync(sdp);
 
@@ -756,7 +788,6 @@ restart:
 	/* Release stuff */
 
 	iput(sdp->sd_jindex);
-	iput(sdp->sd_inum_inode);
 	iput(sdp->sd_statfs_inode);
 	iput(sdp->sd_rindex);
 	iput(sdp->sd_quota_inode);
@@ -767,10 +798,8 @@ restart:
 	if (!sdp->sd_args.ar_spectator) {
 		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
 		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
 		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
 		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
-		iput(sdp->sd_ir_inode);
 		iput(sdp->sd_sc_inode);
 		iput(sdp->sd_qc_inode);
 	}
@@ -1072,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	gt->gt_log_flush_secs = args.ar_commit;
 	spin_unlock(&gt->gt_spin);
 
+	gfs2_online_uevent(sdp);
 	return 0;
 }
 
@@ -1213,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	lfsecs = sdp->sd_tune.gt_log_flush_secs;
 	if (lfsecs != 60)
 		seq_printf(s, ",commit=%d", lfsecs);
+	if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
+		const char *state;
+
+		switch (args->ar_errors) {
+		case GFS2_ERRORS_WITHDRAW:
+			state = "withdraw";
+			break;
+		case GFS2_ERRORS_PANIC:
+			state = "panic";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",errors=%s", state);
+	}
 	return 0;
 }
 
@@ -1240,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode)
 		goto out;
 	}
 
+	error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+	if (error)
+		goto out_truncate;
+
 	gfs2_glock_dq_wait(&ip->i_iopen_gh);
 	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
 	error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b56413e3e40d..235db3682885 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 	return x;
 }
 
-void gfs2_jindex_free(struct gfs2_sbd *sdp);
+extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
 extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
 
@@ -36,10 +36,14 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
 				     struct gfs2_inode **ipp);
 
 extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-
+extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 			       s64 dinodes);
+extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+				  const void *buf);
+extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+			  struct buffer_head *l_bh);
 extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
 
 extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
@@ -50,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
 extern const struct dentry_operations gfs2_dops;
+extern struct xattr_handler *gfs2_xattr_handlers[];
 
 #endif /* __SUPER_DOT_H__ */
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 23419dc3027b..446329728d52 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
 #include <linux/kobject.h>
 #include <asm/uaccess.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/genhd.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	return ret;
 }
 
-static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	return sprintf(buf, "%u\n", ls->ls_id);
-}
-
 static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -386,22 +381,20 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
-GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
-GDLM_ATTR(block, 0644, block_show, block_store);
-GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
-GDLM_ATTR(id, 0444, lkid_show, NULL);
-GDLM_ATTR(jid, 0444, jid_show, NULL);
-GDLM_ATTR(first, 0444, lkfirst_show, NULL);
-GDLM_ATTR(first_done, 0444, first_done_show, NULL);
-GDLM_ATTR(recover, 0200, NULL, recover_store);
-GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+GDLM_ATTR(proto_name,		0444, proto_name_show,		NULL);
+GDLM_ATTR(block,		0644, block_show,		block_store);
+GDLM_ATTR(withdraw,		0644, withdraw_show,		withdraw_store);
+GDLM_ATTR(jid,			0444, jid_show,			NULL);
+GDLM_ATTR(first,		0444, lkfirst_show,		NULL);
+GDLM_ATTR(first_done,		0444, first_done_show,		NULL);
+GDLM_ATTR(recover,		0600, NULL,			recover_store);
+GDLM_ATTR(recover_done,		0444, recover_done_show,	NULL);
+GDLM_ATTR(recover_status,	0444, recover_status_show,	NULL);
 
 static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_proto_name.attr,
 	&gdlm_attr_block.attr,
 	&gdlm_attr_withdraw.attr,
-	&gdlm_attr_id.attr,
 	&gdlm_attr_jid.attr,
 	&gdlm_attr_first.attr,
 	&gdlm_attr_first_done.attr,
@@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = {
 
 int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 {
+	struct super_block *sb = sdp->sd_vfs;
 	int error;
+	char ro[20];
+	char spectator[20];
+	char *envp[] = { ro, spectator, NULL };
+
+	sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
 
 	sdp->sd_kobj.kset = gfs2_kset;
 	error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
@@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_tune;
 
-	kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
+	error = sysfs_create_link(&sdp->sd_kobj,
+				  &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
+				  "device");
+	if (error)
+		goto fail_lock_module;
+
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
 	return 0;
 
+fail_lock_module:
+	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 fail_tune:
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 fail_reg:
@@ -549,12 +557,12 @@ fail:
 
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
+	sysfs_remove_link(&sdp->sd_kobj, "device");
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
 }
 
-
 static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 		       struct kobj_uevent_env *env)
 {
@@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+	if (!sdp->sd_args.ar_spectator)
+		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid)) {
 		add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
 			       "%02X%02X-%02X%02X%02X%02X%02X%02X",
@@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
 	.uevent = gfs2_uevent,
 };
 
-
 int gfs2_sys_init(void)
 {
 	gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9d12b1118ba0..f6a7efa34eb9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
 	const struct lm_lockops *lm = ls->ls_ops;
 	va_list args;
 
-	if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+	    test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
 		return 0;
 
 	va_start(args, fmt);
 	vprintk(fmt, args);
 	va_end(args);
 
-	fs_err(sdp, "about to withdraw this file system\n");
-	BUG_ON(sdp->sd_args.ar_debug);
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
+		fs_err(sdp, "about to withdraw this file system\n");
+		BUG_ON(sdp->sd_args.ar_debug);
 
-	kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+		kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
 
-	if (lm->lm_unmount) {
-		fs_err(sdp, "telling LM to unmount\n");
-		lm->lm_unmount(sdp);
+		if (lm->lm_unmount) {
+			fs_err(sdp, "telling LM to unmount\n");
+			lm->lm_unmount(sdp);
+		}
+		fs_err(sdp, "withdrawn\n");
+		dump_stack();
 	}
-	fs_err(sdp, "withdrawn\n");
-	dump_stack();
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
 
 	return -1;
 }
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
 		  gfs2_tune_get(sdp, gt_complain_secs) * HZ))
 		return -2;
 
-	printk(KERN_WARNING
-	       "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
-	       "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
-	       sdp->sd_fsname, assertion,
-	       sdp->sd_fsname, function, file, line);
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
+		printk(KERN_WARNING
+		       "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+		       "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+		       sdp->sd_fsname, assertion,
+		       sdp->sd_fsname, function, file, line);
 
 	if (sdp->sd_args.ar_debug)
 		BUG();
 	else
 		dump_stack();
 
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+		      "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+		      sdp->sd_fsname, assertion,
+		      sdp->sd_fsname, function, file, line);
+
 	sdp->sd_last_warning = jiffies;
 
 	return -1;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c
index 07ea9529adda..8a0f8ef6ee27 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/xattr.c
@@ -18,8 +18,7 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
 #include "glock.h"
 #include "inode.h"
 #include "meta_io.h"
@@ -38,26 +37,32 @@
  * Returns: 1 if the EA should be stuffed
  */
 
-static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
 			unsigned int *size)
 {
-	*size = GFS2_EAREQ_SIZE_STUFFED(er);
-	if (*size <= sdp->sd_jbsize)
+	unsigned int jbsize = sdp->sd_jbsize;
+
+	/* Stuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+	if (*size <= jbsize)
 		return 1;
 
-	*size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+	/* Unstuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+		      (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
 
 	return 0;
 }
 
-static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
 {
 	unsigned int size;
 
-	if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+	if (dsize > GFS2_EA_MAX_DATA_LEN)
 		return -ERANGE;
 
-	ea_calc_size(sdp, er, &size);
+	ea_calc_size(sdp, nsize, dsize, &size);
 
 	/* This can only happen with 512 byte blocks */
 	if (size > sdp->sd_jbsize)
@@ -151,7 +156,9 @@ out:
 }
 
 struct ea_find {
-	struct gfs2_ea_request *ef_er;
+	int type;
+	const char *name;
+	size_t namel;
 	struct gfs2_ea_location *ef_el;
 };
 
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		     void *private)
 {
 	struct ea_find *ef = private;
-	struct gfs2_ea_request *er = ef->ef_er;
 
 	if (ea->ea_type == GFS2_EATYPE_UNUSED)
 		return 0;
 
-	if (ea->ea_type == er->er_type) {
-		if (ea->ea_name_len == er->er_name_len &&
-		    !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+	if (ea->ea_type == ef->type) {
+		if (ea->ea_name_len == ef->namel &&
+		    !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
 			struct gfs2_ea_location *el = ef->ef_el;
 			get_bh(bh);
 			el->el_bh = bh;
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
 	return 0;
 }
 
-int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
 		 struct gfs2_ea_location *el)
 {
 	struct ea_find ef;
 	int error;
 
-	ef.ef_er = er;
+	ef.type = type;
+	ef.name = name;
+	ef.namel = strlen(name);
 	ef.ef_el = el;
 
 	memset(el, 0, sizeof(struct gfs2_ea_location));
@@ -344,6 +352,20 @@ struct ea_list {
 	unsigned int ei_size;
 };
 
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		return 5 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SYS:
+		return 7 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SECURITY:
+		return 9 + ea->ea_name_len + 1;
+	default:
+		return 0;
+	}
+}
+
 static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
 		     void *private)
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 }
 
 /**
- * gfs2_ea_list -
- * @ip:
- * @er:
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
  *
  * Returns: actual size of data on success, -errno on error
  */
 
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_ea_request er;
 	struct gfs2_holder i_gh;
 	int error;
 
-	if (!er->er_data || !er->er_data_len) {
-		er->er_data = NULL;
-		er->er_data_len = 0;
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	if (size) {
+		er.er_data = buffer;
+		er.er_data_len = size;
 	}
 
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 		return error;
 
 	if (ip->i_eattr) {
-		struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+		struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
 
 		error = ea_foreach(ip, ea_list_i, &ei);
 		if (!error)
@@ -491,84 +517,61 @@ out:
 }
 
 int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		     char *data)
+		     char *data, size_t size)
 {
+	int ret;
+	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+	if (len > size)
+		return -ERANGE;
+
 	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
-		memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
-		return 0;
-	} else
-		return ea_get_unstuffed(ip, el->el_ea, data);
+		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+		return len;
+	}
+	ret = ea_get_unstuffed(ip, el->el_ea, data);
+	if (ret < 0)
+		return ret;
+	return len;
 }
 
 /**
- * gfs2_ea_get_i -
- * @ip: The GFS2 inode
- * @er: The request structure
+ * gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of extended attribute
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
  *
  * Returns: actual size of data on success, -errno on error
  */
 
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+		   void *buffer, size_t size)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_ea_location el;
 	int error;
 
 	if (!ip->i_eattr)
 		return -ENODATA;
+	if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
 
-	error = gfs2_ea_find(ip, er, &el);
+	error = gfs2_ea_find(ip, type, name, &el);
 	if (error)
 		return error;
 	if (!el.el_ea)
 		return -ENODATA;
-
-	if (er->er_data_len) {
-		if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
-			error = -ERANGE;
-		else
-			error = gfs2_ea_get_copy(ip, &el, er->er_data);
-	}
-	if (!error)
+	if (size)
+		error = gfs2_ea_get_copy(ip, &el, buffer, size);
+	else
 		error = GFS2_EA_DATA_LEN(el.el_ea);
-
 	brelse(el.el_bh);
 
 	return error;
 }
 
 /**
- * gfs2_ea_get -
- * @ip: The GFS2 inode
- * @er: The request structure
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	struct gfs2_holder i_gh;
-	int error;
-
-	if (!er->er_name_len ||
-	    er->er_name_len > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
-	if (!er->er_data || !er->er_data_len) {
-		er->er_data = NULL;
-		er->er_data_len = 0;
-	}
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (error)
-		return error;
-
-	error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
-}
-
-/**
  * ea_alloc_blk - allocates a new block for extended attributes.
  * @ip: A pointer to the inode that's getting extended attributes
  * @bhp: Pointer to pointer to a struct buffer_head
@@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
-		if (er->er_flags & GFS2_ERF_MODE) {
-			gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-					     (ip->i_inode.i_mode & S_IFMT) ==
-					     (er->er_mode & S_IFMT));
-			ip->i_inode.i_mode = er->er_mode;
-		}
 		ip->i_inode.i_ctime = CURRENT_TIME;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
@@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
  * Returns: errno
  */
 
-static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+		   const void *data, size_t size)
 {
+	struct gfs2_ea_request er;
 	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
 	unsigned int blks = 1;
 
-	if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
-		blks += DIV_ROUND_UP(er->er_data_len, jbsize);
+	er.er_type = type;
+	er.er_name = name;
+	er.er_name_len = strlen(name);
+	er.er_data = (void *)data;
+	er.er_data_len = size;
+
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, jbsize);
 
-	return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
+	return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
 }
 
 static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
@@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
 		goto out;
-
-	if (er->er_flags & GFS2_ERF_MODE) {
-		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-			(ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
-		ip->i_inode.i_mode = er->er_mode;
-	}
 	ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
 	int stuffed;
 	int error;
 
-	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+			       es->es_er->er_data_len, &size);
 
 	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
 		if (GFS2_EA_REC_LEN(ea) < size)
@@ -1005,15 +1005,22 @@ out:
 	return error;
 }
 
-static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
-		    struct gfs2_ea_location *el)
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+		    const void *value, size_t size, struct gfs2_ea_location *el)
 {
+	struct gfs2_ea_request er;
 	struct ea_set es;
 	unsigned int blks = 2;
 	int error;
 
+	er.er_type = type;
+	er.er_name = name;
+	er.er_data = (void *)value;
+	er.er_name_len = strlen(name);
+	er.er_data_len = size;
+
 	memset(&es, 0, sizeof(struct ea_set));
-	es.es_er = er;
+	es.es_er = &er;
 	es.es_el = el;
 
 	error = ea_foreach(ip, ea_set_simple, &es);
@@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 
 	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
 		blks++;
-	if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
-		blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
 
-	return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+	return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
 }
 
 static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
@@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
 					GFS2_EA2NEXT(el->el_prev) == el->el_ea);
 	}
 
-	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
-}
-
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	struct gfs2_ea_location el;
-	int error;
-
-	if (!ip->i_eattr) {
-		if (er->er_flags & XATTR_REPLACE)
-			return -ENODATA;
-		return ea_init(ip, er);
-	}
-
-	error = gfs2_ea_find(ip, er, &el);
-	if (error)
-		return error;
-
-	if (el.el_ea) {
-		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
-			brelse(el.el_bh);
-			return -EPERM;
-		}
-
-		error = -EEXIST;
-		if (!(er->er_flags & XATTR_CREATE)) {
-			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
-			error = ea_set_i(ip, er, &el);
-			if (!error && unstuffed)
-				ea_set_remove_unstuffed(ip, &el);
-		}
-
-		brelse(el.el_bh);
-	} else {
-		error = -ENODATA;
-		if (!(er->er_flags & XATTR_REPLACE))
-			error = ea_set_i(ip, er, NULL);
-	}
-
-	return error;
-}
-
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	struct gfs2_holder i_gh;
-	int error;
-
-	if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
-	if (!er->er_data || !er->er_data_len) {
-		er->er_data = NULL;
-		er->er_data_len = 0;
-	}
-	error = ea_check_size(GFS2_SB(&ip->i_inode), er);
-	if (error)
-		return error;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
-	if (error)
-		return error;
-
-	if (IS_IMMUTABLE(&ip->i_inode))
-		error = -EPERM;
-	else
-		error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
+	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
 }
 
 static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
@@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 
 		if (GFS2_EA_IS_LAST(ea))
 			prev->ea_flags |= GFS2_EAFLAG_LAST;
-	} else
+	} else {
 		ea->ea_type = GFS2_EATYPE_UNUSED;
+	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
@@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 	return error;
 }
 
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_ea_location el;
 	int error;
 
 	if (!ip->i_eattr)
 		return -ENODATA;
 
-	error = gfs2_ea_find(ip, er, &el);
+	error = gfs2_ea_find(ip, type, name, &el);
 	if (error)
 		return error;
 	if (!el.el_ea)
@@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1164 if (GFS2_EA_IS_STUFFED(el.el_ea)) 1118 if (GFS2_EA_IS_STUFFED(el.el_ea))
1165 error = ea_remove_stuffed(ip, &el); 1119 error = ea_remove_stuffed(ip, &el);
1166 else 1120 else
1167 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 1121 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
1168 0);
1169 1122
1170 brelse(el.el_bh); 1123 brelse(el.el_bh);
1171 1124
@@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1173} 1126}
1174 1127
1175/** 1128/**
1176 * gfs2_ea_remove - sets (or creates or replaces) an extended attribute 1129 * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
1177 * @ip: pointer to the inode of the target file 1130 * @inode: The inode
1178 * @er: request information 1131 * @type: The type of the extended attribute
1132 * @name: The name of the extended attribute
1133 * @value: The value of the extended attribute (NULL for remove)
1134 * @size: The size of the @value argument
1135 * @flags: Create or Replace
1179 * 1136 *
1180 * Returns: errno 1137 * See gfs2_xattr_remove() for details of the removal of xattrs.
1138 *
1139 * Returns: 0 or errno on failure
1181 */ 1140 */
1182 1141
1183int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) 1142int gfs2_xattr_set(struct inode *inode, int type, const char *name,
1143 const void *value, size_t size, int flags)
1184{ 1144{
1185 struct gfs2_holder i_gh; 1145 struct gfs2_sbd *sdp = GFS2_SB(inode);
1146 struct gfs2_inode *ip = GFS2_I(inode);
1147 struct gfs2_ea_location el;
1148 unsigned int namel = strlen(name);
1186 int error; 1149 int error;
1187 1150
1188 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) 1151 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1189 return -EINVAL; 1152 return -EPERM;
1153 if (namel > GFS2_EA_MAX_NAME_LEN)
1154 return -ERANGE;
1190 1155
1191 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); 1156 if (value == NULL)
1157 return gfs2_xattr_remove(inode, type, name);
1158
1159 if (ea_check_size(sdp, namel, size))
1160 return -ERANGE;
1161
1162 if (!ip->i_eattr) {
1163 if (flags & XATTR_REPLACE)
1164 return -ENODATA;
1165 return ea_init(ip, type, name, value, size);
1166 }
1167
1168 error = gfs2_ea_find(ip, type, name, &el);
1192 if (error) 1169 if (error)
1193 return error; 1170 return error;
1194 1171
1195 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) 1172 if (el.el_ea) {
1196 error = -EPERM; 1173 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1197 else 1174 brelse(el.el_bh);
1198 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); 1175 return -EPERM;
1176 }
1199 1177
1200 gfs2_glock_dq_uninit(&i_gh); 1178 error = -EEXIST;
1179 if (!(flags & XATTR_CREATE)) {
1180 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1181 error = ea_set_i(ip, type, name, value, size, &el);
1182 if (!error && unstuffed)
1183 ea_set_remove_unstuffed(ip, &el);
1184 }
1185
1186 brelse(el.el_bh);
1187 return error;
1188 }
1189
1190 error = -ENODATA;
1191 if (!(flags & XATTR_REPLACE))
1192 error = ea_set_i(ip, type, name, value, size, NULL);
1201 1193
1202 return error; 1194 return error;
1203} 1195}
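
The kernel-doc above gfs2_xattr_remove() draws a distinction that is easy to miss from user space: a zero-length set keeps the attribute, while passing no value at all removes it. A minimal user-space sketch of the difference, assuming a hypothetical mount point and attribute name:

	#include <stdio.h>
	#include <sys/xattr.h>

	int main(void)
	{
		const char *path = "/mnt/gfs2/file";	/* hypothetical path */

		/* Zero-length set: the attribute exists with empty data. */
		if (setxattr(path, "user.demo", "", 0, 0))
			perror("setxattr");

		/* Remove: internally gfs2_xattr_set() sees value == NULL
		 * and calls gfs2_xattr_remove() instead of ea_set_i(). */
		if (removexattr(path, "user.demo"))
			perror("removexattr");
		return 0;
	}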
@@ -1503,3 +1495,64 @@ out_alloc:
1503 return error; 1495 return error;
1504} 1496}
1505 1497
1498static int gfs2_xattr_user_get(struct inode *inode, const char *name,
1499 void *buffer, size_t size)
1500{
1501 return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
1502}
1503
1504static int gfs2_xattr_user_set(struct inode *inode, const char *name,
1505 const void *value, size_t size, int flags)
1506{
1507 return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
1508}
1509
1510static int gfs2_xattr_system_get(struct inode *inode, const char *name,
1511 void *buffer, size_t size)
1512{
1513 return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
1514}
1515
1516static int gfs2_xattr_system_set(struct inode *inode, const char *name,
1517 const void *value, size_t size, int flags)
1518{
1519 return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
1520}
1521
1522static int gfs2_xattr_security_get(struct inode *inode, const char *name,
1523 void *buffer, size_t size)
1524{
1525 return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
1526}
1527
1528static int gfs2_xattr_security_set(struct inode *inode, const char *name,
1529 const void *value, size_t size, int flags)
1530{
1531 return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
1532}
1533
1534static struct xattr_handler gfs2_xattr_user_handler = {
1535 .prefix = XATTR_USER_PREFIX,
1536 .get = gfs2_xattr_user_get,
1537 .set = gfs2_xattr_user_set,
1538};
1539
1540static struct xattr_handler gfs2_xattr_security_handler = {
1541 .prefix = XATTR_SECURITY_PREFIX,
1542 .get = gfs2_xattr_security_get,
1543 .set = gfs2_xattr_security_set,
1544};
1545
1546static struct xattr_handler gfs2_xattr_system_handler = {
1547 .prefix = XATTR_SYSTEM_PREFIX,
1548 .get = gfs2_xattr_system_get,
1549 .set = gfs2_xattr_system_set,
1550};
1551
1552struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler,
1556 NULL,
1557};
1558
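
The per-type get/set wrappers exist so that the NULL-terminated gfs2_xattr_handlers[] table can be dispatched by name prefix from common VFS code; presumably the table is hooked up through the superblock's s_xattr field elsewhere in this series. A self-contained sketch of that prefix dispatch, illustrative only and not the kernel's actual generic xattr loop:

	#include <stdio.h>
	#include <string.h>

	struct xattr_handler {			/* minimal stand-in */
		const char *prefix;
	};

	static const struct xattr_handler *
	find_handler(const struct xattr_handler **handlers, const char *name)
	{
		const struct xattr_handler **h;

		/* Walk the NULL-terminated table, matching on prefix. */
		for (h = handlers; *h; h++)
			if (!strncmp(name, (*h)->prefix, strlen((*h)->prefix)))
				return *h;
		return NULL;			/* -EOPNOTSUPP in the real code */
	}

	int main(void)
	{
		static const struct xattr_handler user = { "user." };
		static const struct xattr_handler sys = { "system." };
		static const struct xattr_handler *table[] = { &user, &sys, NULL };

		printf("%s\n", find_handler(table, "user.demo")->prefix);
		return 0;
	}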
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h
index c82dbe01d713..cbdfd7743733 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/xattr.h
@@ -19,7 +19,7 @@ struct iattr;
19#define GFS2_EA_SIZE(ea) \ 19#define GFS2_EA_SIZE(ea) \
20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ 20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ 21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
22 (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) 22 (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
23 23
24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) 24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) 25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
27#define GFS2_EAREQ_SIZE_STUFFED(er) \ 27#define GFS2_EAREQ_SIZE_STUFFED(er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) 28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
29 29
30#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
31ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
32 sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
33
34#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) 30#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
35#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) 31#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
36 32
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
43#define GFS2_EA_BH2FIRST(bh) \ 39#define GFS2_EA_BH2FIRST(bh) \
44((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) 40((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
45 41
46#define GFS2_ERF_MODE 0x80000000
47
48struct gfs2_ea_request { 42struct gfs2_ea_request {
49 const char *er_name; 43 const char *er_name;
50 char *er_data; 44 char *er_data;
51 unsigned int er_name_len; 45 unsigned int er_name_len;
52 unsigned int er_data_len; 46 unsigned int er_data_len;
53 unsigned int er_type; /* GFS2_EATYPE_... */ 47 unsigned int er_type; /* GFS2_EATYPE_... */
54 int er_flags;
55 mode_t er_mode;
56}; 48};
57 49
58struct gfs2_ea_location { 50struct gfs2_ea_location {
@@ -61,40 +53,20 @@ struct gfs2_ea_location {
61 struct gfs2_ea_header *el_prev; 53 struct gfs2_ea_header *el_prev;
62}; 54};
63 55
64int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 56extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
65int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 57 void *buffer, size_t size);
66int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 58extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
67 59 const void *value, size_t size, int flags);
68int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); 60extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
69int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); 61extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
70int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
71int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
72
73int gfs2_ea_dealloc(struct gfs2_inode *ip);
74 62
75/* Exported to acl.c */ 63/* Exported to acl.c */
76 64
77int gfs2_ea_find(struct gfs2_inode *ip, 65extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
78 struct gfs2_ea_request *er, 66 struct gfs2_ea_location *el);
79 struct gfs2_ea_location *el); 67extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
80int gfs2_ea_get_copy(struct gfs2_inode *ip, 68 char *data, size_t size);
81 struct gfs2_ea_location *el, 69extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
82 char *data); 70 struct iattr *attr, char *data);
83int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
84 struct iattr *attr, char *data);
85
86static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
87{
88 switch (ea->ea_type) {
89 case GFS2_EATYPE_USR:
90 return 5 + ea->ea_name_len + 1;
91 case GFS2_EATYPE_SYS:
92 return 7 + ea->ea_name_len + 1;
93 case GFS2_EATYPE_SECURITY:
94 return 9 + ea->ea_name_len + 1;
95 default:
96 return 0;
97 }
98}
99 71
100#endif /* __EATTR_DOT_H__ */ 72#endif /* __EATTR_DOT_H__ */
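
The removed gfs2_ea_strlen() helper hard-coded the constants 5, 7 and 9, which are just the lengths of the "user.", "system." and "security." prefixes (the returned size also counts the name and a trailing NUL); the common handler code can derive them from handler->prefix instead. A trivial check:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* The magic numbers in the removed helper, derived. */
		printf("%zu %zu %zu\n", strlen("user."),
		       strlen("system."), strlen("security."));	/* 5 7 9 */
		return 0;
	}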
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 941c8425c10b..a93b885311d8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
44static const struct inode_operations hugetlbfs_inode_operations; 44static const struct inode_operations hugetlbfs_inode_operations;
45 45
46static struct backing_dev_info hugetlbfs_backing_dev_info = { 46static struct backing_dev_info hugetlbfs_backing_dev_info = {
47 .name = "hugetlbfs",
47 .ra_pages = 0, /* No readahead */ 48 .ra_pages = 0, /* No readahead */
48 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 49 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
49}; 50};
@@ -935,26 +936,28 @@ static int can_do_hugetlb_shm(void)
935 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); 936 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
936} 937}
937 938
938struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) 939struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
940 struct user_struct **user)
939{ 941{
940 int error = -ENOMEM; 942 int error = -ENOMEM;
941 int unlock_shm = 0;
942 struct file *file; 943 struct file *file;
943 struct inode *inode; 944 struct inode *inode;
944 struct dentry *dentry, *root; 945 struct dentry *dentry, *root;
945 struct qstr quick_string; 946 struct qstr quick_string;
946 struct user_struct *user = current_user();
947 947
948 *user = NULL;
948 if (!hugetlbfs_vfsmount) 949 if (!hugetlbfs_vfsmount)
949 return ERR_PTR(-ENOENT); 950 return ERR_PTR(-ENOENT);
950 951
951 if (!can_do_hugetlb_shm()) { 952 if (!can_do_hugetlb_shm()) {
952 if (user_shm_lock(size, user)) { 953 *user = current_user();
953 unlock_shm = 1; 954 if (user_shm_lock(size, *user)) {
954 WARN_ONCE(1, 955 WARN_ONCE(1,
955 "Using mlock ulimits for SHM_HUGETLB deprecated\n"); 956 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
956 } else 957 } else {
958 *user = NULL;
957 return ERR_PTR(-EPERM); 959 return ERR_PTR(-EPERM);
960 }
958 } 961 }
959 962
960 root = hugetlbfs_vfsmount->mnt_root; 963 root = hugetlbfs_vfsmount->mnt_root;
@@ -996,8 +999,10 @@ out_inode:
996out_dentry: 999out_dentry:
997 dput(dentry); 1000 dput(dentry);
998out_shm_unlock: 1001out_shm_unlock:
999 if (unlock_shm) 1002 if (*user) {
1000 user_shm_unlock(size, user); 1003 user_shm_unlock(size, *user);
1004 *user = NULL;
1005 }
1001 return ERR_PTR(error); 1006 return ERR_PTR(error);
1002} 1007}
1003 1008
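
The new user_struct out-parameter hands ownership of the shared-memory accounting back to the caller: every failure path resets *user to NULL, so the caller can test it unconditionally at teardown. A hedged sketch of the calling convention; the real caller lives in ipc/shm.c and the names below are illustrative:

	/* Sketch, not literal ipc/shm.c code: */
	struct user_struct *mlock_user = NULL;
	struct file *file;

	file = hugetlb_file_setup(name, size, acctflag, &mlock_user);
	if (IS_ERR(file))
		return PTR_ERR(file);	/* mlock_user already NULLed on error */

	/* ... segment lives ... */

	if (mlock_user)			/* only set when ulimits were charged */
		user_shm_unlock(size, mlock_user);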
diff --git a/fs/inode.c b/fs/inode.c
index 901bad1e5f12..b2ba83d2c4e1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode)
120 * These are initializations that need to be done on every inode 120 * These are initializations that need to be done on every inode
121 * allocation as the fields are not initialised by slab allocation. 121 * allocation as the fields are not initialised by slab allocation.
122 */ 122 */
123struct inode *inode_init_always(struct super_block *sb, struct inode *inode) 123int inode_init_always(struct super_block *sb, struct inode *inode)
124{ 124{
125 static const struct address_space_operations empty_aops; 125 static const struct address_space_operations empty_aops;
126 static struct inode_operations empty_iops; 126 static struct inode_operations empty_iops;
127 static const struct file_operations empty_fops; 127 static const struct file_operations empty_fops;
128
129 struct address_space *const mapping = &inode->i_data; 128 struct address_space *const mapping = &inode->i_data;
130 129
131 inode->i_sb = sb; 130 inode->i_sb = sb;
@@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
152 inode->dirtied_when = 0; 151 inode->dirtied_when = 0;
153 152
154 if (security_inode_alloc(inode)) 153 if (security_inode_alloc(inode))
155 goto out_free_inode; 154 goto out;
156 155
157 /* allocate and initialize an i_integrity */ 156 /* allocate and initialize an i_integrity */
158 if (ima_inode_alloc(inode)) 157 if (ima_inode_alloc(inode))
@@ -183,9 +182,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
183 if (sb->s_bdev) { 182 if (sb->s_bdev) {
184 struct backing_dev_info *bdi; 183 struct backing_dev_info *bdi;
185 184
186 bdi = sb->s_bdev->bd_inode_backing_dev_info; 185 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
187 if (!bdi)
188 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
189 mapping->backing_dev_info = bdi; 186 mapping->backing_dev_info = bdi;
190 } 187 }
191 inode->i_private = NULL; 188 inode->i_private = NULL;
@@ -198,16 +195,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
198 inode->i_fsnotify_mask = 0; 195 inode->i_fsnotify_mask = 0;
199#endif 196#endif
200 197
201 return inode; 198 return 0;
202 199
203out_free_security: 200out_free_security:
204 security_inode_free(inode); 201 security_inode_free(inode);
205out_free_inode: 202out:
206 if (inode->i_sb->s_op->destroy_inode) 203 return -ENOMEM;
207 inode->i_sb->s_op->destroy_inode(inode);
208 else
209 kmem_cache_free(inode_cachep, (inode));
210 return NULL;
211} 204}
212EXPORT_SYMBOL(inode_init_always); 205EXPORT_SYMBOL(inode_init_always);
213 206
@@ -220,12 +213,21 @@ static struct inode *alloc_inode(struct super_block *sb)
220 else 213 else
221 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); 214 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
222 215
223 if (inode) 216 if (!inode)
224 return inode_init_always(sb, inode); 217 return NULL;
225 return NULL; 218
219 if (unlikely(inode_init_always(sb, inode))) {
220 if (inode->i_sb->s_op->destroy_inode)
221 inode->i_sb->s_op->destroy_inode(inode);
222 else
223 kmem_cache_free(inode_cachep, inode);
224 return NULL;
225 }
226
227 return inode;
226} 228}
227 229
228void destroy_inode(struct inode *inode) 230void __destroy_inode(struct inode *inode)
229{ 231{
230 BUG_ON(inode_has_buffers(inode)); 232 BUG_ON(inode_has_buffers(inode));
231 ima_inode_free(inode); 233 ima_inode_free(inode);
@@ -237,13 +239,17 @@ void destroy_inode(struct inode *inode)
237 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 239 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
238 posix_acl_release(inode->i_default_acl); 240 posix_acl_release(inode->i_default_acl);
239#endif 241#endif
242}
243EXPORT_SYMBOL(__destroy_inode);
244
245void destroy_inode(struct inode *inode)
246{
247 __destroy_inode(inode);
240 if (inode->i_sb->s_op->destroy_inode) 248 if (inode->i_sb->s_op->destroy_inode)
241 inode->i_sb->s_op->destroy_inode(inode); 249 inode->i_sb->s_op->destroy_inode(inode);
242 else 250 else
243 kmem_cache_free(inode_cachep, (inode)); 251 kmem_cache_free(inode_cachep, (inode));
244} 252}
245EXPORT_SYMBOL(destroy_inode);
246
247 253
248/* 254/*
249 * These are initializations that only need to be done 255 * These are initializations that only need to be done
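
Returning an int and leaving destruction to the caller makes inode_init_always() usable on inodes that were not allocated by alloc_inode(), e.g. a filesystem recycling its own inode objects (XFS-style inode reuse is the usual example). A hedged sketch of the new convention, with made-up helper names:

	/* Sketch of a direct caller after this change: */
	struct inode *inode = my_alloc_raw_inode(sb);	/* hypothetical */

	if (!inode)
		return NULL;
	if (inode_init_always(sb, inode)) {
		/* inode_init_always() no longer frees on failure;
		 * the caller tears down with its own destructor. */
		my_free_raw_inode(inode);		/* hypothetical */
		return NULL;
	}
	return inode;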
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
456{ 456{
457 transaction_t * transaction; 457 transaction_t * transaction;
458 tid_t first_tid; 458 tid_t first_tid;
459 unsigned long blocknr, freed; 459 unsigned int blocknr, freed;
460 460
461 if (is_journal_aborted(journal)) 461 if (is_journal_aborted(journal))
462 return 1; 462 return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
502 freed = freed + journal->j_last - journal->j_first; 502 freed = freed + journal->j_last - journal->j_first;
503 503
504 jbd_debug(1, 504 jbd_debug(1,
505 "Cleaning journal tail from %d to %d (offset %lu), " 505 "Cleaning journal tail from %d to %d (offset %u), "
506 "freeing %lu\n", 506 "freeing %u\n",
507 journal->j_tail_sequence, first_tid, blocknr, freed); 507 journal->j_tail_sequence, first_tid, blocknr, freed);
508 508
509 journal->j_free += freed; 509 journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
308 int bufs; 308 int bufs;
309 int flags; 309 int flags;
310 int err; 310 int err;
311 unsigned long blocknr; 311 unsigned int blocknr;
312 ktime_t start_time; 312 ktime_t start_time;
313 u64 commit_time; 313 u64 commit_time;
314 char *tagp = NULL; 314 char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 737f7246a4b5..bd3c073b485d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
276int journal_write_metadata_buffer(transaction_t *transaction, 276int journal_write_metadata_buffer(transaction_t *transaction,
277 struct journal_head *jh_in, 277 struct journal_head *jh_in,
278 struct journal_head **jh_out, 278 struct journal_head **jh_out,
279 unsigned long blocknr) 279 unsigned int blocknr)
280{ 280{
281 int need_copy_out = 0; 281 int need_copy_out = 0;
282 int done_copy_out = 0; 282 int done_copy_out = 0;
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
287 struct page *new_page; 287 struct page *new_page;
288 unsigned int new_offset; 288 unsigned int new_offset;
289 struct buffer_head *bh_in = jh2bh(jh_in); 289 struct buffer_head *bh_in = jh2bh(jh_in);
290 journal_t *journal = transaction->t_journal;
290 291
291 /* 292 /*
292 * The buffer really shouldn't be locked: only the current committing 293 * The buffer really shouldn't be locked: only the current committing
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction,
300 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 301 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
301 302
302 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 303 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
304 /* keep subsequent assertions sane */
305 new_bh->b_state = 0;
306 init_buffer(new_bh, NULL, NULL);
307 atomic_set(&new_bh->b_count, 1);
308 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
303 309
304 /* 310 /*
305 * If a new transaction has already done a buffer copy-out, then 311 * If a new transaction has already done a buffer copy-out, then
@@ -361,14 +367,6 @@ repeat:
361 kunmap_atomic(mapped_data, KM_USER0); 367 kunmap_atomic(mapped_data, KM_USER0);
362 } 368 }
363 369
364 /* keep subsequent assertions sane */
365 new_bh->b_state = 0;
366 init_buffer(new_bh, NULL, NULL);
367 atomic_set(&new_bh->b_count, 1);
368 jbd_unlock_bh_state(bh_in);
369
370 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
371
372 set_bh_page(new_bh, new_page, new_offset); 370 set_bh_page(new_bh, new_page, new_offset);
373 new_jh->b_transaction = NULL; 371 new_jh->b_transaction = NULL;
374 new_bh->b_size = jh2bh(jh_in)->b_size; 372 new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -385,7 +383,11 @@ repeat:
385 * copying is moved to the transaction's shadow queue. 383 * copying is moved to the transaction's shadow queue.
386 */ 384 */
387 JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); 385 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
388 journal_file_buffer(jh_in, transaction, BJ_Shadow); 386 spin_lock(&journal->j_list_lock);
387 __journal_file_buffer(jh_in, transaction, BJ_Shadow);
388 spin_unlock(&journal->j_list_lock);
389 jbd_unlock_bh_state(bh_in);
390
389 JBUFFER_TRACE(new_jh, "file as BJ_IO"); 391 JBUFFER_TRACE(new_jh, "file as BJ_IO");
390 journal_file_buffer(new_jh, transaction, BJ_IO); 392 journal_file_buffer(new_jh, transaction, BJ_IO);
391 393
@@ -565,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
565 * Log buffer allocation routines: 567 * Log buffer allocation routines:
566 */ 568 */
567 569
568int journal_next_log_block(journal_t *journal, unsigned long *retp) 570int journal_next_log_block(journal_t *journal, unsigned int *retp)
569{ 571{
570 unsigned long blocknr; 572 unsigned int blocknr;
571 573
572 spin_lock(&journal->j_state_lock); 574 spin_lock(&journal->j_state_lock);
573 J_ASSERT(journal->j_free > 1); 575 J_ASSERT(journal->j_free > 1);
@@ -588,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
588 * this is a no-op. If needed, we can use j_blk_offset - everything is 590 * this is a no-op. If needed, we can use j_blk_offset - everything is
589 * ready. 591 * ready.
590 */ 592 */
591int journal_bmap(journal_t *journal, unsigned long blocknr, 593int journal_bmap(journal_t *journal, unsigned int blocknr,
592 unsigned long *retp) 594 unsigned int *retp)
593{ 595{
594 int err = 0; 596 int err = 0;
595 unsigned long ret; 597 unsigned int ret;
596 598
597 if (journal->j_inode) { 599 if (journal->j_inode) {
598 ret = bmap(journal->j_inode, blocknr); 600 ret = bmap(journal->j_inode, blocknr);
@@ -602,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
602 char b[BDEVNAME_SIZE]; 604 char b[BDEVNAME_SIZE];
603 605
604 printk(KERN_ALERT "%s: journal block not found " 606 printk(KERN_ALERT "%s: journal block not found "
605 "at offset %lu on %s\n", 607 "at offset %u on %s\n",
606 __func__, 608 __func__,
607 blocknr, 609 blocknr,
608 bdevname(journal->j_dev, b)); 610 bdevname(journal->j_dev, b));
@@ -628,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
628struct journal_head *journal_get_descriptor_buffer(journal_t *journal) 630struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
629{ 631{
630 struct buffer_head *bh; 632 struct buffer_head *bh;
631 unsigned long blocknr; 633 unsigned int blocknr;
632 int err; 634 int err;
633 635
634 err = journal_next_log_block(journal, &blocknr); 636 err = journal_next_log_block(journal, &blocknr);
@@ -772,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
772 journal_t *journal = journal_init_common(); 774 journal_t *journal = journal_init_common();
773 int err; 775 int err;
774 int n; 776 int n;
775 unsigned long blocknr; 777 unsigned int blocknr;
776 778
777 if (!journal) 779 if (!journal)
778 return NULL; 780 return NULL;
@@ -844,10 +846,16 @@ static void journal_fail_superblock (journal_t *journal)
844static int journal_reset(journal_t *journal) 846static int journal_reset(journal_t *journal)
845{ 847{
846 journal_superblock_t *sb = journal->j_superblock; 848 journal_superblock_t *sb = journal->j_superblock;
847 unsigned long first, last; 849 unsigned int first, last;
848 850
849 first = be32_to_cpu(sb->s_first); 851 first = be32_to_cpu(sb->s_first);
850 last = be32_to_cpu(sb->s_maxlen); 852 last = be32_to_cpu(sb->s_maxlen);
853 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
854 printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
855 first, last);
856 journal_fail_superblock(journal);
857 return -EINVAL;
858 }
851 859
852 journal->j_first = first; 860 journal->j_first = first;
853 journal->j_last = last; 861 journal->j_last = last;
@@ -877,7 +885,7 @@ static int journal_reset(journal_t *journal)
877 **/ 885 **/
878int journal_create(journal_t *journal) 886int journal_create(journal_t *journal)
879{ 887{
880 unsigned long blocknr; 888 unsigned int blocknr;
881 struct buffer_head *bh; 889 struct buffer_head *bh;
882 journal_superblock_t *sb; 890 journal_superblock_t *sb;
883 int i, err; 891 int i, err;
@@ -961,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
961 if (sb->s_start == 0 && journal->j_tail_sequence == 969 if (sb->s_start == 0 && journal->j_tail_sequence ==
962 journal->j_transaction_sequence) { 970 journal->j_transaction_sequence) {
963 jbd_debug(1,"JBD: Skipping superblock update on recovered sb " 971 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
964 "(start %ld, seq %d, errno %d)\n", 972 "(start %u, seq %d, errno %d)\n",
965 journal->j_tail, journal->j_tail_sequence, 973 journal->j_tail, journal->j_tail_sequence,
966 journal->j_errno); 974 journal->j_errno);
967 goto out; 975 goto out;
968 } 976 }
969 977
970 spin_lock(&journal->j_state_lock); 978 spin_lock(&journal->j_state_lock);
971 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 979 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
972 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 980 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
973 981
974 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 982 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1363,7 +1371,7 @@ int journal_flush(journal_t *journal)
1363{ 1371{
1364 int err = 0; 1372 int err = 0;
1365 transaction_t *transaction = NULL; 1373 transaction_t *transaction = NULL;
1366 unsigned long old_tail; 1374 unsigned int old_tail;
1367 1375
1368 spin_lock(&journal->j_state_lock); 1376 spin_lock(&journal->j_state_lock);
1369 1377
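
The new guard in journal_reset() refuses journals with fewer than JFS_MIN_JOURNAL_BLOCKS usable blocks. Writing the test as first + MIN > last + 1 rather than last - first + 1 < MIN keeps the unsigned arithmetic from wrapping when last < first. A small check of the arithmetic, assuming the minimum is the 1024 defined in include/linux/jbd.h:

	#include <stdio.h>

	#define JFS_MIN_JOURNAL_BLOCKS 1024	/* assumed from include/linux/jbd.h */

	static int journal_too_short(unsigned int first, unsigned int last)
	{
		/* Usable blocks are first..last inclusive. */
		return first + JFS_MIN_JOURNAL_BLOCKS > last + 1;
	}

	int main(void)
	{
		printf("%d\n", journal_too_short(1, 512));	/* 1: refuse */
		printf("%d\n", journal_too_short(1, 2048));	/* 0: accept */
		return 0;
	}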
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
70{ 70{
71 int err; 71 int err;
72 unsigned int max, nbufs, next; 72 unsigned int max, nbufs, next;
73 unsigned long blocknr; 73 unsigned int blocknr;
74 struct buffer_head *bh; 74 struct buffer_head *bh;
75 75
76 struct buffer_head * bufs[MAXBUF]; 76 struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset) 132 unsigned int offset)
133{ 133{
134 int err; 134 int err;
135 unsigned long blocknr; 135 unsigned int blocknr;
136 struct buffer_head *bh; 136 struct buffer_head *bh;
137 137
138 *bhp = NULL; 138 *bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
314 struct recovery_info *info, enum passtype pass) 314 struct recovery_info *info, enum passtype pass)
315{ 315{
316 unsigned int first_commit_ID, next_commit_ID; 316 unsigned int first_commit_ID, next_commit_ID;
317 unsigned long next_log_block; 317 unsigned int next_log_block;
318 int err, success = 0; 318 int err, success = 0;
319 journal_superblock_t * sb; 319 journal_superblock_t * sb;
320 journal_header_t * tmp; 320 journal_header_t * tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
367 if (tid_geq(next_commit_ID, info->end_transaction)) 367 if (tid_geq(next_commit_ID, info->end_transaction))
368 break; 368 break;
369 369
370 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", 370 jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
371 next_commit_ID, next_log_block, journal->j_last); 371 next_commit_ID, next_log_block, journal->j_last);
372 372
373 /* Skip over each chunk of the transaction looking for 373 /* Skip over each chunk of the transaction looking for
374 * either the next descriptor block or the final commit 374 * either the next descriptor block or the final commit
375 * record. */ 375 * record. */
376 376
377 jbd_debug(3, "JBD: checking block %ld\n", next_log_block); 377 jbd_debug(3, "JBD: checking block %u\n", next_log_block);
378 err = jread(&bh, journal, next_log_block); 378 err = jread(&bh, journal, next_log_block);
379 if (err) 379 if (err)
380 goto failed; 380 goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
429 tagp = &bh->b_data[sizeof(journal_header_t)]; 429 tagp = &bh->b_data[sizeof(journal_header_t)];
430 while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) 430 while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
431 <= journal->j_blocksize) { 431 <= journal->j_blocksize) {
432 unsigned long io_block; 432 unsigned int io_block;
433 433
434 tag = (journal_block_tag_t *) tagp; 434 tag = (journal_block_tag_t *) tagp;
435 flags = be32_to_cpu(tag->t_flags); 435 flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
443 success = err; 443 success = err;
444 printk (KERN_ERR 444 printk (KERN_ERR
445 "JBD: IO error %d recovering " 445 "JBD: IO error %d recovering "
446 "block %ld in log\n", 446 "block %u in log\n",
447 err, io_block); 447 err, io_block);
448 } else { 448 } else {
449 unsigned long blocknr; 449 unsigned int blocknr;
450 450
451 J_ASSERT(obh != NULL); 451 J_ASSERT(obh != NULL);
452 blocknr = be32_to_cpu(tag->t_blocknr); 452 blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
581 max = be32_to_cpu(header->r_count); 581 max = be32_to_cpu(header->r_count);
582 582
583 while (offset < max) { 583 while (offset < max) {
584 unsigned long blocknr; 584 unsigned int blocknr;
585 int err; 585 int err;
586 586
587 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); 587 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
101{ 101{
102 struct list_head hash; 102 struct list_head hash;
103 tid_t sequence; /* Used for recovery only */ 103 tid_t sequence; /* Used for recovery only */
104 unsigned long blocknr; 104 unsigned int blocknr;
105}; 105};
106 106
107 107
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
126/* Utility functions to maintain the revoke table */ 126/* Utility functions to maintain the revoke table */
127 127
128/* Borrowed from buffer.c: this is a tried and tested block hash function */ 128/* Borrowed from buffer.c: this is a tried and tested block hash function */
129static inline int hash(journal_t *journal, unsigned long block) 129static inline int hash(journal_t *journal, unsigned int block)
130{ 130{
131 struct jbd_revoke_table_s *table = journal->j_revoke; 131 struct jbd_revoke_table_s *table = journal->j_revoke;
132 int hash_shift = table->hash_shift; 132 int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
136 (block << (hash_shift - 12))) & (table->hash_size - 1); 136 (block << (hash_shift - 12))) & (table->hash_size - 1);
137} 137}
138 138
139static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, 139static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
140 tid_t seq) 140 tid_t seq)
141{ 141{
142 struct list_head *hash_list; 142 struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
166/* Find a revoke record in the journal's hash table. */ 166/* Find a revoke record in the journal's hash table. */
167 167
168static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, 168static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
169 unsigned long blocknr) 169 unsigned int blocknr)
170{ 170{
171 struct list_head *hash_list; 171 struct list_head *hash_list;
172 struct jbd_revoke_record_s *record; 172 struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
332 * by one. 332 * by one.
333 */ 333 */
334 334
335int journal_revoke(handle_t *handle, unsigned long blocknr, 335int journal_revoke(handle_t *handle, unsigned int blocknr,
336 struct buffer_head *bh_in) 336 struct buffer_head *bh_in)
337{ 337{
338 struct buffer_head *bh = NULL; 338 struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
401 } 401 }
402 } 402 }
403 403
404 jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); 404 jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
405 err = insert_revoke_hash(journal, blocknr, 405 err = insert_revoke_hash(journal, blocknr,
406 handle->h_transaction->t_tid); 406 handle->h_transaction->t_tid);
407 BUFFER_TRACE(bh_in, "exit"); 407 BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
644 */ 644 */
645 645
646int journal_set_revoke(journal_t *journal, 646int journal_set_revoke(journal_t *journal,
647 unsigned long blocknr, 647 unsigned int blocknr,
648 tid_t sequence) 648 tid_t sequence)
649{ 649{
650 struct jbd_revoke_record_s *record; 650 struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
668 */ 668 */
669 669
670int journal_test_revoke(journal_t *journal, 670int journal_test_revoke(journal_t *journal,
671 unsigned long blocknr, 671 unsigned int blocknr,
672 tid_t sequence) 672 tid_t sequence)
673{ 673{
674 struct jbd_revoke_record_s *record; 674 struct jbd_revoke_record_s *record;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 73242ba7c7b1..006f9ad838a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
56 spin_lock_init(&transaction->t_handle_lock); 56 spin_lock_init(&transaction->t_handle_lock);
57 57
58 /* Set up the commit timer for the new transaction. */ 58 /* Set up the commit timer for the new transaction. */
59 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 59 journal->j_commit_timer.expires =
60 round_jiffies_up(transaction->t_expires);
60 add_timer(&journal->j_commit_timer); 61 add_timer(&journal->j_commit_timer);
61 62
62 J_ASSERT(journal->j_running_transaction == NULL); 63 J_ASSERT(journal->j_running_transaction == NULL);
@@ -228,6 +229,8 @@ repeat_locked:
228 __log_space_left(journal)); 229 __log_space_left(journal));
229 spin_unlock(&transaction->t_handle_lock); 230 spin_unlock(&transaction->t_handle_lock);
230 spin_unlock(&journal->j_state_lock); 231 spin_unlock(&journal->j_state_lock);
232
233 lock_map_acquire(&handle->h_lockdep_map);
231out: 234out:
232 if (unlikely(new_transaction)) /* It's usually NULL */ 235 if (unlikely(new_transaction)) /* It's usually NULL */
233 kfree(new_transaction); 236 kfree(new_transaction);
@@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
292 handle = ERR_PTR(err); 295 handle = ERR_PTR(err);
293 goto out; 296 goto out;
294 } 297 }
295
296 lock_map_acquire(&handle->h_lockdep_map);
297
298out: 298out:
299 return handle; 299 return handle;
300} 300}
@@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
416 __log_start_commit(journal, transaction->t_tid); 416 __log_start_commit(journal, transaction->t_tid);
417 spin_unlock(&journal->j_state_lock); 417 spin_unlock(&journal->j_state_lock);
418 418
419 lock_map_release(&handle->h_lockdep_map);
419 handle->h_buffer_credits = nblocks; 420 handle->h_buffer_credits = nblocks;
420 ret = start_this_handle(journal, handle); 421 ret = start_this_handle(journal, handle);
421 return ret; 422 return ret;
@@ -489,34 +490,15 @@ void journal_unlock_updates (journal_t *journal)
489 wake_up(&journal->j_wait_transaction_locked); 490 wake_up(&journal->j_wait_transaction_locked);
490} 491}
491 492
492/* 493static void warn_dirty_buffer(struct buffer_head *bh)
493 * Report any unexpected dirty buffers which turn up. Normally those
494 * indicate an error, but they can occur if the user is running (say)
495 * tune2fs to modify the live filesystem, so we need the option of
496 * continuing as gracefully as possible. #
497 *
498 * The caller should already hold the journal lock and
499 * j_list_lock spinlock: most callers will need those anyway
500 * in order to probe the buffer's journaling state safely.
501 */
502static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
503{ 494{
504 int jlist; 495 char b[BDEVNAME_SIZE];
505
506 /* If this buffer is one which might reasonably be dirty
507 * --- ie. data, or not part of this journal --- then
508 * we're OK to leave it alone, but otherwise we need to
509 * move the dirty bit to the journal's own internal
510 * JBDDirty bit. */
511 jlist = jh->b_jlist;
512
513 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
514 jlist == BJ_Shadow || jlist == BJ_Forget) {
515 struct buffer_head *bh = jh2bh(jh);
516 496
517 if (test_clear_buffer_dirty(bh)) 497 printk(KERN_WARNING
518 set_buffer_jbddirty(bh); 498 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
519 } 499 "There's a risk of filesystem corruption in case of system "
500 "crash.\n",
501 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
520} 502}
521 503
522/* 504/*
@@ -583,14 +565,16 @@ repeat:
583 if (jh->b_next_transaction) 565 if (jh->b_next_transaction)
584 J_ASSERT_JH(jh, jh->b_next_transaction == 566 J_ASSERT_JH(jh, jh->b_next_transaction ==
585 transaction); 567 transaction);
568 warn_dirty_buffer(bh);
586 } 569 }
587 /* 570 /*
588 * In any case we need to clean the dirty flag and we must 571 * In any case we need to clean the dirty flag and we must
589 * do it under the buffer lock to be sure we don't race 572 * do it under the buffer lock to be sure we don't race
590 * with running write-out. 573 * with running write-out.
591 */ 574 */
592 JBUFFER_TRACE(jh, "Unexpected dirty buffer"); 575 JBUFFER_TRACE(jh, "Journalling dirty buffer");
593 jbd_unexpected_dirty_buffer(jh); 576 clear_buffer_dirty(bh);
577 set_buffer_jbddirty(bh);
594 } 578 }
595 579
596 unlock_buffer(bh); 580 unlock_buffer(bh);
@@ -826,6 +810,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
826 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 810 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
827 811
828 if (jh->b_transaction == NULL) { 812 if (jh->b_transaction == NULL) {
813 /*
814 * Previous journal_forget() could have left the buffer
815 * with jbddirty bit set because it was being committed. When
816 * the commit finished, we've filed the buffer for
817 * checkpointing and marked it dirty. Now we are reallocating
818 * the buffer so the transaction freeing it must have
819 * committed and so it's safe to clear the dirty bit.
820 */
821 clear_buffer_dirty(jh2bh(jh));
829 jh->b_transaction = transaction; 822 jh->b_transaction = transaction;
830 823
831 /* first access by this transaction */ 824 /* first access by this transaction */
@@ -1782,8 +1775,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1782 1775
1783 if (jh->b_cp_transaction) { 1776 if (jh->b_cp_transaction) {
1784 JBUFFER_TRACE(jh, "on running+cp transaction"); 1777 JBUFFER_TRACE(jh, "on running+cp transaction");
1778 /*
1779 * We don't want to write the buffer anymore, clear the
1780 * bit so that we don't confuse checks in
1781 * __journal_file_buffer
1782 */
1783 clear_buffer_dirty(bh);
1785 __journal_file_buffer(jh, transaction, BJ_Forget); 1784 __journal_file_buffer(jh, transaction, BJ_Forget);
1786 clear_buffer_jbddirty(bh);
1787 may_free = 0; 1785 may_free = 0;
1788 } else { 1786 } else {
1789 JBUFFER_TRACE(jh, "on running transaction"); 1787 JBUFFER_TRACE(jh, "on running transaction");
@@ -2041,12 +2039,17 @@ void __journal_file_buffer(struct journal_head *jh,
2041 if (jh->b_transaction && jh->b_jlist == jlist) 2039 if (jh->b_transaction && jh->b_jlist == jlist)
2042 return; 2040 return;
2043 2041
2044 /* The following list of buffer states needs to be consistent
2045 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
2046 * state. */
2047
2048 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 2042 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2049 jlist == BJ_Shadow || jlist == BJ_Forget) { 2043 jlist == BJ_Shadow || jlist == BJ_Forget) {
2044 /*
2045 * For metadata buffers, we track dirty bit in buffer_jbddirty
2046 * instead of buffer_dirty. We should not see a dirty bit set
2047 * here because we clear it in do_get_write_access but e.g.
2048 * tune2fs can modify the sb and set the dirty bit at any time
2049 * so we try to gracefully handle that.
2050 */
2051 if (buffer_dirty(bh))
2052 warn_dirty_buffer(bh);
2050 if (test_clear_buffer_dirty(bh) || 2053 if (test_clear_buffer_dirty(bh) ||
2051 test_clear_buffer_jbddirty(bh)) 2054 test_clear_buffer_jbddirty(bh))
2052 was_dirty = 1; 2055 was_dirty = 1;
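
Moving lock_map_acquire() into start_this_handle() and adding the release in journal_restart() keeps the lockdep annotation balanced across a restart, which reuses the same handle for a fresh transaction. A sketch of where the annotations now sit over a handle's lifetime; this is commentary, not literal kernel code:

	handle = journal_start(journal, nblocks);
		/* start_this_handle(): lock_map_acquire(&handle->h_lockdep_map) */

	err = journal_restart(handle, nblocks);
		/* lock_map_release() first, then start_this_handle()
		 * reacquires for the new transaction */

	journal_stop(handle);
		/* lock_map_release(&handle->h_lockdep_map) */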
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..26d991ddc1e6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
27#include <linux/bio.h> 27#include <linux/bio.h>
28#include <linux/blkdev.h>
28#include <trace/events/jbd2.h> 29#include <trace/events/jbd2.h>
29 30
30/* 31/*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
133 bh->b_end_io = journal_end_buffer_io_sync; 134 bh->b_end_io = journal_end_buffer_io_sync;
134 135
135 if (journal->j_flags & JBD2_BARRIER && 136 if (journal->j_flags & JBD2_BARRIER &&
136 !JBD2_HAS_INCOMPAT_FEATURE(journal, 137 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 138 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
138 set_buffer_ordered(bh); 139 set_buffer_ordered(bh);
139 barrier_done = 1; 140 barrier_done = 1;
140 } 141 }
@@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
220 .nr_to_write = mapping->nrpages * 2, 221 .nr_to_write = mapping->nrpages * 2,
221 .range_start = 0, 222 .range_start = 0,
222 .range_end = i_size_read(mapping->host), 223 .range_end = i_size_read(mapping->host),
223 .for_writepages = 1,
224 }; 224 };
225 225
226 ret = generic_writepages(mapping, &wbc); 226 ret = generic_writepages(mapping, &wbc);
@@ -707,11 +707,13 @@ start_journal_io:
707 /* Done it all: now write the commit record asynchronously. */ 707 /* Done it all: now write the commit record asynchronously. */
708 708
709 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 709 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
710 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 710 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
711 err = journal_submit_commit_record(journal, commit_transaction, 711 err = journal_submit_commit_record(journal, commit_transaction,
712 &cbh, crc32_sum); 712 &cbh, crc32_sum);
713 if (err) 713 if (err)
714 __jbd2_journal_abort_hard(journal); 714 __jbd2_journal_abort_hard(journal);
715 if (journal->j_flags & JBD2_BARRIER)
716 blkdev_issue_flush(journal->j_dev, NULL);
715 } 717 }
716 718
717 /* 719 /*
@@ -834,7 +836,7 @@ wait_for_iobuf:
834 jbd_debug(3, "JBD: commit phase 5\n"); 836 jbd_debug(3, "JBD: commit phase 5\n");
835 837
836 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 838 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
837 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 839 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
838 err = journal_submit_commit_record(journal, commit_transaction, 840 err = journal_submit_commit_record(journal, commit_transaction,
839 &cbh, crc32_sum); 841 &cbh, crc32_sum);
840 if (err) 842 if (err)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..a8a358bc0f21 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)
1187 1187
1188 first = be32_to_cpu(sb->s_first); 1188 first = be32_to_cpu(sb->s_first);
1189 last = be32_to_cpu(sb->s_maxlen); 1189 last = be32_to_cpu(sb->s_maxlen);
1190 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
1191 printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
1192 first, last);
1193 journal_fail_superblock(journal);
1194 return -EINVAL;
1195 }
1190 1196
1191 journal->j_first = first; 1197 journal->j_first = first;
1192 journal->j_last = last; 1198 journal->j_last = last;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
57 INIT_LIST_HEAD(&transaction->t_private_list); 57 INIT_LIST_HEAD(&transaction->t_private_list);
58 58
59 /* Set up the commit timer for the new transaction. */ 59 /* Set up the commit timer for the new transaction. */
60 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 60 journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
61 add_timer(&journal->j_commit_timer); 61 add_timer(&journal->j_commit_timer);
62 62
63 J_ASSERT(journal->j_running_transaction == NULL); 63 J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
238 __jbd2_log_space_left(journal)); 238 __jbd2_log_space_left(journal));
239 spin_unlock(&transaction->t_handle_lock); 239 spin_unlock(&transaction->t_handle_lock);
240 spin_unlock(&journal->j_state_lock); 240 spin_unlock(&journal->j_state_lock);
241
242 lock_map_acquire(&handle->h_lockdep_map);
241out: 243out:
242 if (unlikely(new_transaction)) /* It's usually NULL */ 244 if (unlikely(new_transaction)) /* It's usually NULL */
243 kfree(new_transaction); 245 kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
303 handle = ERR_PTR(err); 305 handle = ERR_PTR(err);
304 goto out; 306 goto out;
305 } 307 }
306
307 lock_map_acquire(&handle->h_lockdep_map);
308out: 308out:
309 return handle; 309 return handle;
310} 310}
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
426 __jbd2_log_start_commit(journal, transaction->t_tid); 426 __jbd2_log_start_commit(journal, transaction->t_tid);
427 spin_unlock(&journal->j_state_lock); 427 spin_unlock(&journal->j_state_lock);
428 428
429 lock_map_release(&handle->h_lockdep_map);
429 handle->h_buffer_credits = nblocks; 430 handle->h_buffer_credits = nblocks;
430 ret = start_this_handle(journal, handle); 431 ret = start_this_handle(journal, handle);
431 return ret; 432 return ret;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 8fcb6239218e..7edb62e97419 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
258 return rc; 258 return rc;
259} 259}
260 260
261static int jffs2_check_acl(struct inode *inode, int mask) 261int jffs2_check_acl(struct inode *inode, int mask)
262{ 262{
263 struct posix_acl *acl; 263 struct posix_acl *acl;
264 int rc; 264 int rc;
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask)
274 return -EAGAIN; 274 return -EAGAIN;
275} 275}
276 276
277int jffs2_permission(struct inode *inode, int mask)
278{
279 return generic_permission(inode, mask, jffs2_check_acl);
280}
281
282int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) 277int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
283{ 278{
284 struct posix_acl *acl, *clone; 279 struct posix_acl *acl, *clone;
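
The wrapper deleted above was pure boilerplate: the filesystem's ->permission did nothing but forward to generic_permission() with its ACL callback. With a ->check_acl member in inode_operations the VFS makes that call itself, so the conversion is mechanical:

	/* Before (removed above): */
	int jffs2_permission(struct inode *inode, int mask)
	{
		return generic_permission(inode, mask, jffs2_check_acl);
	}

	/* After: no wrapper; the inode_operations tables simply carry
	 *	.check_acl = jffs2_check_acl,
	 * and the VFS calls generic_permission() with it. */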
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fc929f2a14f6..f0ba63e3c36b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
26 26
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29extern int jffs2_permission(struct inode *, int); 29extern int jffs2_check_acl(struct inode *, int);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
39#define jffs2_permission (NULL) 39#define jffs2_check_acl (NULL)
40#define jffs2_acl_chmod(inode) (0) 40#define jffs2_acl_chmod(inode) (0)
41#define jffs2_init_acl_pre(dir_i,inode,mode) (0) 41#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
42#define jffs2_init_acl_post(inode) (0) 42#define jffs2_init_acl_post(inode) (0)
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 6f60cc910f4c..7aa4417e085f 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations =
55 .rmdir = jffs2_rmdir, 55 .rmdir = jffs2_rmdir,
56 .mknod = jffs2_mknod, 56 .mknod = jffs2_mknod,
57 .rename = jffs2_rename, 57 .rename = jffs2_rename,
58 .permission = jffs2_permission, 58 .check_acl = jffs2_check_acl,
59 .setattr = jffs2_setattr, 59 .setattr = jffs2_setattr,
60 .setxattr = jffs2_setxattr, 60 .setxattr = jffs2_setxattr,
61 .getxattr = jffs2_getxattr, 61 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5edc2bf20581..b7b74e299142 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations =
56 56
57const struct inode_operations jffs2_file_inode_operations = 57const struct inode_operations jffs2_file_inode_operations =
58{ 58{
59 .permission = jffs2_permission, 59 .check_acl = jffs2_check_acl,
60 .setattr = jffs2_setattr, 60 .setattr = jffs2_setattr,
61 .setxattr = jffs2_setxattr, 61 .setxattr = jffs2_setxattr,
62 .getxattr = jffs2_getxattr, 62 .getxattr = jffs2_getxattr,
@@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
99 kunmap(pg); 99 kunmap(pg);
100 100
101 D2(printk(KERN_DEBUG "readpage finished\n")); 101 D2(printk(KERN_DEBUG "readpage finished\n"));
102 return 0; 102 return ret;
103} 103}
104 104
105int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) 105int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg)
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index b7339c3b6ad9..4ec11e8bda8c 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations =
21{ 21{
22 .readlink = generic_readlink, 22 .readlink = generic_readlink,
23 .follow_link = jffs2_follow_link, 23 .follow_link = jffs2_follow_link,
24 .permission = jffs2_permission, 24 .check_acl = jffs2_check_acl,
25 .setattr = jffs2_setattr, 25 .setattr = jffs2_setattr,
26 .setxattr = jffs2_setxattr, 26 .setxattr = jffs2_setxattr,
27 .getxattr = jffs2_getxattr, 27 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index d9a721e6db70..5ef7bac265e5 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
1268 if (!c->wbuf) 1268 if (!c->wbuf)
1269 return -ENOMEM; 1269 return -ENOMEM;
1270 1270
1271#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
1272 c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
1273 if (!c->wbuf_verify) {
1274 kfree(c->wbuf);
1275 return -ENOMEM;
1276 }
1277#endif
1271 return 0; 1278 return 0;
1272} 1279}
1273 1280
1274void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { 1281void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
1282#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
1283 kfree(c->wbuf_verify);
1284#endif
1275 kfree(c->wbuf); 1285 kfree(c->wbuf);
1276} 1286}
1277 1287
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 91fa3ad6e8c2..d66477c34306 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
67 acl = posix_acl_from_xattr(value, size); 67 acl = posix_acl_from_xattr(value, size);
68 } 68 }
69 kfree(value); 69 kfree(value);
70 if (!IS_ERR(acl)) { 70 if (!IS_ERR(acl))
71 set_cached_acl(inode, type, acl); 71 set_cached_acl(inode, type, acl);
72 posix_acl_release(acl);
73 }
74 return acl; 72 return acl;
75} 73}
76 74
@@ -116,7 +114,7 @@ out:
116 return rc; 114 return rc;
117} 115}
118 116
119static int jfs_check_acl(struct inode *inode, int mask) 117int jfs_check_acl(struct inode *inode, int mask)
120{ 118{
121 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); 119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
122 120
@@ -131,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask)
131 return -EAGAIN; 129 return -EAGAIN;
132} 130}
133 131
134int jfs_permission(struct inode *inode, int mask)
135{
136 return generic_permission(inode, mask, jfs_check_acl);
137}
138
139int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) 132int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
140{ 133{
141 struct posix_acl *acl = NULL; 134 struct posix_acl *acl = NULL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f6063acaa3b..2b70fa78e4a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = {
96 .removexattr = jfs_removexattr, 96 .removexattr = jfs_removexattr,
97#ifdef CONFIG_JFS_POSIX_ACL 97#ifdef CONFIG_JFS_POSIX_ACL
98 .setattr = jfs_setattr, 98 .setattr = jfs_setattr,
99 .permission = jfs_permission, 99 .check_acl = jfs_check_acl,
100#endif 100#endif
101}; 101};
102 102
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 88475f10a389..b07bd417ef85 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
20 20
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23int jfs_permission(struct inode *, int); 23int jfs_check_acl(struct inode *, int);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_setattr(struct dentry *, struct iattr *); 25int jfs_setattr(struct dentry *, struct iattr *);
26 26
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 514ee2edb92a..c79a4270f083 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = {
1543 .removexattr = jfs_removexattr, 1543 .removexattr = jfs_removexattr,
1544#ifdef CONFIG_JFS_POSIX_ACL 1544#ifdef CONFIG_JFS_POSIX_ACL
1545 .setattr = jfs_setattr, 1545 .setattr = jfs_setattr,
1546 .permission = jfs_permission, 1546 .check_acl = jfs_check_acl,
1547#endif 1547#endif
1548}; 1548};
1549 1549
diff --git a/fs/libfs.c b/fs/libfs.c
index ddfa89948c3f..dcec3d3ea64f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
217 return PTR_ERR(s); 217 return PTR_ERR(s);
218 218
219 s->s_flags = MS_NOUSER; 219 s->s_flags = MS_NOUSER;
220 s->s_maxbytes = ~0ULL; 220 s->s_maxbytes = MAX_LFS_FILESIZE;
221 s->s_blocksize = PAGE_SIZE; 221 s->s_blocksize = PAGE_SIZE;
222 s->s_blocksize_bits = PAGE_SHIFT; 222 s->s_blocksize_bits = PAGE_SHIFT;
223 s->s_magic = magic; 223 s->s_magic = magic;
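
s_maxbytes is a signed loff_t, so initializing it with ~0ULL stored -1 and any size check against the pseudo-filesystem's limit compared against a negative number; MAX_LFS_FILESIZE is the largest offset the page cache can address. A two-line demonstration of the sign problem:

	#include <stdio.h>

	int main(void)
	{
		long long maxbytes = ~0ULL;	/* what get_sb_pseudo() stored */

		/* On every mainstream ABI this converts to -1, so a test
		 * like "pos > s_maxbytes" held for any positive offset. */
		printf("%lld\n", maxbytes);
		return 0;
	}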
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 99d737bd4325..7cb076ac6b45 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
87 return hash & (NLM_HOST_NRHASH - 1); 87 return hash & (NLM_HOST_NRHASH - 1);
88} 88}
89 89
90static void nlm_clear_port(struct sockaddr *sap)
91{
92 switch (sap->sa_family) {
93 case AF_INET:
94 ((struct sockaddr_in *)sap)->sin_port = 0;
95 break;
96 case AF_INET6:
97 ((struct sockaddr_in6 *)sap)->sin6_port = 0;
98 break;
99 }
100}
101
102/* 90/*
103 * Common host lookup routine for server & client 91 * Common host lookup routine for server & client
104 */ 92 */
@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
177 host->h_addrbuf = nsm->sm_addrbuf; 165 host->h_addrbuf = nsm->sm_addrbuf;
178 memcpy(nlm_addr(host), ni->sap, ni->salen); 166 memcpy(nlm_addr(host), ni->sap, ni->salen);
179 host->h_addrlen = ni->salen; 167 host->h_addrlen = ni->salen;
180 nlm_clear_port(nlm_addr(host)); 168 rpc_set_port(nlm_addr(host), 0);
181 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); 169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
182 host->h_version = ni->version; 170 host->h_version = ni->version;
183 host->h_proto = ni->protocol; 171 host->h_proto = ni->protocol;
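rpc_set_port() is the shared sunrpc helper that replaces the per-caller nlm_clear_port(); it writes the port for either address family. A sketch, modeled on the removed code above rather than quoted from the sunrpc headers:

	static inline void rpc_set_port(struct sockaddr *sap,
					const unsigned short port)
	{
		switch (sap->sa_family) {
		case AF_INET:
			((struct sockaddr_in *)sap)->sin_port = htons(port);
			break;
		case AF_INET6:
			((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
			break;
		}
	}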
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7fce1b525849..30c933188dd7 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
61 return (struct sockaddr *)&nsm->sm_addr; 61 return (struct sockaddr *)&nsm->sm_addr;
62} 62}
63 63
64static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
65 const size_t len)
66{
67 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
68 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
69}
70
71static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
72 const size_t len)
73{
74 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
75
76 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
77 snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
78 else if (sin6->sin6_scope_id != 0)
79 snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
80 sin6->sin6_scope_id);
81 else
82 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
83}
84
85static void nsm_display_address(const struct sockaddr *sap,
86 char *buf, const size_t len)
87{
88 switch (sap->sa_family) {
89 case AF_INET:
90 nsm_display_ipv4_address(sap, buf, len);
91 break;
92 case AF_INET6:
93 nsm_display_ipv6_address(sap, buf, len);
94 break;
95 default:
96 snprintf(buf, len, "unsupported address family");
97 break;
98 }
99}
100
101static struct rpc_clnt *nsm_create(void) 64static struct rpc_clnt *nsm_create(void)
102{ 65{
103 struct sockaddr_in sin = { 66 struct sockaddr_in sin = {
@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
307 memcpy(nsm_addr(new), sap, salen); 270 memcpy(nsm_addr(new), sap, salen);
308 new->sm_addrlen = salen; 271 new->sm_addrlen = salen;
309 nsm_init_private(new); 272 nsm_init_private(new);
310 nsm_display_address((const struct sockaddr *)&new->sm_addr, 273
311 new->sm_addrbuf, sizeof(new->sm_addrbuf)); 274 if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
275 sizeof(new->sm_addrbuf)) == 0)
276 (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
277 "unsupported address family");
312 memcpy(new->sm_name, hostname, hostname_len); 278 memcpy(new->sm_name, hostname, hostname_len);
313 new->sm_name[hostname_len] = '\0'; 279 new->sm_name[hostname_len] = '\0';
314 280
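rpc_ntop() likewise centralizes the per-family presentation logic that nsm_display_address() duplicated. It returns the number of characters written, or 0 for an address family it cannot format, which is why the hunk above substitutes a fallback string. The resulting usage pattern (buffer size illustrative):

	char buf[64];	/* room for an IPv6 address plus scope id */

	if (rpc_ntop(sap, buf, sizeof(buf)) == 0)
		snprintf(buf, sizeof(buf), "unsupported address family");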
diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..19ee18a6829b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
768 * give it the opportunity to lock the file. 768 * give it the opportunity to lock the file.
769 */ 769 */
770 if (found) 770 if (found)
771 cond_resched_bkl(); 771 cond_resched();
772 772
773find_conflict: 773find_conflict:
774 for_each_lock(inode, before) { 774 for_each_lock(inode, before) {
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
1591 if (can_sleep) 1591 if (can_sleep)
1592 lock->fl_flags |= FL_SLEEP; 1592 lock->fl_flags |= FL_SLEEP;
1593 1593
1594 error = security_file_lock(filp, cmd); 1594 error = security_file_lock(filp, lock->fl_type);
1595 if (error) 1595 if (error)
1596 goto out_free; 1596 goto out_free;
1597 1597
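The second locks.c hunk fixes an argument-type confusion: flock(2) commands (LOCK_SH/LOCK_EX/LOCK_UN, possibly ORed with LOCK_NB or LOCK_MAND) are not the POSIX lock types the LSM hook expects, whereas lock->fl_type has already been translated. The translation is done earlier in fs/locks.c; a sketch of that helper:

	static int flock_translate_cmd(int cmd)
	{
		if (cmd & LOCK_MAND)
			return cmd & (LOCK_MAND | LOCK_RW);
		switch (cmd) {
		case LOCK_SH:
			return F_RDLCK;	/* shared -> read lock */
		case LOCK_EX:
			return F_WRLCK;	/* exclusive -> write lock */
		case LOCK_UN:
			return F_UNLCK;
		}
		return -EINVAL;
	}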
diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895a..d11f404667e9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,19 +169,10 @@ void putname(const char *name)
169EXPORT_SYMBOL(putname); 169EXPORT_SYMBOL(putname);
170#endif 170#endif
171 171
172 172/*
173/** 173 * This does basic POSIX ACL permission checking
174 * generic_permission - check for access rights on a Posix-like filesystem
175 * @inode: inode to check access rights for
176 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
177 * @check_acl: optional callback to check for Posix ACLs
178 *
179 * Used to check for read/write/execute permissions on a file.
180 * We use "fsuid" for this, letting us set arbitrary permissions
181 * for filesystem access without changing the "normal" uids which
182 * are used for other things..
183 */ 174 */
184int generic_permission(struct inode *inode, int mask, 175static int acl_permission_check(struct inode *inode, int mask,
185 int (*check_acl)(struct inode *inode, int mask)) 176 int (*check_acl)(struct inode *inode, int mask))
186{ 177{
187 umode_t mode = inode->i_mode; 178 umode_t mode = inode->i_mode;
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask,
193 else { 184 else {
194 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 185 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
195 int error = check_acl(inode, mask); 186 int error = check_acl(inode, mask);
196 if (error == -EACCES) 187 if (error != -EAGAIN)
197 goto check_capabilities;
198 else if (error != -EAGAIN)
199 return error; 188 return error;
200 } 189 }
201 190
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask,
208 */ 197 */
209 if ((mask & ~mode) == 0) 198 if ((mask & ~mode) == 0)
210 return 0; 199 return 0;
200 return -EACCES;
201}
202
203/**
204 * generic_permission - check for access rights on a Posix-like filesystem
205 * @inode: inode to check access rights for
206 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
207 * @check_acl: optional callback to check for Posix ACLs
208 *
209 * Used to check for read/write/execute permissions on a file.
210 * We use "fsuid" for this, letting us set arbitrary permissions
211 * for filesystem access without changing the "normal" uids which
212 * are used for other things..
213 */
214int generic_permission(struct inode *inode, int mask,
215 int (*check_acl)(struct inode *inode, int mask))
216{
217 int ret;
218
219 /*
220 * Do the basic POSIX ACL permission checks.
221 */
222 ret = acl_permission_check(inode, mask, check_acl);
223 if (ret != -EACCES)
224 return ret;
211 225
212 check_capabilities:
213 /* 226 /*
214 * Read/write DACs are always overridable. 227 * Read/write DACs are always overridable.
215 * Executable DACs are overridable if at least one exec bit is set. 228 * Executable DACs are overridable if at least one exec bit is set.
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask)
262 if (inode->i_op->permission) 275 if (inode->i_op->permission)
263 retval = inode->i_op->permission(inode, mask); 276 retval = inode->i_op->permission(inode, mask);
264 else 277 else
265 retval = generic_permission(inode, mask, NULL); 278 retval = generic_permission(inode, mask, inode->i_op->check_acl);
266 279
267 if (retval) 280 if (retval)
268 return retval; 281 return retval;
@@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
432 */ 445 */
433static int exec_permission_lite(struct inode *inode) 446static int exec_permission_lite(struct inode *inode)
434{ 447{
435 umode_t mode = inode->i_mode; 448 int ret;
436 449
437 if (inode->i_op->permission) 450 if (inode->i_op->permission) {
438 return -EAGAIN; 451 ret = inode->i_op->permission(inode, MAY_EXEC);
439 452 if (!ret)
440 if (current_fsuid() == inode->i_uid) 453 goto ok;
441 mode >>= 6; 454 return ret;
442 else if (in_group_p(inode->i_gid)) 455 }
443 mode >>= 3; 456 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
444 457 if (!ret)
445 if (mode & MAY_EXEC)
446 goto ok;
447
448 if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
449 goto ok;
450
451 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
452 goto ok; 458 goto ok;
453 459
454 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) 460 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
455 goto ok; 461 goto ok;
456 462
457 return -EACCES; 463 return ret;
458ok: 464ok:
459 return security_inode_permission(inode, MAY_EXEC); 465 return security_inode_permission(inode, MAY_EXEC);
460} 466}
@@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
853 859
854 nd->flags |= LOOKUP_CONTINUE; 860 nd->flags |= LOOKUP_CONTINUE;
855 err = exec_permission_lite(inode); 861 err = exec_permission_lite(inode);
856 if (err == -EAGAIN)
857 err = inode_permission(nd->path.dentry->d_inode,
858 MAY_EXEC);
859 if (!err)
860 err = ima_path_check(&nd->path, MAY_EXEC,
861 IMA_COUNT_UPDATE);
862 if (err) 862 if (err)
863 break; 863 break;
864 864
@@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag)
1533 if (error) 1533 if (error)
1534 return error; 1534 return error;
1535 1535
1536 error = ima_path_check(path, 1536 error = ima_path_check(path, acc_mode ?
1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), 1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1538 ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
1538 IMA_COUNT_UPDATE); 1539 IMA_COUNT_UPDATE);
1540
1539 if (error) 1541 if (error)
1540 return error; 1542 return error;
1541 /* 1543 /*
1542 * An append-only file must be opened in append mode for writing. 1544 * An append-only file must be opened in append mode for writing.
1543 */ 1545 */
1544 if (IS_APPEND(inode)) { 1546 if (IS_APPEND(inode)) {
1547 error = -EPERM;
1545 if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) 1548 if ((flag & FMODE_WRITE) && !(flag & O_APPEND))
1546 return -EPERM; 1549 goto err_out;
1547 if (flag & O_TRUNC) 1550 if (flag & O_TRUNC)
1548 return -EPERM; 1551 goto err_out;
1549 } 1552 }
1550 1553
1551 /* O_NOATIME can only be set by the owner or superuser */ 1554 /* O_NOATIME can only be set by the owner or superuser */
1552 if (flag & O_NOATIME) 1555 if (flag & O_NOATIME)
1553 if (!is_owner_or_cap(inode)) 1556 if (!is_owner_or_cap(inode)) {
1554 return -EPERM; 1557 error = -EPERM;
1558 goto err_out;
1559 }
1555 1560
1556 /* 1561 /*
1557 * Ensure there are no outstanding leases on the file. 1562 * Ensure there are no outstanding leases on the file.
1558 */ 1563 */
1559 error = break_lease(inode, flag); 1564 error = break_lease(inode, flag);
1560 if (error) 1565 if (error)
1561 return error; 1566 goto err_out;
1562 1567
1563 if (flag & O_TRUNC) { 1568 if (flag & O_TRUNC) {
1564 error = get_write_access(inode); 1569 error = get_write_access(inode);
1565 if (error) 1570 if (error)
1566 return error; 1571 goto err_out;
1567 1572
1568 /* 1573 /*
1569 * Refuse to truncate files with mandatory locks held on them. 1574 * Refuse to truncate files with mandatory locks held on them.
@@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag)
1581 } 1586 }
1582 put_write_access(inode); 1587 put_write_access(inode);
1583 if (error) 1588 if (error)
1584 return error; 1589 goto err_out;
1585 } else 1590 } else
1586 if (flag & FMODE_WRITE) 1591 if (flag & FMODE_WRITE)
1587 vfs_dq_init(inode); 1592 vfs_dq_init(inode);
1588 1593
1589 return 0; 1594 return 0;
1595err_out:
1596 ima_counts_put(path, acc_mode ?
1597 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1598 ACC_MODE(flag) & (MAY_READ | MAY_WRITE));
1599 return error;
1590} 1600}
1591 1601
1592/* 1602/*
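The may_open() rework exists because ima_path_check() bumps an IMA open counter, so every early return taken after that call must be balanced by ima_counts_put() with the same mode mask; funneling the error paths through the new err_out label guarantees that. The "acc_mode ? ... : ACC_MODE(flag)" expression covers callers that pass no access mode. ACC_MODE maps the (already transformed) open flags to MAY_* bits; a sketch of the classic macro, assuming it is in scope in fs/namei.c:

	/* 'x' here is the transformed open flag where 1 = read, 2 = write,
	 * 3 = read/write; the string indexes to MAY_READ (04),
	 * MAY_WRITE (02), or both (06). */
	#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])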
diff --git a/fs/namespace.c b/fs/namespace.c
index 277c28a63ead..7230787d18b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -316,7 +316,8 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
316 */ 316 */
317int mnt_want_write_file(struct file *file) 317int mnt_want_write_file(struct file *file)
318{ 318{
319 if (!(file->f_mode & FMODE_WRITE)) 319 struct inode *inode = file->f_dentry->d_inode;
320 if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
320 return mnt_want_write(file->f_path.mnt); 321 return mnt_want_write(file->f_path.mnt);
321 else 322 else
322 return mnt_clone_write(file->f_path.mnt); 323 return mnt_clone_write(file->f_path.mnt);
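special_file() keeps device nodes, FIFOs and sockets off the mnt_clone_write() fast path: writes to such files do not go to the filesystem backing the mount, so the caller cannot assume a writable mount reference is already held for them. The predicate is the usual mode test (sketch, per include/linux/fs.h):

	#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))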
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 845159814de2..da7fda639eac 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ 7nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
8 direct.o pagelist.o proc.o read.o symlink.o unlink.o \ 8 direct.o pagelist.o proc.o read.o symlink.o unlink.o \
9 write.o namespace.o mount_clnt.o 9 write.o namespace.o mount_clnt.o \
10 dns_resolve.o cache_lib.o
10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
11nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o 12nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
12nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o 13nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 000000000000..b4ffd0146ea6
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,140 @@
1/*
2 * linux/fs/nfs/cache_lib.c
3 *
4 * Helper routines for the NFS client caches
5 *
6 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
7 */
8#include <linux/kmod.h>
9#include <linux/module.h>
10#include <linux/moduleparam.h>
11#include <linux/mount.h>
12#include <linux/namei.h>
13#include <linux/sunrpc/cache.h>
14#include <linux/sunrpc/rpc_pipe_fs.h>
15
16#include "cache_lib.h"
17
18#define NFS_CACHE_UPCALL_PATHLEN 256
19#define NFS_CACHE_UPCALL_TIMEOUT 15
20
21static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
22 "/sbin/nfs_cache_getent";
23static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
24
25module_param_string(cache_getent, nfs_cache_getent_prog,
26 sizeof(nfs_cache_getent_prog), 0600);
27MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
28module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
29MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
30 "the cache upcall is assumed to have failed");
31
32int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
33{
34 static char *envp[] = { "HOME=/",
35 "TERM=linux",
36 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
37 NULL
38 };
39 char *argv[] = {
40 nfs_cache_getent_prog,
41 cd->name,
42 entry_name,
43 NULL
44 };
45 int ret = -EACCES;
46
47 if (nfs_cache_getent_prog[0] == '\0')
48 goto out;
49 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
50 /*
51 * Disable the upcall mechanism if we're getting an ENOENT or
52 * EACCES error. The admin can re-enable it on the fly by using
53 * sysfs to set the 'cache_getent' parameter once the problem
54 * has been fixed.
55 */
56 if (ret == -ENOENT || ret == -EACCES)
57 nfs_cache_getent_prog[0] = '\0';
58out:
59 return ret > 0 ? 0 : ret;
60}
61
62/*
63 * Deferred request handling
64 */
65void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
66{
67 if (atomic_dec_and_test(&dreq->count))
68 kfree(dreq);
69}
70
71static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
72{
73 struct nfs_cache_defer_req *dreq;
74
75 dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
76
77 complete_all(&dreq->completion);
78 nfs_cache_defer_req_put(dreq);
79}
80
81static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
82{
83 struct nfs_cache_defer_req *dreq;
84
85 dreq = container_of(req, struct nfs_cache_defer_req, req);
86 dreq->deferred_req.revisit = nfs_dns_cache_revisit;
87 atomic_inc(&dreq->count);
88
89 return &dreq->deferred_req;
90}
91
92struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
93{
94 struct nfs_cache_defer_req *dreq;
95
96 dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
97 if (dreq) {
98 init_completion(&dreq->completion);
99 atomic_set(&dreq->count, 1);
100 dreq->req.defer = nfs_dns_cache_defer;
101 }
102 return dreq;
103}
104
105int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
106{
107 if (wait_for_completion_timeout(&dreq->completion,
108 nfs_cache_getent_timeout * HZ) == 0)
109 return -ETIMEDOUT;
110 return 0;
111}
112
113int nfs_cache_register(struct cache_detail *cd)
114{
115 struct nameidata nd;
116 struct vfsmount *mnt;
117 int ret;
118
119 mnt = rpc_get_mount();
120 if (IS_ERR(mnt))
121 return PTR_ERR(mnt);
122 ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
123 if (ret)
124 goto err;
125 ret = sunrpc_cache_register_pipefs(nd.path.dentry,
126 cd->name, 0600, cd);
127 path_put(&nd.path);
128 if (!ret)
129 return ret;
130err:
131 rpc_put_mount();
132 return ret;
133}
134
135void nfs_cache_unregister(struct cache_detail *cd)
136{
137 sunrpc_cache_unregister_pipefs(cd);
138 rpc_put_mount();
139}
140
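The defer helpers in this new file implement one pattern: a cache lookup that would block defers itself through the cache's cache_req, and the caller then sleeps on the completion until the downcall revisits the request or the timeout fires. fs/nfs/dns_resolve.c below uses it exactly this way; in outline:

	struct nfs_cache_defer_req *dreq = nfs_cache_defer_req_alloc();

	ret = cache_check(cd, &item->h, &dreq->req);	/* may return -EAGAIN */
	if (ret == -EAGAIN)
		ret = nfs_cache_wait_for_upcall(dreq);	/* 0 or -ETIMEDOUT */
	nfs_cache_defer_req_put(dreq);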
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 000000000000..76f856e284e4
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,27 @@
1/*
2 * Helper routines for the NFS client caches
3 *
4 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
5 */
6
7#include <linux/completion.h>
8#include <linux/sunrpc/cache.h>
9#include <asm/atomic.h>
10
11/*
12 * Deferred request handling
13 */
14struct nfs_cache_defer_req {
15 struct cache_req req;
16 struct cache_deferred_req deferred_req;
17 struct completion completion;
18 atomic_t count;
19};
20
21extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
22extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
23extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
24extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
25
26extern int nfs_cache_register(struct cache_detail *cd);
27extern void nfs_cache_unregister(struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 7f604c7941fb..293fa0528a6e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program;
43unsigned int nfs_callback_set_tcpport; 43unsigned int nfs_callback_set_tcpport;
44unsigned short nfs_callback_tcpport; 44unsigned short nfs_callback_tcpport;
45unsigned short nfs_callback_tcpport6; 45unsigned short nfs_callback_tcpport6;
46static const int nfs_set_port_min = 0; 46#define NFS_CALLBACK_MAXPORTNR (65535U)
47static const int nfs_set_port_max = 65535;
48 47
49static int param_set_port(const char *val, struct kernel_param *kp) 48static int param_set_portnr(const char *val, struct kernel_param *kp)
50{ 49{
51 char *endp; 50 unsigned long num;
52 int num = simple_strtol(val, &endp, 0); 51 int ret;
53 if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) 52
53 if (!val)
54 return -EINVAL;
55 ret = strict_strtoul(val, 0, &num);
56 if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
54 return -EINVAL; 57 return -EINVAL;
55 *((int *)kp->arg) = num; 58 *((unsigned int *)kp->arg) = num;
56 return 0; 59 return 0;
57} 60}
58 61
59module_param_call(callback_tcpport, param_set_port, param_get_int, 62static int param_get_portnr(char *buffer, struct kernel_param *kp)
60 &nfs_callback_set_tcpport, 0644); 63{
64 return param_get_uint(buffer, kp);
65}
66#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
67
68module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
61 69
62/* 70/*
63 * This is the NFSv4 callback kernel thread. 71 * This is the NFSv4 callback kernel thread.
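With base 0, strict_strtoul() accepts decimal, octal and hex spellings, and the explicit range check rejects anything above 65535. The net behavior of param_set_portnr, assuming standard strict_strtoul semantics:

	/* "2049"  -> 2049   (decimal)                 */
	/* "0x801" -> 2049   (hex, because of base 0)  */
	/* "0"     -> 0      (keep the default port)   */
	/* "65536" -> -EINVAL (out of range)           */
	/* "20 49" -> -EINVAL (not a single number)    */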
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..e350bd6a2334 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server,
809 /* Initialise the client representation from the mount data */ 809 /* Initialise the client representation from the mount data */
810 server->flags = data->flags; 810 server->flags = data->flags;
811 server->options = data->options; 811 server->options = data->options;
812 server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
813 NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
814 NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
812 815
813 if (data->rsize) 816 if (data->rsize)
814 server->rsize = nfs_block_size(data->rsize, NULL); 817 server->rsize = nfs_block_size(data->rsize, NULL);
@@ -879,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
879 server->rsize = NFS_MAX_FILE_IO_SIZE; 882 server->rsize = NFS_MAX_FILE_IO_SIZE;
880 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 883 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
881 884
885 server->backing_dev_info.name = "nfs";
882 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; 886 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
883 887
884 if (server->wsize > max_rpc_payload) 888 if (server->wsize > max_rpc_payload)
@@ -1074,10 +1078,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1074 (unsigned long long) server->fsid.major, 1078 (unsigned long long) server->fsid.major,
1075 (unsigned long long) server->fsid.minor); 1079 (unsigned long long) server->fsid.minor);
1076 1080
1077 BUG_ON(!server->nfs_client);
1078 BUG_ON(!server->nfs_client->rpc_ops);
1079 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1080
1081 spin_lock(&nfs_client_lock); 1081 spin_lock(&nfs_client_lock);
1082 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); 1082 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1083 list_add_tail(&server->master_link, &nfs_volume_list); 1083 list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1274,7 +1274,7 @@ static int nfs4_init_server(struct nfs_server *server,
1274 1274
1275 /* Initialise the client representation from the mount data */ 1275 /* Initialise the client representation from the mount data */
1276 server->flags = data->flags; 1276 server->flags = data->flags;
1277 server->caps |= NFS_CAP_ATOMIC_OPEN; 1277 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
1278 server->options = data->options; 1278 server->options = data->options;
1279 1279
1280 /* Get a client record */ 1280 /* Get a client record */
@@ -1359,10 +1359,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1359 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) 1359 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1360 server->namelen = NFS4_MAXNAMLEN; 1360 server->namelen = NFS4_MAXNAMLEN;
1361 1361
1362 BUG_ON(!server->nfs_client);
1363 BUG_ON(!server->nfs_client->rpc_ops);
1364 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1365
1366 spin_lock(&nfs_client_lock); 1362 spin_lock(&nfs_client_lock);
1367 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); 1363 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1368 list_add_tail(&server->master_link, &nfs_volume_list); 1364 list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1400,7 +1396,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1400 1396
1401 /* Initialise the client representation from the parent server */ 1397 /* Initialise the client representation from the parent server */
1402 nfs_server_copy_userdata(server, parent_server); 1398 nfs_server_copy_userdata(server, parent_server);
1403 server->caps |= NFS_CAP_ATOMIC_OPEN; 1399 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
1404 1400
1405 /* Get a client representation. 1401 /* Get a client representation.
1406 * Note: NFSv4 always uses TCP, */ 1402 * Note: NFSv4 always uses TCP, */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 489fc01a3204..6c3210099d51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata)
255 255
256 if (put_dreq(dreq)) 256 if (put_dreq(dreq))
257 nfs_direct_complete(dreq); 257 nfs_direct_complete(dreq);
258 nfs_readdata_release(calldata); 258 nfs_readdata_free(data);
259} 259}
260 260
261static const struct rpc_call_ops nfs_read_direct_ops = { 261static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
314 data->npages, 1, 0, data->pagevec, NULL); 314 data->npages, 1, 0, data->pagevec, NULL);
315 up_read(&current->mm->mmap_sem); 315 up_read(&current->mm->mmap_sem);
316 if (result < 0) { 316 if (result < 0) {
317 nfs_readdata_release(data); 317 nfs_readdata_free(data);
318 break; 318 break;
319 } 319 }
320 if ((unsigned)result < data->npages) { 320 if ((unsigned)result < data->npages) {
321 bytes = result * PAGE_SIZE; 321 bytes = result * PAGE_SIZE;
322 if (bytes <= pgbase) { 322 if (bytes <= pgbase) {
323 nfs_direct_release_pages(data->pagevec, result); 323 nfs_direct_release_pages(data->pagevec, result);
324 nfs_readdata_release(data); 324 nfs_readdata_free(data);
325 break; 325 break;
326 } 326 }
327 bytes -= pgbase; 327 bytes -= pgbase;
@@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
334 data->inode = inode; 334 data->inode = inode;
335 data->cred = msg.rpc_cred; 335 data->cred = msg.rpc_cred;
336 data->args.fh = NFS_FH(inode); 336 data->args.fh = NFS_FH(inode);
337 data->args.context = get_nfs_open_context(ctx); 337 data->args.context = ctx;
338 data->args.offset = pos; 338 data->args.offset = pos;
339 data->args.pgbase = pgbase; 339 data->args.pgbase = pgbase;
340 data->args.pages = data->pagevec; 340 data->args.pages = data->pagevec;
@@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
441 struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); 441 struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
442 list_del(&data->pages); 442 list_del(&data->pages);
443 nfs_direct_release_pages(data->pagevec, data->npages); 443 nfs_direct_release_pages(data->pagevec, data->npages);
444 nfs_writedata_release(data); 444 nfs_writedata_free(data);
445 } 445 }
446} 446}
447 447
@@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata)
534 534
535 dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); 535 dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
536 nfs_direct_write_complete(dreq, data->inode); 536 nfs_direct_write_complete(dreq, data->inode);
537 nfs_commitdata_release(calldata); 537 nfs_commit_free(data);
538} 538}
539 539
540static const struct rpc_call_ops nfs_commit_direct_ops = { 540static const struct rpc_call_ops nfs_commit_direct_ops = {
@@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
570 data->args.fh = NFS_FH(data->inode); 570 data->args.fh = NFS_FH(data->inode);
571 data->args.offset = 0; 571 data->args.offset = 0;
572 data->args.count = 0; 572 data->args.count = 0;
573 data->args.context = get_nfs_open_context(dreq->ctx); 573 data->args.context = dreq->ctx;
574 data->res.count = 0; 574 data->res.count = 0;
575 data->res.fattr = &data->fattr; 575 data->res.fattr = &data->fattr;
576 data->res.verf = &data->verf; 576 data->res.verf = &data->verf;
@@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
734 data->npages, 0, 0, data->pagevec, NULL); 734 data->npages, 0, 0, data->pagevec, NULL);
735 up_read(&current->mm->mmap_sem); 735 up_read(&current->mm->mmap_sem);
736 if (result < 0) { 736 if (result < 0) {
737 nfs_writedata_release(data); 737 nfs_writedata_free(data);
738 break; 738 break;
739 } 739 }
740 if ((unsigned)result < data->npages) { 740 if ((unsigned)result < data->npages) {
741 bytes = result * PAGE_SIZE; 741 bytes = result * PAGE_SIZE;
742 if (bytes <= pgbase) { 742 if (bytes <= pgbase) {
743 nfs_direct_release_pages(data->pagevec, result); 743 nfs_direct_release_pages(data->pagevec, result);
744 nfs_writedata_release(data); 744 nfs_writedata_free(data);
745 break; 745 break;
746 } 746 }
747 bytes -= pgbase; 747 bytes -= pgbase;
@@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
756 data->inode = inode; 756 data->inode = inode;
757 data->cred = msg.rpc_cred; 757 data->cred = msg.rpc_cred;
758 data->args.fh = NFS_FH(inode); 758 data->args.fh = NFS_FH(inode);
759 data->args.context = get_nfs_open_context(ctx); 759 data->args.context = ctx;
760 data->args.offset = pos; 760 data->args.offset = pos;
761 data->args.pgbase = pgbase; 761 data->args.pgbase = pgbase;
762 data->args.pages = data->pagevec; 762 data->args.pages = data->pagevec;
@@ -934,9 +934,6 @@ out:
934 * back into its cache. We let the server do generic write 934 * back into its cache. We let the server do generic write
935 * parameter checking and report problems. 935 * parameter checking and report problems.
936 * 936 *
937 * We also avoid an unnecessary invocation of generic_osync_inode(),
938 * as it is fairly meaningless to sync the metadata of an NFS file.
939 *
940 * We eliminate local atime updates, see direct read above. 937 * We eliminate local atime updates, see direct read above.
941 * 938 *
942 * We avoid unnecessary page cache invalidations for normal cached 939 * We avoid unnecessary page cache invalidations for normal cached
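The direct.c changes pair up reference counting: the request no longer takes its own reference on the open context (args.context = ctx instead of get_nfs_open_context(ctx)), so completion must call the plain *_free() routines rather than *_release(), which also drop a context reference. A sketch of the distinction, assuming the read.c helper of this era:

	void nfs_readdata_release(void *data)
	{
		struct nfs_read_data *rdata = data;

		put_nfs_open_context(rdata->args.context); /* extra ref */
		nfs_readdata_free(rdata);		   /* buffer only */
	}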
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 000000000000..f4d54ba97cc6
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,335 @@
1/*
2 * linux/fs/nfs/dns_resolve.c
3 *
4 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 * Resolves DNS hostnames into valid ip addresses
7 */
8
9#include <linux/hash.h>
10#include <linux/string.h>
11#include <linux/kmod.h>
12#include <linux/module.h>
13#include <linux/socket.h>
14#include <linux/seq_file.h>
15#include <linux/inet.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/sunrpc/cache.h>
18#include <linux/sunrpc/svcauth.h>
19
20#include "dns_resolve.h"
21#include "cache_lib.h"
22
23#define NFS_DNS_HASHBITS 4
24#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
25
26static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
27
28struct nfs_dns_ent {
29 struct cache_head h;
30
31 char *hostname;
32 size_t namelen;
33
34 struct sockaddr_storage addr;
35 size_t addrlen;
36};
37
38
39static void nfs_dns_ent_init(struct cache_head *cnew,
40 struct cache_head *ckey)
41{
42 struct nfs_dns_ent *new;
43 struct nfs_dns_ent *key;
44
45 new = container_of(cnew, struct nfs_dns_ent, h);
46 key = container_of(ckey, struct nfs_dns_ent, h);
47
48 kfree(new->hostname);
49 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
50 if (new->hostname) {
51 new->namelen = key->namelen;
52 memcpy(&new->addr, &key->addr, key->addrlen);
53 new->addrlen = key->addrlen;
54 } else {
55 new->namelen = 0;
56 new->addrlen = 0;
57 }
58}
59
60static void nfs_dns_ent_put(struct kref *ref)
61{
62 struct nfs_dns_ent *item;
63
64 item = container_of(ref, struct nfs_dns_ent, h.ref);
65 kfree(item->hostname);
66 kfree(item);
67}
68
69static struct cache_head *nfs_dns_ent_alloc(void)
70{
71 struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
72
73 if (item != NULL) {
74 item->hostname = NULL;
75 item->namelen = 0;
76 item->addrlen = 0;
77 return &item->h;
78 }
79 return NULL;
80};
81
82static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
83{
84 return hash_str(key->hostname, NFS_DNS_HASHBITS);
85}
86
87static void nfs_dns_request(struct cache_detail *cd,
88 struct cache_head *ch,
89 char **bpp, int *blen)
90{
91 struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
92
93 qword_add(bpp, blen, key->hostname);
94 (*bpp)[-1] = '\n';
95}
96
97static int nfs_dns_upcall(struct cache_detail *cd,
98 struct cache_head *ch)
99{
100 struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
101 int ret;
102
103 ret = nfs_cache_upcall(cd, key->hostname);
104 if (ret)
105 ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
106 return ret;
107}
108
109static int nfs_dns_match(struct cache_head *ca,
110 struct cache_head *cb)
111{
112 struct nfs_dns_ent *a;
113 struct nfs_dns_ent *b;
114
115 a = container_of(ca, struct nfs_dns_ent, h);
116 b = container_of(cb, struct nfs_dns_ent, h);
117
118 if (a->namelen == 0 || a->namelen != b->namelen)
119 return 0;
120 return memcmp(a->hostname, b->hostname, a->namelen) == 0;
121}
122
123static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
124 struct cache_head *h)
125{
126 struct nfs_dns_ent *item;
127 long ttl;
128
129 if (h == NULL) {
130 seq_puts(m, "# ip address hostname ttl\n");
131 return 0;
132 }
133 item = container_of(h, struct nfs_dns_ent, h);
134 ttl = (long)item->h.expiry_time - (long)get_seconds();
135 if (ttl < 0)
136 ttl = 0;
137
138 if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
139 char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
140
141 rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
142 seq_printf(m, "%15s ", buf);
143 } else
144 seq_puts(m, "<none> ");
145 seq_printf(m, "%15s %ld\n", item->hostname, ttl);
146 return 0;
147}
148
149struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
150 struct nfs_dns_ent *key)
151{
152 struct cache_head *ch;
153
154 ch = sunrpc_cache_lookup(cd,
155 &key->h,
156 nfs_dns_hash(key));
157 if (!ch)
158 return NULL;
159 return container_of(ch, struct nfs_dns_ent, h);
160}
161
162struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
163 struct nfs_dns_ent *new,
164 struct nfs_dns_ent *key)
165{
166 struct cache_head *ch;
167
168 ch = sunrpc_cache_update(cd,
169 &new->h, &key->h,
170 nfs_dns_hash(key));
171 if (!ch)
172 return NULL;
173 return container_of(ch, struct nfs_dns_ent, h);
174}
175
176static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
177{
178 char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
179 struct nfs_dns_ent key, *item;
180 unsigned long ttl;
181 ssize_t len;
182 int ret = -EINVAL;
183
184 if (buf[buflen-1] != '\n')
185 goto out;
186 buf[buflen-1] = '\0';
187
188 len = qword_get(&buf, buf1, sizeof(buf1));
189 if (len <= 0)
190 goto out;
191 key.addrlen = rpc_pton(buf1, len,
192 (struct sockaddr *)&key.addr,
193 sizeof(key.addr));
194
195 len = qword_get(&buf, buf1, sizeof(buf1));
196 if (len <= 0)
197 goto out;
198
199 key.hostname = buf1;
200 key.namelen = len;
201 memset(&key.h, 0, sizeof(key.h));
202
203 ttl = get_expiry(&buf);
204 if (ttl == 0)
205 goto out;
206 key.h.expiry_time = ttl + get_seconds();
207
208 ret = -ENOMEM;
209 item = nfs_dns_lookup(cd, &key);
210 if (item == NULL)
211 goto out;
212
213 if (key.addrlen == 0)
214 set_bit(CACHE_NEGATIVE, &key.h.flags);
215
216 item = nfs_dns_update(cd, &key, item);
217 if (item == NULL)
218 goto out;
219
220 ret = 0;
221 cache_put(&item->h, cd);
222out:
223 return ret;
224}
225
226static struct cache_detail nfs_dns_resolve = {
227 .owner = THIS_MODULE,
228 .hash_size = NFS_DNS_HASHTBL_SIZE,
229 .hash_table = nfs_dns_table,
230 .name = "dns_resolve",
231 .cache_put = nfs_dns_ent_put,
232 .cache_upcall = nfs_dns_upcall,
233 .cache_parse = nfs_dns_parse,
234 .cache_show = nfs_dns_show,
235 .match = nfs_dns_match,
236 .init = nfs_dns_ent_init,
237 .update = nfs_dns_ent_init,
238 .alloc = nfs_dns_ent_alloc,
239};
240
241static int do_cache_lookup(struct cache_detail *cd,
242 struct nfs_dns_ent *key,
243 struct nfs_dns_ent **item,
244 struct nfs_cache_defer_req *dreq)
245{
246 int ret = -ENOMEM;
247
248 *item = nfs_dns_lookup(cd, key);
249 if (*item) {
250 ret = cache_check(cd, &(*item)->h, &dreq->req);
251 if (ret)
252 *item = NULL;
253 }
254 return ret;
255}
256
257static int do_cache_lookup_nowait(struct cache_detail *cd,
258 struct nfs_dns_ent *key,
259 struct nfs_dns_ent **item)
260{
261 int ret = -ENOMEM;
262
263 *item = nfs_dns_lookup(cd, key);
264 if (!*item)
265 goto out_err;
266 ret = -ETIMEDOUT;
267 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
268 || (*item)->h.expiry_time < get_seconds()
269 || cd->flush_time > (*item)->h.last_refresh)
270 goto out_put;
271 ret = -ENOENT;
272 if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
273 goto out_put;
274 return 0;
275out_put:
276 cache_put(&(*item)->h, cd);
277out_err:
278 *item = NULL;
279 return ret;
280}
281
282static int do_cache_lookup_wait(struct cache_detail *cd,
283 struct nfs_dns_ent *key,
284 struct nfs_dns_ent **item)
285{
286 struct nfs_cache_defer_req *dreq;
287 int ret = -ENOMEM;
288
289 dreq = nfs_cache_defer_req_alloc();
290 if (!dreq)
291 goto out;
292 ret = do_cache_lookup(cd, key, item, dreq);
293 if (ret == -EAGAIN) {
294 ret = nfs_cache_wait_for_upcall(dreq);
295 if (!ret)
296 ret = do_cache_lookup_nowait(cd, key, item);
297 }
298 nfs_cache_defer_req_put(dreq);
299out:
300 return ret;
301}
302
303ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
304 struct sockaddr *sa, size_t salen)
305{
306 struct nfs_dns_ent key = {
307 .hostname = name,
308 .namelen = namelen,
309 };
310 struct nfs_dns_ent *item = NULL;
311 ssize_t ret;
312
313 ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
314 if (ret == 0) {
315 if (salen >= item->addrlen) {
316 memcpy(sa, &item->addr, item->addrlen);
317 ret = item->addrlen;
318 } else
319 ret = -EOVERFLOW;
320 cache_put(&item->h, &nfs_dns_resolve);
321 } else if (ret == -ENOENT)
322 ret = -ESRCH;
323 return ret;
324}
325
326int nfs_dns_resolver_init(void)
327{
328 return nfs_cache_register(&nfs_dns_resolve);
329}
330
331void nfs_dns_resolver_destroy(void)
332{
333 nfs_cache_unregister(&nfs_dns_resolve);
334}
335
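A downcall into this cache is a single line of the form "<ip-address> <hostname> <ttl-seconds>", matching the two qword_get() steps plus get_expiry() in nfs_dns_parse(); an address field that rpc_pton() cannot parse yields addrlen 0 and hence a negative entry. A hypothetical caller of the resulting lookup API (names and error handling illustrative):

	struct sockaddr_storage addr;
	ssize_t len;

	len = nfs_dns_resolve_name("server.example.com",
				   strlen("server.example.com"),
				   (struct sockaddr *)&addr, sizeof(addr));
	if (len < 0) {
		/* -ESRCH: negative entry, -ETIMEDOUT: upcall timed out,
		 * -EOVERFLOW: caller's salen too small */
	}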
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 000000000000..a3f0938babf7
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,14 @@
1/*
2 * Resolve DNS hostnames into valid ip addresses
3 */
4#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
5#define __LINUX_FS_NFS_DNS_RESOLVE_H
6
7#define NFS_DNS_HOSTNAME_MAXLEN (128)
8
9extern int nfs_dns_resolver_init(void);
10extern void nfs_dns_resolver_destroy(void);
11extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
12 struct sockaddr *sa, size_t salen);
13
14#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05062329b678..5021b75d2d1e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
328} 328}
329 329
330/* 330/*
331 * Decide whether a read/modify/write cycle may be more efficient
332 * than a modify/write/read cycle when writing to a page in the
333 * page cache.
334 *
335 * The modify/write/read cycle may occur if a page is read before
336 * being completely filled by the writer. In this situation, the
337 * page must be completely written to stable storage on the server
338 * before it can be refilled by reading in the page from the server.
339 * This can lead to expensive, small, FILE_SYNC mode writes being
340 * done.
341 *
342 * It may be more efficient to read the page first if the file is
343 * open for reading in addition to writing, the page is not marked
344 * as Uptodate, it is not dirty or waiting to be committed,
345 * indicating that it was previously allocated and then modified,
346 * that there were valid bytes of data in that range of the file,
347 * and that the new data won't completely replace the old data in
348 * that range of the file.
349 */
350static int nfs_want_read_modify_write(struct file *file, struct page *page,
351 loff_t pos, unsigned len)
352{
353 unsigned int pglen = nfs_page_length(page);
354 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
355 unsigned int end = offset + len;
356
357 if ((file->f_mode & FMODE_READ) && /* open for read? */
358 !PageUptodate(page) && /* Uptodate? */
359 !PagePrivate(page) && /* i/o request already? */
360 pglen && /* valid bytes of file? */
361 (end < pglen || offset)) /* replace all valid bytes? */
362 return 1;
363 return 0;
364}
365
366/*
331 * This does the "real" work of the write. We must allocate and lock the 367 * This does the "real" work of the write. We must allocate and lock the
332 * page to be sent back to the generic routine, which then copies the 368 * page to be sent back to the generic routine, which then copies the
333 * data from user space. 369 * data from user space.
@@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
340 struct page **pagep, void **fsdata) 376 struct page **pagep, void **fsdata)
341{ 377{
342 int ret; 378 int ret;
343 pgoff_t index; 379 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
344 struct page *page; 380 struct page *page;
345 index = pos >> PAGE_CACHE_SHIFT; 381 int once_thru = 0;
346 382
347 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", 383 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
348 file->f_path.dentry->d_parent->d_name.name, 384 file->f_path.dentry->d_parent->d_name.name,
349 file->f_path.dentry->d_name.name, 385 file->f_path.dentry->d_name.name,
350 mapping->host->i_ino, len, (long long) pos); 386 mapping->host->i_ino, len, (long long) pos);
351 387
388start:
352 /* 389 /*
353 * Prevent starvation issues if someone is doing a consistency 390 * Prevent starvation issues if someone is doing a consistency
354 * sync-to-disk 391 * sync-to-disk
@@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
367 if (ret) { 404 if (ret) {
368 unlock_page(page); 405 unlock_page(page);
369 page_cache_release(page); 406 page_cache_release(page);
407 } else if (!once_thru &&
408 nfs_want_read_modify_write(file, page, pos, len)) {
409 once_thru = 1;
410 ret = nfs_readpage(file, page);
411 page_cache_release(page);
412 if (!ret)
413 goto start;
370 } 414 }
371 return ret; 415 return ret;
372} 416}
@@ -479,6 +523,7 @@ const struct address_space_operations nfs_file_aops = {
479 .invalidatepage = nfs_invalidate_page, 523 .invalidatepage = nfs_invalidate_page,
480 .releasepage = nfs_release_page, 524 .releasepage = nfs_release_page,
481 .direct_IO = nfs_direct_IO, 525 .direct_IO = nfs_direct_IO,
526 .migratepage = nfs_migrate_page,
482 .launder_page = nfs_launder_page, 527 .launder_page = nfs_launder_page,
483}; 528};
484 529
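A worked example of the read/modify/write heuristic above: with 4096-byte pages, writing 200 bytes at file offset 100 into a page holding 4096 valid bytes gives offset = 100 and end = 300, so end < pglen and the page is read in first; a full-page overwrite (offset = 0, len = 4096) gives end == pglen with offset == 0 and skips the read. In the terms of nfs_want_read_modify_write():

	/* pos = 100, len = 200, PAGE_CACHE_SIZE = 4096, pglen = 4096 */
	unsigned int offset = 100 & (4096 - 1);	/* 100 */
	unsigned int end    = offset + 200;	/* 300 */
	/* end < pglen -> worth reading the page before modifying it */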
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 86147b0ab2cf..21a84d45916f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
101 101
102static unsigned int fnvhash32(const void *, size_t); 102static unsigned int fnvhash32(const void *, size_t);
103 103
104static struct rpc_pipe_ops idmap_upcall_ops = { 104static const struct rpc_pipe_ops idmap_upcall_ops = {
105 .upcall = idmap_pipe_upcall, 105 .upcall = idmap_pipe_upcall,
106 .downcall = idmap_pipe_downcall, 106 .downcall = idmap_pipe_downcall,
107 .destroy_msg = idmap_pipe_destroy_msg, 107 .destroy_msg = idmap_pipe_destroy_msg,
@@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp)
119 if (idmap == NULL) 119 if (idmap == NULL)
120 return -ENOMEM; 120 return -ENOMEM;
121 121
122 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", 122 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
123 idmap, &idmap_upcall_ops, 0); 123 "idmap", idmap, &idmap_upcall_ops, 0);
124 if (IS_ERR(idmap->idmap_dentry)) { 124 if (IS_ERR(idmap->idmap_dentry)) {
125 error = PTR_ERR(idmap->idmap_dentry); 125 error = PTR_ERR(idmap->idmap_dentry);
126 kfree(idmap); 126 kfree(idmap);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd7938eda6a8..060022b4651c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
46#include "iostat.h" 46#include "iostat.h"
47#include "internal.h" 47#include "internal.h"
48#include "fscache.h" 48#include "fscache.h"
49#include "dns_resolve.h"
49 50
50#define NFSDBG_FACILITY NFSDBG_VFS 51#define NFSDBG_FACILITY NFSDBG_VFS
51 52
@@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
286 /* We can't support update_atime(), since the server will reset it */ 287 /* We can't support update_atime(), since the server will reset it */
287 inode->i_flags |= S_NOATIME|S_NOCMTIME; 288 inode->i_flags |= S_NOATIME|S_NOCMTIME;
288 inode->i_mode = fattr->mode; 289 inode->i_mode = fattr->mode;
290 if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
291 && nfs_server_capable(inode, NFS_CAP_MODE))
292 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
293 | NFS_INO_INVALID_ACCESS
294 | NFS_INO_INVALID_ACL;
289 /* Why so? Because we want revalidate for devices/FIFOs, and 295 /* Why so? Because we want revalidate for devices/FIFOs, and
290 * that's precisely what we have in nfs_file_inode_operations. 296 * that's precisely what we have in nfs_file_inode_operations.
291 */ 297 */
@@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
330 nfsi->attr_gencount = fattr->gencount; 336 nfsi->attr_gencount = fattr->gencount;
331 if (fattr->valid & NFS_ATTR_FATTR_ATIME) 337 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
332 inode->i_atime = fattr->atime; 338 inode->i_atime = fattr->atime;
339 else if (nfs_server_capable(inode, NFS_CAP_ATIME))
340 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
333 if (fattr->valid & NFS_ATTR_FATTR_MTIME) 341 if (fattr->valid & NFS_ATTR_FATTR_MTIME)
334 inode->i_mtime = fattr->mtime; 342 inode->i_mtime = fattr->mtime;
343 else if (nfs_server_capable(inode, NFS_CAP_MTIME))
344 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
345 | NFS_INO_INVALID_DATA;
335 if (fattr->valid & NFS_ATTR_FATTR_CTIME) 346 if (fattr->valid & NFS_ATTR_FATTR_CTIME)
336 inode->i_ctime = fattr->ctime; 347 inode->i_ctime = fattr->ctime;
348 else if (nfs_server_capable(inode, NFS_CAP_CTIME))
349 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
350 | NFS_INO_INVALID_ACCESS
351 | NFS_INO_INVALID_ACL;
337 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) 352 if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
338 nfsi->change_attr = fattr->change_attr; 353 nfsi->change_attr = fattr->change_attr;
354 else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
355 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
356 | NFS_INO_INVALID_DATA;
339 if (fattr->valid & NFS_ATTR_FATTR_SIZE) 357 if (fattr->valid & NFS_ATTR_FATTR_SIZE)
340 inode->i_size = nfs_size_to_loff_t(fattr->size); 358 inode->i_size = nfs_size_to_loff_t(fattr->size);
359 else
360 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
361 | NFS_INO_INVALID_DATA
362 | NFS_INO_REVAL_PAGECACHE;
341 if (fattr->valid & NFS_ATTR_FATTR_NLINK) 363 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
342 inode->i_nlink = fattr->nlink; 364 inode->i_nlink = fattr->nlink;
365 else if (nfs_server_capable(inode, NFS_CAP_NLINK))
366 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
343 if (fattr->valid & NFS_ATTR_FATTR_OWNER) 367 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
344 inode->i_uid = fattr->uid; 368 inode->i_uid = fattr->uid;
369 else if (nfs_server_capable(inode, NFS_CAP_OWNER))
370 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
371 | NFS_INO_INVALID_ACCESS
372 | NFS_INO_INVALID_ACL;
345 if (fattr->valid & NFS_ATTR_FATTR_GROUP) 373 if (fattr->valid & NFS_ATTR_FATTR_GROUP)
346 inode->i_gid = fattr->gid; 374 inode->i_gid = fattr->gid;
375 else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
376 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
377 | NFS_INO_INVALID_ACCESS
378 | NFS_INO_INVALID_ACL;
347 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) 379 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
348 inode->i_blocks = fattr->du.nfs2.blocks; 380 inode->i_blocks = fattr->du.nfs2.blocks;
349 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { 381 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -1145,6 +1177,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1145 loff_t cur_isize, new_isize; 1177 loff_t cur_isize, new_isize;
1146 unsigned long invalid = 0; 1178 unsigned long invalid = 0;
1147 unsigned long now = jiffies; 1179 unsigned long now = jiffies;
1180 unsigned long save_cache_validity;
1148 1181
1149 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", 1182 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
1150 __func__, inode->i_sb->s_id, inode->i_ino, 1183 __func__, inode->i_sb->s_id, inode->i_ino,
@@ -1171,10 +1204,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1171 */ 1204 */
1172 nfsi->read_cache_jiffies = fattr->time_start; 1205 nfsi->read_cache_jiffies = fattr->time_start;
1173 1206
1174 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) 1207 save_cache_validity = nfsi->cache_validity;
1175 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR 1208 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
1176 | NFS_INO_INVALID_ATIME 1209 | NFS_INO_INVALID_ATIME
1177 | NFS_INO_REVAL_PAGECACHE); 1210 | NFS_INO_REVAL_FORCED
1211 | NFS_INO_REVAL_PAGECACHE);
1178 1212
1179 /* Do atomic weak cache consistency updates */ 1213 /* Do atomic weak cache consistency updates */
1180 nfs_wcc_update_inode(inode, fattr); 1214 nfs_wcc_update_inode(inode, fattr);
@@ -1189,7 +1223,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1189 nfs_force_lookup_revalidate(inode); 1223 nfs_force_lookup_revalidate(inode);
1190 nfsi->change_attr = fattr->change_attr; 1224 nfsi->change_attr = fattr->change_attr;
1191 } 1225 }
1192 } 1226 } else if (server->caps & NFS_CAP_CHANGE_ATTR)
1227 invalid |= save_cache_validity;
1193 1228
1194 if (fattr->valid & NFS_ATTR_FATTR_MTIME) { 1229 if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
1195 /* NFSv2/v3: Check if the mtime agrees */ 1230 /* NFSv2/v3: Check if the mtime agrees */
@@ -1201,7 +1236,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1201 nfs_force_lookup_revalidate(inode); 1236 nfs_force_lookup_revalidate(inode);
1202 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1237 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1203 } 1238 }
1204 } 1239 } else if (server->caps & NFS_CAP_MTIME)
1240 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1241 | NFS_INO_INVALID_DATA
1242 | NFS_INO_REVAL_PAGECACHE
1243 | NFS_INO_REVAL_FORCED);
1244
1205 if (fattr->valid & NFS_ATTR_FATTR_CTIME) { 1245 if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
1206 /* If ctime has changed we should definitely clear access+acl caches */ 1246 /* If ctime has changed we should definitely clear access+acl caches */
1207 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { 1247 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
@@ -1215,7 +1255,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1215 } 1255 }
1216 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 1256 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
1217 } 1257 }
1218 } 1258 } else if (server->caps & NFS_CAP_CTIME)
1259 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1260 | NFS_INO_INVALID_ACCESS
1261 | NFS_INO_INVALID_ACL
1262 | NFS_INO_REVAL_FORCED);
1219 1263
1220 /* Check if our cached file size is stale */ 1264 /* Check if our cached file size is stale */
1221 if (fattr->valid & NFS_ATTR_FATTR_SIZE) { 1265 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1231,30 +1275,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1231 dprintk("NFS: isize change on server for file %s/%ld\n", 1275 dprintk("NFS: isize change on server for file %s/%ld\n",
1232 inode->i_sb->s_id, inode->i_ino); 1276 inode->i_sb->s_id, inode->i_ino);
1233 } 1277 }
1234 } 1278 } else
1279 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1280 | NFS_INO_REVAL_PAGECACHE
1281 | NFS_INO_REVAL_FORCED);
1235 1282
1236 1283
1237 if (fattr->valid & NFS_ATTR_FATTR_ATIME) 1284 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
1238 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); 1285 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1286 else if (server->caps & NFS_CAP_ATIME)
1287 invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
1288 | NFS_INO_REVAL_FORCED);
1239 1289
1240 if (fattr->valid & NFS_ATTR_FATTR_MODE) { 1290 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1241 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { 1291 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1242 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1292 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1243 inode->i_mode = fattr->mode; 1293 inode->i_mode = fattr->mode;
1244 } 1294 }
1245 } 1295 } else if (server->caps & NFS_CAP_MODE)
1296 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1297 | NFS_INO_INVALID_ACCESS
1298 | NFS_INO_INVALID_ACL
1299 | NFS_INO_REVAL_FORCED);
1300
1246 if (fattr->valid & NFS_ATTR_FATTR_OWNER) { 1301 if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
1247 if (inode->i_uid != fattr->uid) { 1302 if (inode->i_uid != fattr->uid) {
1248 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1303 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1249 inode->i_uid = fattr->uid; 1304 inode->i_uid = fattr->uid;
1250 } 1305 }
1251 } 1306 } else if (server->caps & NFS_CAP_OWNER)
1307 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1308 | NFS_INO_INVALID_ACCESS
1309 | NFS_INO_INVALID_ACL
1310 | NFS_INO_REVAL_FORCED);
1311
1252 if (fattr->valid & NFS_ATTR_FATTR_GROUP) { 1312 if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
1253 if (inode->i_gid != fattr->gid) { 1313 if (inode->i_gid != fattr->gid) {
1254 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1314 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1255 inode->i_gid = fattr->gid; 1315 inode->i_gid = fattr->gid;
1256 } 1316 }
1257 } 1317 } else if (server->caps & NFS_CAP_OWNER_GROUP)
1318 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1319 | NFS_INO_INVALID_ACCESS
1320 | NFS_INO_INVALID_ACL
1321 | NFS_INO_REVAL_FORCED);
1258 1322
1259 if (fattr->valid & NFS_ATTR_FATTR_NLINK) { 1323 if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
1260 if (inode->i_nlink != fattr->nlink) { 1324 if (inode->i_nlink != fattr->nlink) {
@@ -1263,7 +1327,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1263 invalid |= NFS_INO_INVALID_DATA; 1327 invalid |= NFS_INO_INVALID_DATA;
1264 inode->i_nlink = fattr->nlink; 1328 inode->i_nlink = fattr->nlink;
1265 } 1329 }
1266 } 1330 } else if (server->caps & NFS_CAP_NLINK)
1331 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1332 | NFS_INO_REVAL_FORCED);
1267 1333
1268 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { 1334 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
1269 /* 1335 /*
@@ -1293,9 +1359,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1293 || S_ISLNK(inode->i_mode))) 1359 || S_ISLNK(inode->i_mode)))
1294 invalid &= ~NFS_INO_INVALID_DATA; 1360 invalid &= ~NFS_INO_INVALID_DATA;
1295 if (!nfs_have_delegation(inode, FMODE_READ) || 1361 if (!nfs_have_delegation(inode, FMODE_READ) ||
1296 (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) 1362 (save_cache_validity & NFS_INO_REVAL_FORCED))
1297 nfsi->cache_validity |= invalid; 1363 nfsi->cache_validity |= invalid;
1298 nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
1299 1364
1300 return 0; 1365 return 0;
1301 out_changed: 1366 out_changed:
@@ -1442,6 +1507,10 @@ static int __init init_nfs_fs(void)
1442{ 1507{
1443 int err; 1508 int err;
1444 1509
1510 err = nfs_dns_resolver_init();
1511 if (err < 0)
1512 goto out8;
1513
1445 err = nfs_fscache_register(); 1514 err = nfs_fscache_register();
1446 if (err < 0) 1515 if (err < 0)
1447 goto out7; 1516 goto out7;
@@ -1500,6 +1569,8 @@ out5:
1500out6: 1569out6:
1501 nfs_fscache_unregister(); 1570 nfs_fscache_unregister();
1502out7: 1571out7:
1572 nfs_dns_resolver_destroy();
1573out8:
1503 return err; 1574 return err;
1504} 1575}
1505 1576
@@ -1511,6 +1582,7 @@ static void __exit exit_nfs_fs(void)
1511 nfs_destroy_inodecache(); 1582 nfs_destroy_inodecache();
1512 nfs_destroy_nfspagecache(); 1583 nfs_destroy_nfspagecache();
1513 nfs_fscache_unregister(); 1584 nfs_fscache_unregister();
1585 nfs_dns_resolver_destroy();
1514#ifdef CONFIG_PROC_FS 1586#ifdef CONFIG_PROC_FS
1515 rpc_proc_unregister("nfs"); 1587 rpc_proc_unregister("nfs");
1516#endif 1588#endif
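nfs_server_capable() is the per-attribute gate used throughout these inode.c hunks: an attribute missing from a server reply only marks the inode for revalidation when the server is known to support that attribute at all (the caps masks set up in fs/nfs/client.c above). Roughly, per the include/linux/nfs_fs.h helper:

	static inline int nfs_server_capable(struct inode *inode, int cap)
	{
		return NFS_SERVER(inode)->caps & cap;
	}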
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d0..e21b1bb9972f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -49,6 +49,11 @@ struct nfs_clone_mount {
49#define NFS_MAX_SECFLAVORS (12) 49#define NFS_MAX_SECFLAVORS (12)
50 50
51/* 51/*
52 * Value used if the user did not specify a port value.
53 */
54#define NFS_UNSPEC_PORT (-1)
55
56/*
52 * In-kernel mount arguments 57 * In-kernel mount arguments
53 */ 58 */
54struct nfs_parsed_mount_data { 59struct nfs_parsed_mount_data {
@@ -63,6 +68,7 @@ struct nfs_parsed_mount_data {
63 unsigned int auth_flavor_len; 68 unsigned int auth_flavor_len;
64 rpc_authflavor_t auth_flavors[1]; 69 rpc_authflavor_t auth_flavors[1];
65 char *client_address; 70 char *client_address;
71 unsigned int version;
66 unsigned int minorversion; 72 unsigned int minorversion;
67 char *fscache_uniq; 73 char *fscache_uniq;
68 74
@@ -71,7 +77,7 @@ struct nfs_parsed_mount_data {
71 size_t addrlen; 77 size_t addrlen;
72 char *hostname; 78 char *hostname;
73 u32 version; 79 u32 version;
74 unsigned short port; 80 int port;
75 unsigned short protocol; 81 unsigned short protocol;
76 } mount_server; 82 } mount_server;
77 83
@@ -80,7 +86,7 @@ struct nfs_parsed_mount_data {
80 size_t addrlen; 86 size_t addrlen;
81 char *hostname; 87 char *hostname;
82 char *export_path; 88 char *export_path;
83 unsigned short port; 89 int port;
84 unsigned short protocol; 90 unsigned short protocol;
85 } nfs_server; 91 } nfs_server;
86 92
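Both port fields widen from unsigned short to int so that NFS_UNSPEC_PORT (-1) can mean "no port given" without colliding with the legal value 0. A sketch of resolving the sentinel to a protocol default; the default used here is just the well-known NFS port for illustration:

	#include <stdio.h>

	#define NFS_UNSPEC_PORT (-1)
	#define NFS_PORT        2049	/* standard NFS port */

	/* Pick the wire port: an explicit option wins, else the default. */
	static unsigned short resolve_port(int option, unsigned short def)
	{
		if (option == NFS_UNSPEC_PORT)
			return def;
		return (unsigned short)option;
	}

	int main(void)
	{
		printf("%u\n", resolve_port(NFS_UNSPEC_PORT, NFS_PORT)); /* 2049 */
		printf("%u\n", resolve_port(0, NFS_PORT));	/* 0 stays 0 */
		return 0;
	}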
@@ -102,6 +108,7 @@ struct nfs_mount_request {
102}; 108};
103 109
104extern int nfs_mount(struct nfs_mount_request *info); 110extern int nfs_mount(struct nfs_mount_request *info);
111extern void nfs_umount(const struct nfs_mount_request *info);
105 112
106/* client.c */ 113/* client.c */
107extern struct rpc_program nfs_program; 114extern struct rpc_program nfs_program;
@@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode);
213extern int nfs_wait_bit_killable(void *word); 220extern int nfs_wait_bit_killable(void *word);
214 221
215/* super.c */ 222/* super.c */
216void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
217extern struct file_system_type nfs_xdev_fs_type; 223extern struct file_system_type nfs_xdev_fs_type;
218#ifdef CONFIG_NFS_V4 224#ifdef CONFIG_NFS_V4
219extern struct file_system_type nfs4_xdev_fs_type; 225extern struct file_system_type nfs4_xdev_fs_type;
@@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
248 254
249/* write.c */ 255/* write.c */
250extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 256extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
257#ifdef CONFIG_MIGRATION
258extern int nfs_migrate_page(struct address_space *,
259 struct page *, struct page *);
260#else
261#define nfs_migrate_page NULL
262#endif
251 263
252/* nfs4proc.c */ 264/* nfs4proc.c */
253extern int _nfs4_call_sync(struct nfs_server *server, 265extern int _nfs4_call_sync(struct nfs_server *server,
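nfs_migrate_page is declared only under CONFIG_MIGRATION and defined to NULL otherwise, so any operations table that references it needs no #ifdef of its own. The same trick in miniature, with invented names:

	#include <stdio.h>

	/* #define HAVE_MIGRATION 1 */	/* flip on to build the real hook */

	#ifdef HAVE_MIGRATION
	static int my_migrate(void *from, void *to)
	{
		(void)from; (void)to;
		return 0;
	}
	#else
	#define my_migrate NULL		/* feature out: hook slot stays NULL */
	#endif

	struct ops {
		int (*migrate)(void *from, void *to);
	};

	/* Initializer is identical whether or not the feature is built in. */
	static const struct ops file_ops = {
		.migrate = my_migrate,
	};

	int main(void)
	{
		printf("migrate hook %s\n",
		       file_ops.migrate ? "present" : "absent");
		return 0;
	}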
@@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
368 return ((unsigned long)len + (unsigned long)base + 380 return ((unsigned long)len + (unsigned long)base +
369 PAGE_SIZE - 1) >> PAGE_SHIFT; 381 PAGE_SIZE - 1) >> PAGE_SHIFT;
370} 382}
371
372#define IPV6_SCOPE_DELIMITER '%'
373
374/*
375 * Set the port number in an address. Be agnostic about the address
376 * family.
377 */
378static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
379{
380 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
381 struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
382
383 switch (sap->sa_family) {
384 case AF_INET:
385 ap->sin_port = htons(port);
386 break;
387 case AF_INET6:
388 ap6->sin6_port = htons(port);
389 break;
390 }
391}
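The inline nfs_set_port removed above is superseded by the sunrpc-level rpc_set_port (see its call site in nfs4namespace.c further down). Its job is easy to reproduce standalone; this userspace version keeps the same switch-on-family shape:

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>

	/* Set the port in a sockaddr, whatever the address family. */
	static void set_port(struct sockaddr *sap, unsigned short port)
	{
		switch (sap->sa_family) {
		case AF_INET:
			((struct sockaddr_in *)sap)->sin_port = htons(port);
			break;
		case AF_INET6:
			((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
			break;
		}
	}

	int main(void)
	{
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		set_port((struct sockaddr *)&sin, 2049);
		printf("port = %u\n", ntohs(sin.sin_port));
		return 0;
	}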
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 38ef9eaec407..0adefc40cc89 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -209,6 +209,71 @@ out_mnt_err:
209 goto out; 209 goto out;
210} 210}
211 211
212/**
213 * nfs_umount - Notify a server that we have unmounted this export
214 * @info: pointer to umount request arguments
215 *
216 * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
217 * use UDP.
218 */
219void nfs_umount(const struct nfs_mount_request *info)
220{
221 static const struct rpc_timeout nfs_umnt_timeout = {
222 .to_initval = 1 * HZ,
223 .to_maxval = 3 * HZ,
224 .to_retries = 2,
225 };
226 struct rpc_create_args args = {
227 .protocol = IPPROTO_UDP,
228 .address = info->sap,
229 .addrsize = info->salen,
230 .timeout = &nfs_umnt_timeout,
231 .servername = info->hostname,
232 .program = &mnt_program,
233 .version = info->version,
234 .authflavor = RPC_AUTH_UNIX,
235 .flags = RPC_CLNT_CREATE_NOPING,
236 };
237 struct mountres result;
238 struct rpc_message msg = {
239 .rpc_argp = info->dirpath,
240 .rpc_resp = &result,
241 };
242 struct rpc_clnt *clnt;
243 int status;
244
245 if (info->noresvport)
246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
247
248 clnt = rpc_create(&args);
249 if (unlikely(IS_ERR(clnt)))
250 goto out_clnt_err;
251
252 dprintk("NFS: sending UMNT request for %s:%s\n",
253 (info->hostname ? info->hostname : "server"), info->dirpath);
254
255 if (info->version == NFS_MNT3_VERSION)
256 msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
257 else
258 msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
259
260 status = rpc_call_sync(clnt, &msg, 0);
261 rpc_shutdown_client(clnt);
262
263 if (unlikely(status < 0))
264 goto out_call_err;
265
266 return;
267
268out_clnt_err:
269 dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
270 PTR_ERR(clnt));
271 return;
272
273out_call_err:
274 dprintk("NFS: UMNT request failed, status=%d\n", status);
275}
276
212/* 277/*
213 * XDR encode/decode functions for MOUNT 278 * XDR encode/decode functions for MOUNT
214 */ 279 */
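nfs_umount treats UMNT as best effort: one-second initial timeout, two retries, always UDP, and every failure path just logs and returns void. The following is only a userspace analogue of that "advisory notification" shape, not the RPC call itself; the datagram payload and port 635 are placeholders:

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <sys/time.h>
	#include <unistd.h>

	/* Advisory notify: bounded wait, no error reaches the caller. */
	static void notify_umount(const char *host, const char *path)
	{
		struct sockaddr_in sin;
		struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		if (fd < 0)
			return;			/* failure is not fatal */
		setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_port = htons(635);	/* illustrative mountd port */
		inet_pton(AF_INET, host, &sin.sin_addr);

		if (sendto(fd, path, strlen(path), 0,
			   (struct sockaddr *)&sin, sizeof(sin)) < 0)
			fprintf(stderr, "umount notify failed (ignored)\n");
		close(fd);
	}

	int main(void)
	{
		notify_umount("192.0.2.1", "/export/home");
		return 0;
	}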
@@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
258 return -EIO; 323 return -EIO;
259 status = ntohl(*p); 324 status = ntohl(*p);
260 325
261 for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) { 326 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
262 if (mnt_errtbl[i].status == status) { 327 if (mnt_errtbl[i].status == status) {
263 res->errno = mnt_errtbl[i].errno; 328 res->errno = mnt_errtbl[i].errno;
264 return 0; 329 return 0;
@@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
309 return -EIO; 374 return -EIO;
310 status = ntohl(*p); 375 status = ntohl(*p);
311 376
312 for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) { 377 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
313 if (mnt3_errtbl[i].status == status) { 378 if (mnt3_errtbl[i].status == status) {
314 res->errno = mnt3_errtbl[i].errno; 379 res->errno = mnt3_errtbl[i].errno;
315 return 0; 380 return 0;
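Both error-table scans above fix the same off-by-one: "i <= ARRAY_SIZE(tbl)" visits one entry past the end of the array, so the last iteration compares against garbage. A standalone demonstration of the corrected bound:

	#include <stdio.h>

	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	struct errmap { unsigned status; int err; };

	static const struct errmap tbl[] = {
		{ 1, -1 }, { 2, -2 }, { 13, -13 },
	};

	static int map_status(unsigned status)
	{
		size_t i;

		/* '<' visits 0..N-1; '<=' would read tbl[N], past the end. */
		for (i = 0; i < ARRAY_SIZE(tbl); i++)
			if (tbl[i].status == status)
				return tbl[i].err;
		return -5;	/* fallback, like -EIO in the kernel code */
	}

	int main(void)
	{
		printf("%d\n", map_status(13));	/* -13 */
		printf("%d\n", map_status(99));	/* -5  */
		return 0;
	}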
@@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = {
407 .p_statidx = MOUNTPROC_MNT, 472 .p_statidx = MOUNTPROC_MNT,
408 .p_name = "MOUNT", 473 .p_name = "MOUNT",
409 }, 474 },
475 [MOUNTPROC_UMNT] = {
476 .p_proc = MOUNTPROC_UMNT,
477 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
478 .p_arglen = MNT_enc_dirpath_sz,
479 .p_statidx = MOUNTPROC_UMNT,
480 .p_name = "UMOUNT",
481 },
410}; 482};
411 483
412static struct rpc_procinfo mnt3_procedures[] = { 484static struct rpc_procinfo mnt3_procedures[] = {
@@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
419 .p_statidx = MOUNTPROC3_MNT, 491 .p_statidx = MOUNTPROC3_MNT,
420 .p_name = "MOUNT", 492 .p_name = "MOUNT",
421 }, 493 },
494 [MOUNTPROC3_UMNT] = {
495 .p_proc = MOUNTPROC3_UMNT,
496 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
497 .p_arglen = MNT_enc_dirpath_sz,
498 .p_statidx = MOUNTPROC3_UMNT,
499 .p_name = "UMOUNT",
500 },
422}; 501};
423 502
424 503
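The new UMNT entries slot into the procedure tables by designated index, so array position always equals procedure number no matter where the entry is written. The idiom reduced to plain C, with an invented structure:

	#include <stdio.h>

	enum { PROC_NULL = 0, PROC_MNT = 1, PROC_UMNT = 3 };

	struct procinfo {
		unsigned proc;
		const char *name;
	};

	/* Designated initializers: unlisted slots (index 2) are zeroed. */
	static const struct procinfo procs[] = {
		[PROC_MNT]  = { .proc = PROC_MNT,  .name = "MOUNT"  },
		[PROC_UMNT] = { .proc = PROC_UMNT, .name = "UMOUNT" },
	};

	int main(void)
	{
		printf("%s\n", procs[PROC_UMNT].name);	/* UMOUNT */
		printf("%s\n", procs[2].name ? procs[2].name : "(hole)");
		return 0;
	}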
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d0cc5ce0edfe..ee6a13f05443 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -299,7 +299,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
299 299
300/* 300/*
301 * Create a regular file. 301 * Create a regular file.
302 * For now, we don't implement O_EXCL.
303 */ 302 */
304static int 303static int
305nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 304nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2a2a0a7143ad..2636c26d56fa 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -17,6 +17,7 @@
17#include <linux/inet.h> 17#include <linux/inet.h>
18#include "internal.h" 18#include "internal.h"
19#include "nfs4_fs.h" 19#include "nfs4_fs.h"
20#include "dns_resolve.h"
20 21
21#define NFSDBG_FACILITY NFSDBG_VFS 22#define NFSDBG_FACILITY NFSDBG_VFS
22 23
@@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
95 return 0; 96 return 0;
96} 97}
97 98
99static size_t nfs_parse_server_name(char *string, size_t len,
100 struct sockaddr *sa, size_t salen)
101{
102 ssize_t ret;
103
104 ret = rpc_pton(string, len, sa, salen);
105 if (ret == 0) {
106 ret = nfs_dns_resolve_name(string, len, sa, salen);
107 if (ret < 0)
108 ret = 0;
109 }
110 return ret;
111}
112
98static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, 113static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
99 char *page, char *page2, 114 char *page, char *page2,
100 const struct nfs4_fs_location *location) 115 const struct nfs4_fs_location *location)
@@ -121,11 +136,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
121 136
122 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) 137 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
123 continue; 138 continue;
124 nfs_parse_ip_address(buf->data, buf->len, 139 mountdata->addrlen = nfs_parse_server_name(buf->data,
125 mountdata->addr, &mountdata->addrlen); 140 buf->len,
126 if (mountdata->addr->sa_family == AF_UNSPEC) 141 mountdata->addr, mountdata->addrlen);
142 if (mountdata->addrlen == 0)
127 continue; 143 continue;
128 nfs_set_port(mountdata->addr, NFS_PORT); 144 rpc_set_port(mountdata->addr, NFS_PORT);
129 145
130 memcpy(page2, buf->data, buf->len); 146 memcpy(page2, buf->data, buf->len);
131 page2[buf->len] = '\0'; 147 page2[buf->len] = '\0';
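nfs_parse_server_name first tries a literal address (rpc_pton) and only falls back to the new DNS resolver when that yields nothing, normalizing resolver errors to 0 so the caller has a single "no address" test. The same two-stage fallback in portable C, with inet_pton and getaddrinfo standing in for the kernel helpers:

	#include <arpa/inet.h>
	#include <netdb.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>

	/* Return the sockaddr length on success, 0 if the string resolves
	 * neither literally nor via DNS -- the contract used above. */
	static size_t parse_server_name(const char *name,
					struct sockaddr_storage *ss)
	{
		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
		struct addrinfo *ai;
		size_t len;

		/* Stage 1: literal address, no network traffic. */
		if (inet_pton(AF_INET, name, &sin->sin_addr) == 1) {
			sin->sin_family = AF_INET;
			return sizeof(*sin);
		}
		/* Stage 2: resolver fallback; any failure collapses to 0. */
		if (getaddrinfo(name, NULL, NULL, &ai) != 0)
			return 0;
		len = ai->ai_addrlen;
		memcpy(ss, ai->ai_addr, len);
		freeaddrinfo(ai);
		return len;
	}

	int main(void)
	{
		struct sockaddr_storage ss;

		printf("literal: %zu\n", parse_server_name("192.0.2.7", &ss));
		printf("bogus:   %zu\n", parse_server_name("no.such.invalid", &ss));
		return 0;
	}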
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6917311f201c..be6544aef41f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -61,6 +61,8 @@
61#define NFS4_POLL_RETRY_MIN (HZ/10) 61#define NFS4_POLL_RETRY_MIN (HZ/10)
62#define NFS4_POLL_RETRY_MAX (15*HZ) 62#define NFS4_POLL_RETRY_MAX (15*HZ)
63 63
64#define NFS4_MAX_LOOP_ON_RECOVER (10)
65
64struct nfs4_opendata; 66struct nfs4_opendata;
65static int _nfs4_proc_open(struct nfs4_opendata *data); 67static int _nfs4_proc_open(struct nfs4_opendata *data);
66static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 68static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
@@ -426,17 +428,19 @@ out:
426static int nfs4_recover_session(struct nfs4_session *session) 428static int nfs4_recover_session(struct nfs4_session *session)
427{ 429{
428 struct nfs_client *clp = session->clp; 430 struct nfs_client *clp = session->clp;
431 unsigned int loop;
429 int ret; 432 int ret;
430 433
431 for (;;) { 434 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
432 ret = nfs4_wait_clnt_recover(clp); 435 ret = nfs4_wait_clnt_recover(clp);
433 if (ret != 0) 436 if (ret != 0)
434 return ret; 437 break;
435 if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) 438 if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
436 break; 439 break;
437 nfs4_schedule_state_manager(clp); 440 nfs4_schedule_state_manager(clp);
441 ret = -EIO;
438 } 442 }
439 return 0; 443 return ret;
440} 444}
441 445
442static int nfs41_setup_sequence(struct nfs4_session *session, 446static int nfs41_setup_sequence(struct nfs4_session *session,
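Both recovery loops here and below trade "for (;;)" for a bounded countdown: ret is preloaded with -EIO before each retry, so exhausting NFS4_MAX_LOOP_ON_RECOVER attempts reports failure instead of spinning forever or falsely returning 0. The control shape, isolated with stub helpers:

	#include <stdio.h>

	#define MAX_LOOPS 10
	#define EIO 5

	static int wait_for_recovery(void) { return 0; }
	static int state_still_bad(int attempt) { return attempt < 3; }

	static int recover(void)
	{
		unsigned int loop;
		int attempt = 0;
		int ret = -EIO;

		for (loop = MAX_LOOPS; loop != 0; loop--) {
			ret = wait_for_recovery();
			if (ret != 0)
				break;		/* waiting itself failed */
			if (!state_still_bad(attempt++))
				break;		/* recovered, ret == 0 */
			/* still bad: reschedule and presume failure */
			ret = -EIO;
		}
		return ret;	/* -EIO if the countdown ran out */
	}

	int main(void)
	{
		printf("recover() = %d\n", recover());
		return 0;
	}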
@@ -1444,18 +1448,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1444static int nfs4_recover_expired_lease(struct nfs_server *server) 1448static int nfs4_recover_expired_lease(struct nfs_server *server)
1445{ 1449{
1446 struct nfs_client *clp = server->nfs_client; 1450 struct nfs_client *clp = server->nfs_client;
1451 unsigned int loop;
1447 int ret; 1452 int ret;
1448 1453
1449 for (;;) { 1454 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
1450 ret = nfs4_wait_clnt_recover(clp); 1455 ret = nfs4_wait_clnt_recover(clp);
1451 if (ret != 0) 1456 if (ret != 0)
1452 return ret; 1457 break;
1453 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && 1458 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1454 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) 1459 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1455 break; 1460 break;
1456 nfs4_schedule_state_recovery(clp); 1461 nfs4_schedule_state_recovery(clp);
1462 ret = -EIO;
1457 } 1463 }
1458 return 0; 1464 return ret;
1459} 1465}
1460 1466
1461/* 1467/*
@@ -1997,12 +2003,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1997 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2003 status = nfs4_call_sync(server, &msg, &args, &res, 0);
1998 if (status == 0) { 2004 if (status == 0) {
1999 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); 2005 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
2006 server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
2007 NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
2008 NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
2009 NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
2010 NFS_CAP_CTIME|NFS_CAP_MTIME);
2000 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) 2011 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
2001 server->caps |= NFS_CAP_ACLS; 2012 server->caps |= NFS_CAP_ACLS;
2002 if (res.has_links != 0) 2013 if (res.has_links != 0)
2003 server->caps |= NFS_CAP_HARDLINKS; 2014 server->caps |= NFS_CAP_HARDLINKS;
2004 if (res.has_symlinks != 0) 2015 if (res.has_symlinks != 0)
2005 server->caps |= NFS_CAP_SYMLINKS; 2016 server->caps |= NFS_CAP_SYMLINKS;
2017 if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
2018 server->caps |= NFS_CAP_FILEID;
2019 if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
2020 server->caps |= NFS_CAP_MODE;
2021 if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
2022 server->caps |= NFS_CAP_NLINK;
2023 if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
2024 server->caps |= NFS_CAP_OWNER;
2025 if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
2026 server->caps |= NFS_CAP_OWNER_GROUP;
2027 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
2028 server->caps |= NFS_CAP_ATIME;
2029 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
2030 server->caps |= NFS_CAP_CTIME;
2031 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
2032 server->caps |= NFS_CAP_MTIME;
2033
2006 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); 2034 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
2007 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; 2035 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
2008 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; 2036 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
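_nfs4_server_capabilities now clears the whole block of attribute-derived capability bits before re-deriving each one from the returned bitmask, so a capability the server stopped advertising is dropped rather than left stale. The clear-then-set pattern in isolation, with made-up bit values:

	#include <stdio.h>

	#define CAP_ACLS   0x1
	#define CAP_FILEID 0x2
	#define CAP_MODE   0x4

	#define WORD0_ACL    0x10
	#define WORD0_FILEID 0x20
	#define WORD1_MODE   0x01

	static unsigned refresh_caps(unsigned caps, unsigned bm0, unsigned bm1)
	{
		/* Step 1: wipe every bit this probe is authoritative for. */
		caps &= ~(CAP_ACLS | CAP_FILEID | CAP_MODE);

		/* Step 2: set back only what the server advertised now. */
		if (bm0 & WORD0_ACL)
			caps |= CAP_ACLS;
		if (bm0 & WORD0_FILEID)
			caps |= CAP_FILEID;
		if (bm1 & WORD1_MODE)
			caps |= CAP_MODE;
		return caps;
	}

	int main(void)
	{
		/* Had ACLs before; new probe advertises only fileid+mode. */
		unsigned caps = refresh_caps(CAP_ACLS, WORD0_FILEID, WORD1_MODE);

		printf("caps = %#x\n", caps);	/* 0x6: ACL bit dropped */
		return 0;
	}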
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 65ca8c18476f..1434080aefeb 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1250,8 +1250,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
1250 continue; 1250 continue;
1251 } 1251 }
1252 /* Initialize or reset the session */ 1252 /* Initialize or reset the session */
1253 if (nfs4_has_session(clp) && 1253 if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)
1254 test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { 1254 && nfs4_has_session(clp)) {
1255 if (clp->cl_cons_state == NFS_CS_SESSION_INITING) 1255 if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
1256 status = nfs4_initialize_session(clp); 1256 status = nfs4_initialize_session(clp);
1257 else 1257 else
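Swapping the && operands above is not cosmetic: test_and_clear_bit has a side effect, and putting it first guarantees the SESSION_SETUP flag is consumed even on clients without sessions, where short-circuiting on nfs4_has_session() used to leave it set. A minimal illustration of why operand order matters when one operand mutates state:

	#include <stdio.h>

	static int flag = 1;

	/* Mimics test_and_clear_bit: returns old value, clears the flag. */
	static int test_and_clear(void) { int old = flag; flag = 0; return old; }
	static int has_session(void)    { return 0; }	/* v4.0-only client */

	int main(void)
	{
		int taken;

		flag = 1;
		taken = has_session() && test_and_clear();	/* old order */
		printf("old order: taken=%d flag=%d (stale, never cleared)\n",
		       taken, flag);

		flag = 1;
		taken = test_and_clear() && has_session();	/* new order */
		printf("new order: taken=%d flag=%d (always consumed)\n",
		       taken, flag);
		return 0;
	}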
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e7d47f..cfc30d362f94 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -702,29 +702,12 @@ struct compound_hdr {
702 u32 minorversion; 702 u32 minorversion;
703}; 703};
704 704
705/* 705static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
706 * START OF "GENERIC" ENCODE ROUTINES. 706{
707 * These may look a little ugly since they are imported from a "generic" 707 __be32 *p = xdr_reserve_space(xdr, nbytes);
708 * set of XDR encode/decode routines which are intended to be shared by 708 BUG_ON(!p);
709 * all of our NFSv4 implementations (OpenBSD, MacOS X...). 709 return p;
710 * 710}
711 * If the pain of reading these is too great, it should be a straightforward
712 * task to translate them into Linux-specific versions which are more
713 * consistent with the style used in NFSv2/v3...
714 */
715#define WRITE32(n) *p++ = htonl(n)
716#define WRITE64(n) do { \
717 *p++ = htonl((uint32_t)((n) >> 32)); \
718 *p++ = htonl((uint32_t)(n)); \
719} while (0)
720#define WRITEMEM(ptr,nbytes) do { \
721 p = xdr_encode_opaque_fixed(p, ptr, nbytes); \
722} while (0)
723
724#define RESERVE_SPACE(nbytes) do { \
725 p = xdr_reserve_space(xdr, nbytes); \
726 BUG_ON(!p); \
727} while (0)
728 711
729static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 712static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
730{ 713{
@@ -749,12 +732,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
749 732
750 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); 733 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
751 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 734 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
752 RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); 735 p = reserve_space(xdr, 4 + hdr->taglen + 8);
753 WRITE32(hdr->taglen); 736 p = xdr_encode_opaque(p, hdr->tag, hdr->taglen);
754 WRITEMEM(hdr->tag, hdr->taglen); 737 *p++ = cpu_to_be32(hdr->minorversion);
755 WRITE32(hdr->minorversion);
756 hdr->nops_p = p; 738 hdr->nops_p = p;
757 WRITE32(hdr->nops); 739 *p = cpu_to_be32(hdr->nops);
758} 740}
759 741
760static void encode_nops(struct compound_hdr *hdr) 742static void encode_nops(struct compound_hdr *hdr)
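Everything that follows in nfs4xdr.c is one mechanical translation: RESERVE_SPACE/WRITE32/WRITE64/WRITEMEM, which hid the cursor p behind macros, become an explicit p = reserve_space(...) plus open-coded stores. The correspondences, shown with portable stand-ins (htonl plays the role of cpu_to_be32, and the helper below mirrors what xdr_encode_hyper does to a 64-bit value):

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in for xdr_encode_hyper(): u64 as two big-endian words. */
	static uint32_t *encode_hyper(uint32_t *p, uint64_t v)
	{
		*p++ = htonl((uint32_t)(v >> 32));
		*p++ = htonl((uint32_t)v);
		return p;
	}

	int main(void)
	{
		uint32_t buf[4];
		uint32_t *p = buf;	/* reserve_space(xdr, 16) analogue */

		*p++ = htonl(25);			/* was: WRITE32(opcode) */
		p = encode_hyper(p, 0x123456789ULL);	/* was: WRITE64(offset) */
		*p = htonl(4096);			/* was: WRITE32(count) */

		printf("%08x %08x %08x %08x\n",
		       ntohl(buf[0]), ntohl(buf[1]),
		       ntohl(buf[2]), ntohl(buf[3]));
		return 0;
	}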
@@ -829,55 +811,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
829 len += 16; 811 len += 16;
830 else if (iap->ia_valid & ATTR_MTIME) 812 else if (iap->ia_valid & ATTR_MTIME)
831 len += 4; 813 len += 4;
832 RESERVE_SPACE(len); 814 p = reserve_space(xdr, len);
833 815
834 /* 816 /*
835 * We write the bitmap length now, but leave the bitmap and the attribute 817 * We write the bitmap length now, but leave the bitmap and the attribute
836 * buffer length to be backfilled at the end of this routine. 818 * buffer length to be backfilled at the end of this routine.
837 */ 819 */
838 WRITE32(2); 820 *p++ = cpu_to_be32(2);
839 q = p; 821 q = p;
840 p += 3; 822 p += 3;
841 823
842 if (iap->ia_valid & ATTR_SIZE) { 824 if (iap->ia_valid & ATTR_SIZE) {
843 bmval0 |= FATTR4_WORD0_SIZE; 825 bmval0 |= FATTR4_WORD0_SIZE;
844 WRITE64(iap->ia_size); 826 p = xdr_encode_hyper(p, iap->ia_size);
845 } 827 }
846 if (iap->ia_valid & ATTR_MODE) { 828 if (iap->ia_valid & ATTR_MODE) {
847 bmval1 |= FATTR4_WORD1_MODE; 829 bmval1 |= FATTR4_WORD1_MODE;
848 WRITE32(iap->ia_mode & S_IALLUGO); 830 *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
849 } 831 }
850 if (iap->ia_valid & ATTR_UID) { 832 if (iap->ia_valid & ATTR_UID) {
851 bmval1 |= FATTR4_WORD1_OWNER; 833 bmval1 |= FATTR4_WORD1_OWNER;
852 WRITE32(owner_namelen); 834 p = xdr_encode_opaque(p, owner_name, owner_namelen);
853 WRITEMEM(owner_name, owner_namelen);
854 } 835 }
855 if (iap->ia_valid & ATTR_GID) { 836 if (iap->ia_valid & ATTR_GID) {
856 bmval1 |= FATTR4_WORD1_OWNER_GROUP; 837 bmval1 |= FATTR4_WORD1_OWNER_GROUP;
857 WRITE32(owner_grouplen); 838 p = xdr_encode_opaque(p, owner_group, owner_grouplen);
858 WRITEMEM(owner_group, owner_grouplen);
859 } 839 }
860 if (iap->ia_valid & ATTR_ATIME_SET) { 840 if (iap->ia_valid & ATTR_ATIME_SET) {
861 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 841 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
862 WRITE32(NFS4_SET_TO_CLIENT_TIME); 842 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
863 WRITE32(0); 843 *p++ = cpu_to_be32(0);
864 WRITE32(iap->ia_mtime.tv_sec); 844 *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
865 WRITE32(iap->ia_mtime.tv_nsec); 845 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
866 } 846 }
867 else if (iap->ia_valid & ATTR_ATIME) { 847 else if (iap->ia_valid & ATTR_ATIME) {
868 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 848 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
869 WRITE32(NFS4_SET_TO_SERVER_TIME); 849 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
870 } 850 }
871 if (iap->ia_valid & ATTR_MTIME_SET) { 851 if (iap->ia_valid & ATTR_MTIME_SET) {
872 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 852 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
873 WRITE32(NFS4_SET_TO_CLIENT_TIME); 853 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
874 WRITE32(0); 854 *p++ = cpu_to_be32(0);
875 WRITE32(iap->ia_mtime.tv_sec); 855 *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
876 WRITE32(iap->ia_mtime.tv_nsec); 856 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
877 } 857 }
878 else if (iap->ia_valid & ATTR_MTIME) { 858 else if (iap->ia_valid & ATTR_MTIME) {
879 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 859 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
880 WRITE32(NFS4_SET_TO_SERVER_TIME); 860 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
881 } 861 }
882 862
883 /* 863 /*
@@ -891,7 +871,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
891 len = (char *)p - (char *)q - 12; 871 len = (char *)p - (char *)q - 12;
892 *q++ = htonl(bmval0); 872 *q++ = htonl(bmval0);
893 *q++ = htonl(bmval1); 873 *q++ = htonl(bmval1);
894 *q++ = htonl(len); 874 *q = htonl(len);
895 875
896/* out: */ 876/* out: */
897} 877}
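encode_attrs keeps its backfill trick through the conversion: q is parked at the bitmap words, p walks ahead writing attribute data, and only then are the bitmap values and byte length written back through q. The shape of that two-cursor encode, in a simplified sketch with one fake attribute:

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t buf[8];
		uint32_t *p = buf, *q;
		uint32_t bitmap = 0;

		*p++ = htonl(2);	/* bitmap word count, known up front */
		q = p;			/* remember where bitmap + length go */
		p += 3;			/* skip bmval0, bmval1, attr length */

		/* Encode one optional attribute; note the bit it occupies. */
		bitmap |= 0x10;
		*p++ = htonl(0644);

		/* Backfill now that the payload size is known. */
		*q++ = htonl(bitmap);
		*q++ = htonl(0);
		*q = htonl((uint32_t)((char *)p - (char *)q) - 4);

		printf("encoded %td words\n", p - buf);
		return 0;
	}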
@@ -900,9 +880,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
900{ 880{
901 __be32 *p; 881 __be32 *p;
902 882
903 RESERVE_SPACE(8); 883 p = reserve_space(xdr, 8);
904 WRITE32(OP_ACCESS); 884 *p++ = cpu_to_be32(OP_ACCESS);
905 WRITE32(access); 885 *p = cpu_to_be32(access);
906 hdr->nops++; 886 hdr->nops++;
907 hdr->replen += decode_access_maxsz; 887 hdr->replen += decode_access_maxsz;
908} 888}
@@ -911,10 +891,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
911{ 891{
912 __be32 *p; 892 __be32 *p;
913 893
914 RESERVE_SPACE(8+NFS4_STATEID_SIZE); 894 p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
915 WRITE32(OP_CLOSE); 895 *p++ = cpu_to_be32(OP_CLOSE);
916 WRITE32(arg->seqid->sequence->counter); 896 *p++ = cpu_to_be32(arg->seqid->sequence->counter);
917 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 897 xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
918 hdr->nops++; 898 hdr->nops++;
919 hdr->replen += decode_close_maxsz; 899 hdr->replen += decode_close_maxsz;
920} 900}
@@ -923,10 +903,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
923{ 903{
924 __be32 *p; 904 __be32 *p;
925 905
926 RESERVE_SPACE(16); 906 p = reserve_space(xdr, 16);
927 WRITE32(OP_COMMIT); 907 *p++ = cpu_to_be32(OP_COMMIT);
928 WRITE64(args->offset); 908 p = xdr_encode_hyper(p, args->offset);
929 WRITE32(args->count); 909 *p = cpu_to_be32(args->count);
930 hdr->nops++; 910 hdr->nops++;
931 hdr->replen += decode_commit_maxsz; 911 hdr->replen += decode_commit_maxsz;
932} 912}
@@ -935,30 +915,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
935{ 915{
936 __be32 *p; 916 __be32 *p;
937 917
938 RESERVE_SPACE(8); 918 p = reserve_space(xdr, 8);
939 WRITE32(OP_CREATE); 919 *p++ = cpu_to_be32(OP_CREATE);
940 WRITE32(create->ftype); 920 *p = cpu_to_be32(create->ftype);
941 921
942 switch (create->ftype) { 922 switch (create->ftype) {
943 case NF4LNK: 923 case NF4LNK:
944 RESERVE_SPACE(4); 924 p = reserve_space(xdr, 4);
945 WRITE32(create->u.symlink.len); 925 *p = cpu_to_be32(create->u.symlink.len);
946 xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); 926 xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
947 break; 927 break;
948 928
949 case NF4BLK: case NF4CHR: 929 case NF4BLK: case NF4CHR:
950 RESERVE_SPACE(8); 930 p = reserve_space(xdr, 8);
951 WRITE32(create->u.device.specdata1); 931 *p++ = cpu_to_be32(create->u.device.specdata1);
952 WRITE32(create->u.device.specdata2); 932 *p = cpu_to_be32(create->u.device.specdata2);
953 break; 933 break;
954 934
955 default: 935 default:
956 break; 936 break;
957 } 937 }
958 938
959 RESERVE_SPACE(4 + create->name->len); 939 encode_string(xdr, create->name->len, create->name->name);
960 WRITE32(create->name->len);
961 WRITEMEM(create->name->name, create->name->len);
962 hdr->nops++; 940 hdr->nops++;
963 hdr->replen += decode_create_maxsz; 941 hdr->replen += decode_create_maxsz;
964 942
@@ -969,10 +947,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
969{ 947{
970 __be32 *p; 948 __be32 *p;
971 949
972 RESERVE_SPACE(12); 950 p = reserve_space(xdr, 12);
973 WRITE32(OP_GETATTR); 951 *p++ = cpu_to_be32(OP_GETATTR);
974 WRITE32(1); 952 *p++ = cpu_to_be32(1);
975 WRITE32(bitmap); 953 *p = cpu_to_be32(bitmap);
976 hdr->nops++; 954 hdr->nops++;
977 hdr->replen += decode_getattr_maxsz; 955 hdr->replen += decode_getattr_maxsz;
978} 956}
@@ -981,11 +959,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
981{ 959{
982 __be32 *p; 960 __be32 *p;
983 961
984 RESERVE_SPACE(16); 962 p = reserve_space(xdr, 16);
985 WRITE32(OP_GETATTR); 963 *p++ = cpu_to_be32(OP_GETATTR);
986 WRITE32(2); 964 *p++ = cpu_to_be32(2);
987 WRITE32(bm0); 965 *p++ = cpu_to_be32(bm0);
988 WRITE32(bm1); 966 *p = cpu_to_be32(bm1);
989 hdr->nops++; 967 hdr->nops++;
990 hdr->replen += decode_getattr_maxsz; 968 hdr->replen += decode_getattr_maxsz;
991} 969}
@@ -1012,8 +990,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1012{ 990{
1013 __be32 *p; 991 __be32 *p;
1014 992
1015 RESERVE_SPACE(4); 993 p = reserve_space(xdr, 4);
1016 WRITE32(OP_GETFH); 994 *p = cpu_to_be32(OP_GETFH);
1017 hdr->nops++; 995 hdr->nops++;
1018 hdr->replen += decode_getfh_maxsz; 996 hdr->replen += decode_getfh_maxsz;
1019} 997}
@@ -1022,10 +1000,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
1022{ 1000{
1023 __be32 *p; 1001 __be32 *p;
1024 1002
1025 RESERVE_SPACE(8 + name->len); 1003 p = reserve_space(xdr, 8 + name->len);
1026 WRITE32(OP_LINK); 1004 *p++ = cpu_to_be32(OP_LINK);
1027 WRITE32(name->len); 1005 xdr_encode_opaque(p, name->name, name->len);
1028 WRITEMEM(name->name, name->len);
1029 hdr->nops++; 1006 hdr->nops++;
1030 hdr->replen += decode_link_maxsz; 1007 hdr->replen += decode_link_maxsz;
1031} 1008}
@@ -1052,27 +1029,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1052{ 1029{
1053 __be32 *p; 1030 __be32 *p;
1054 1031
1055 RESERVE_SPACE(32); 1032 p = reserve_space(xdr, 32);
1056 WRITE32(OP_LOCK); 1033 *p++ = cpu_to_be32(OP_LOCK);
1057 WRITE32(nfs4_lock_type(args->fl, args->block)); 1034 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
1058 WRITE32(args->reclaim); 1035 *p++ = cpu_to_be32(args->reclaim);
1059 WRITE64(args->fl->fl_start); 1036 p = xdr_encode_hyper(p, args->fl->fl_start);
1060 WRITE64(nfs4_lock_length(args->fl)); 1037 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1061 WRITE32(args->new_lock_owner); 1038 *p = cpu_to_be32(args->new_lock_owner);
1062 if (args->new_lock_owner){ 1039 if (args->new_lock_owner){
1063 RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); 1040 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
1064 WRITE32(args->open_seqid->sequence->counter); 1041 *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
1065 WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); 1042 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
1066 WRITE32(args->lock_seqid->sequence->counter); 1043 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
1067 WRITE64(args->lock_owner.clientid); 1044 p = xdr_encode_hyper(p, args->lock_owner.clientid);
1068 WRITE32(16); 1045 *p++ = cpu_to_be32(16);
1069 WRITEMEM("lock id:", 8); 1046 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1070 WRITE64(args->lock_owner.id); 1047 xdr_encode_hyper(p, args->lock_owner.id);
1071 } 1048 }
1072 else { 1049 else {
1073 RESERVE_SPACE(NFS4_STATEID_SIZE+4); 1050 p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
1074 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); 1051 p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
1075 WRITE32(args->lock_seqid->sequence->counter); 1052 *p = cpu_to_be32(args->lock_seqid->sequence->counter);
1076 } 1053 }
1077 hdr->nops++; 1054 hdr->nops++;
1078 hdr->replen += decode_lock_maxsz; 1055 hdr->replen += decode_lock_maxsz;
@@ -1082,15 +1059,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
1082{ 1059{
1083 __be32 *p; 1060 __be32 *p;
1084 1061
1085 RESERVE_SPACE(52); 1062 p = reserve_space(xdr, 52);
1086 WRITE32(OP_LOCKT); 1063 *p++ = cpu_to_be32(OP_LOCKT);
1087 WRITE32(nfs4_lock_type(args->fl, 0)); 1064 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1088 WRITE64(args->fl->fl_start); 1065 p = xdr_encode_hyper(p, args->fl->fl_start);
1089 WRITE64(nfs4_lock_length(args->fl)); 1066 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1090 WRITE64(args->lock_owner.clientid); 1067 p = xdr_encode_hyper(p, args->lock_owner.clientid);
1091 WRITE32(16); 1068 *p++ = cpu_to_be32(16);
1092 WRITEMEM("lock id:", 8); 1069 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1093 WRITE64(args->lock_owner.id); 1070 xdr_encode_hyper(p, args->lock_owner.id);
1094 hdr->nops++; 1071 hdr->nops++;
1095 hdr->replen += decode_lockt_maxsz; 1072 hdr->replen += decode_lockt_maxsz;
1096} 1073}
@@ -1099,13 +1076,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
1099{ 1076{
1100 __be32 *p; 1077 __be32 *p;
1101 1078
1102 RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); 1079 p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16);
1103 WRITE32(OP_LOCKU); 1080 *p++ = cpu_to_be32(OP_LOCKU);
1104 WRITE32(nfs4_lock_type(args->fl, 0)); 1081 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1105 WRITE32(args->seqid->sequence->counter); 1082 *p++ = cpu_to_be32(args->seqid->sequence->counter);
1106 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); 1083 p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
1107 WRITE64(args->fl->fl_start); 1084 p = xdr_encode_hyper(p, args->fl->fl_start);
1108 WRITE64(nfs4_lock_length(args->fl)); 1085 xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1109 hdr->nops++; 1086 hdr->nops++;
1110 hdr->replen += decode_locku_maxsz; 1087 hdr->replen += decode_locku_maxsz;
1111} 1088}
@@ -1115,10 +1092,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
1115 int len = name->len; 1092 int len = name->len;
1116 __be32 *p; 1093 __be32 *p;
1117 1094
1118 RESERVE_SPACE(8 + len); 1095 p = reserve_space(xdr, 8 + len);
1119 WRITE32(OP_LOOKUP); 1096 *p++ = cpu_to_be32(OP_LOOKUP);
1120 WRITE32(len); 1097 xdr_encode_opaque(p, name->name, len);
1121 WRITEMEM(name->name, len);
1122 hdr->nops++; 1098 hdr->nops++;
1123 hdr->replen += decode_lookup_maxsz; 1099 hdr->replen += decode_lookup_maxsz;
1124} 1100}
@@ -1127,21 +1103,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
1127{ 1103{
1128 __be32 *p; 1104 __be32 *p;
1129 1105
1130 RESERVE_SPACE(8); 1106 p = reserve_space(xdr, 8);
1131 switch (fmode & (FMODE_READ|FMODE_WRITE)) { 1107 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
1132 case FMODE_READ: 1108 case FMODE_READ:
1133 WRITE32(NFS4_SHARE_ACCESS_READ); 1109 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
1134 break; 1110 break;
1135 case FMODE_WRITE: 1111 case FMODE_WRITE:
1136 WRITE32(NFS4_SHARE_ACCESS_WRITE); 1112 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
1137 break; 1113 break;
1138 case FMODE_READ|FMODE_WRITE: 1114 case FMODE_READ|FMODE_WRITE:
1139 WRITE32(NFS4_SHARE_ACCESS_BOTH); 1115 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
1140 break; 1116 break;
1141 default: 1117 default:
1142 WRITE32(0); 1118 *p++ = cpu_to_be32(0);
1143 } 1119 }
1144 WRITE32(0); /* for linux, share_deny = 0 always */ 1120 *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
1145} 1121}
1146 1122
1147static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1123static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1151,29 +1127,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1151 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, 1127 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
1152 * owner 4 = 32 1128 * owner 4 = 32
1153 */ 1129 */
1154 RESERVE_SPACE(8); 1130 p = reserve_space(xdr, 8);
1155 WRITE32(OP_OPEN); 1131 *p++ = cpu_to_be32(OP_OPEN);
1156 WRITE32(arg->seqid->sequence->counter); 1132 *p = cpu_to_be32(arg->seqid->sequence->counter);
1157 encode_share_access(xdr, arg->fmode); 1133 encode_share_access(xdr, arg->fmode);
1158 RESERVE_SPACE(28); 1134 p = reserve_space(xdr, 28);
1159 WRITE64(arg->clientid); 1135 p = xdr_encode_hyper(p, arg->clientid);
1160 WRITE32(16); 1136 *p++ = cpu_to_be32(16);
1161 WRITEMEM("open id:", 8); 1137 p = xdr_encode_opaque_fixed(p, "open id:", 8);
1162 WRITE64(arg->id); 1138 xdr_encode_hyper(p, arg->id);
1163} 1139}
1164 1140
1165static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1141static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
1166{ 1142{
1167 __be32 *p; 1143 __be32 *p;
1168 1144
1169 RESERVE_SPACE(4); 1145 p = reserve_space(xdr, 4);
1170 switch(arg->open_flags & O_EXCL) { 1146 switch(arg->open_flags & O_EXCL) {
1171 case 0: 1147 case 0:
1172 WRITE32(NFS4_CREATE_UNCHECKED); 1148 *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
1173 encode_attrs(xdr, arg->u.attrs, arg->server); 1149 encode_attrs(xdr, arg->u.attrs, arg->server);
1174 break; 1150 break;
1175 default: 1151 default:
1176 WRITE32(NFS4_CREATE_EXCLUSIVE); 1152 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
1177 encode_nfs4_verifier(xdr, &arg->u.verifier); 1153 encode_nfs4_verifier(xdr, &arg->u.verifier);
1178 } 1154 }
1179} 1155}
@@ -1182,14 +1158,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1182{ 1158{
1183 __be32 *p; 1159 __be32 *p;
1184 1160
1185 RESERVE_SPACE(4); 1161 p = reserve_space(xdr, 4);
1186 switch (arg->open_flags & O_CREAT) { 1162 switch (arg->open_flags & O_CREAT) {
1187 case 0: 1163 case 0:
1188 WRITE32(NFS4_OPEN_NOCREATE); 1164 *p = cpu_to_be32(NFS4_OPEN_NOCREATE);
1189 break; 1165 break;
1190 default: 1166 default:
1191 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); 1167 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1192 WRITE32(NFS4_OPEN_CREATE); 1168 *p = cpu_to_be32(NFS4_OPEN_CREATE);
1193 encode_createmode(xdr, arg); 1169 encode_createmode(xdr, arg);
1194 } 1170 }
1195} 1171}
@@ -1198,16 +1174,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
1198{ 1174{
1199 __be32 *p; 1175 __be32 *p;
1200 1176
1201 RESERVE_SPACE(4); 1177 p = reserve_space(xdr, 4);
1202 switch (delegation_type) { 1178 switch (delegation_type) {
1203 case 0: 1179 case 0:
1204 WRITE32(NFS4_OPEN_DELEGATE_NONE); 1180 *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
1205 break; 1181 break;
1206 case FMODE_READ: 1182 case FMODE_READ:
1207 WRITE32(NFS4_OPEN_DELEGATE_READ); 1183 *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
1208 break; 1184 break;
1209 case FMODE_WRITE|FMODE_READ: 1185 case FMODE_WRITE|FMODE_READ:
1210 WRITE32(NFS4_OPEN_DELEGATE_WRITE); 1186 *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
1211 break; 1187 break;
1212 default: 1188 default:
1213 BUG(); 1189 BUG();
@@ -1218,8 +1194,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
1218{ 1194{
1219 __be32 *p; 1195 __be32 *p;
1220 1196
1221 RESERVE_SPACE(4); 1197 p = reserve_space(xdr, 4);
1222 WRITE32(NFS4_OPEN_CLAIM_NULL); 1198 *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
1223 encode_string(xdr, name->len, name->name); 1199 encode_string(xdr, name->len, name->name);
1224} 1200}
1225 1201
@@ -1227,8 +1203,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
1227{ 1203{
1228 __be32 *p; 1204 __be32 *p;
1229 1205
1230 RESERVE_SPACE(4); 1206 p = reserve_space(xdr, 4);
1231 WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); 1207 *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
1232 encode_delegation_type(xdr, type); 1208 encode_delegation_type(xdr, type);
1233} 1209}
1234 1210
@@ -1236,9 +1212,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
1236{ 1212{
1237 __be32 *p; 1213 __be32 *p;
1238 1214
1239 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1215 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1240 WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); 1216 *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
1241 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1217 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
1242 encode_string(xdr, name->len, name->name); 1218 encode_string(xdr, name->len, name->name);
1243} 1219}
1244 1220
@@ -1267,10 +1243,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
1267{ 1243{
1268 __be32 *p; 1244 __be32 *p;
1269 1245
1270 RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); 1246 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
1271 WRITE32(OP_OPEN_CONFIRM); 1247 *p++ = cpu_to_be32(OP_OPEN_CONFIRM);
1272 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1248 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1273 WRITE32(arg->seqid->sequence->counter); 1249 *p = cpu_to_be32(arg->seqid->sequence->counter);
1274 hdr->nops++; 1250 hdr->nops++;
1275 hdr->replen += decode_open_confirm_maxsz; 1251 hdr->replen += decode_open_confirm_maxsz;
1276} 1252}
@@ -1279,10 +1255,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
1279{ 1255{
1280 __be32 *p; 1256 __be32 *p;
1281 1257
1282 RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); 1258 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
1283 WRITE32(OP_OPEN_DOWNGRADE); 1259 *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
1284 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1260 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1285 WRITE32(arg->seqid->sequence->counter); 1261 *p = cpu_to_be32(arg->seqid->sequence->counter);
1286 encode_share_access(xdr, arg->fmode); 1262 encode_share_access(xdr, arg->fmode);
1287 hdr->nops++; 1263 hdr->nops++;
1288 hdr->replen += decode_open_downgrade_maxsz; 1264 hdr->replen += decode_open_downgrade_maxsz;
@@ -1294,10 +1270,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
1294 int len = fh->size; 1270 int len = fh->size;
1295 __be32 *p; 1271 __be32 *p;
1296 1272
1297 RESERVE_SPACE(8 + len); 1273 p = reserve_space(xdr, 8 + len);
1298 WRITE32(OP_PUTFH); 1274 *p++ = cpu_to_be32(OP_PUTFH);
1299 WRITE32(len); 1275 xdr_encode_opaque(p, fh->data, len);
1300 WRITEMEM(fh->data, len);
1301 hdr->nops++; 1276 hdr->nops++;
1302 hdr->replen += decode_putfh_maxsz; 1277 hdr->replen += decode_putfh_maxsz;
1303} 1278}
@@ -1306,8 +1281,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1306{ 1281{
1307 __be32 *p; 1282 __be32 *p;
1308 1283
1309 RESERVE_SPACE(4); 1284 p = reserve_space(xdr, 4);
1310 WRITE32(OP_PUTROOTFH); 1285 *p = cpu_to_be32(OP_PUTROOTFH);
1311 hdr->nops++; 1286 hdr->nops++;
1312 hdr->replen += decode_putrootfh_maxsz; 1287 hdr->replen += decode_putrootfh_maxsz;
1313} 1288}
@@ -1317,26 +1292,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1317 nfs4_stateid stateid; 1292 nfs4_stateid stateid;
1318 __be32 *p; 1293 __be32 *p;
1319 1294
1320 RESERVE_SPACE(NFS4_STATEID_SIZE); 1295 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1321 if (ctx->state != NULL) { 1296 if (ctx->state != NULL) {
1322 nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); 1297 nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
1323 WRITEMEM(stateid.data, NFS4_STATEID_SIZE); 1298 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1324 } else 1299 } else
1325 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1300 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
1326} 1301}
1327 1302
1328static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) 1303static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
1329{ 1304{
1330 __be32 *p; 1305 __be32 *p;
1331 1306
1332 RESERVE_SPACE(4); 1307 p = reserve_space(xdr, 4);
1333 WRITE32(OP_READ); 1308 *p = cpu_to_be32(OP_READ);
1334 1309
1335 encode_stateid(xdr, args->context); 1310 encode_stateid(xdr, args->context);
1336 1311
1337 RESERVE_SPACE(12); 1312 p = reserve_space(xdr, 12);
1338 WRITE64(args->offset); 1313 p = xdr_encode_hyper(p, args->offset);
1339 WRITE32(args->count); 1314 *p = cpu_to_be32(args->count);
1340 hdr->nops++; 1315 hdr->nops++;
1341 hdr->replen += decode_read_maxsz; 1316 hdr->replen += decode_read_maxsz;
1342} 1317}
@@ -1349,20 +1324,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1349 }; 1324 };
1350 __be32 *p; 1325 __be32 *p;
1351 1326
1352 RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); 1327 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
1353 WRITE32(OP_READDIR); 1328 *p++ = cpu_to_be32(OP_READDIR);
1354 WRITE64(readdir->cookie); 1329 p = xdr_encode_hyper(p, readdir->cookie);
1355 WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); 1330 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
1356 WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ 1331 *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */
1357 WRITE32(readdir->count); 1332 *p++ = cpu_to_be32(readdir->count);
1358 WRITE32(2); 1333 *p++ = cpu_to_be32(2);
1359 /* Switch to mounted_on_fileid if the server supports it */ 1334 /* Switch to mounted_on_fileid if the server supports it */
1360 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) 1335 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
1361 attrs[0] &= ~FATTR4_WORD0_FILEID; 1336 attrs[0] &= ~FATTR4_WORD0_FILEID;
1362 else 1337 else
1363 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 1338 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1364 WRITE32(attrs[0] & readdir->bitmask[0]); 1339 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1365 WRITE32(attrs[1] & readdir->bitmask[1]); 1340 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1366 hdr->nops++; 1341 hdr->nops++;
1367 hdr->replen += decode_readdir_maxsz; 1342 hdr->replen += decode_readdir_maxsz;
1368 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1343 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1378,8 +1353,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
1378{ 1353{
1379 __be32 *p; 1354 __be32 *p;
1380 1355
1381 RESERVE_SPACE(4); 1356 p = reserve_space(xdr, 4);
1382 WRITE32(OP_READLINK); 1357 *p = cpu_to_be32(OP_READLINK);
1383 hdr->nops++; 1358 hdr->nops++;
1384 hdr->replen += decode_readlink_maxsz; 1359 hdr->replen += decode_readlink_maxsz;
1385} 1360}
@@ -1388,10 +1363,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
1388{ 1363{
1389 __be32 *p; 1364 __be32 *p;
1390 1365
1391 RESERVE_SPACE(8 + name->len); 1366 p = reserve_space(xdr, 8 + name->len);
1392 WRITE32(OP_REMOVE); 1367 *p++ = cpu_to_be32(OP_REMOVE);
1393 WRITE32(name->len); 1368 xdr_encode_opaque(p, name->name, name->len);
1394 WRITEMEM(name->name, name->len);
1395 hdr->nops++; 1369 hdr->nops++;
1396 hdr->replen += decode_remove_maxsz; 1370 hdr->replen += decode_remove_maxsz;
1397} 1371}
@@ -1400,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
1400{ 1374{
1401 __be32 *p; 1375 __be32 *p;
1402 1376
1403 RESERVE_SPACE(8 + oldname->len); 1377 p = reserve_space(xdr, 4);
1404 WRITE32(OP_RENAME); 1378 *p = cpu_to_be32(OP_RENAME);
1405 WRITE32(oldname->len); 1379 encode_string(xdr, oldname->len, oldname->name);
1406 WRITEMEM(oldname->name, oldname->len); 1380 encode_string(xdr, newname->len, newname->name);
1407
1408 RESERVE_SPACE(4 + newname->len);
1409 WRITE32(newname->len);
1410 WRITEMEM(newname->name, newname->len);
1411 hdr->nops++; 1381 hdr->nops++;
1412 hdr->replen += decode_rename_maxsz; 1382 hdr->replen += decode_rename_maxsz;
1413} 1383}
@@ -1416,9 +1386,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
1416{ 1386{
1417 __be32 *p; 1387 __be32 *p;
1418 1388
1419 RESERVE_SPACE(12); 1389 p = reserve_space(xdr, 12);
1420 WRITE32(OP_RENEW); 1390 *p++ = cpu_to_be32(OP_RENEW);
1421 WRITE64(client_stateid->cl_clientid); 1391 xdr_encode_hyper(p, client_stateid->cl_clientid);
1422 hdr->nops++; 1392 hdr->nops++;
1423 hdr->replen += decode_renew_maxsz; 1393 hdr->replen += decode_renew_maxsz;
1424} 1394}
@@ -1428,8 +1398,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1428{ 1398{
1429 __be32 *p; 1399 __be32 *p;
1430 1400
1431 RESERVE_SPACE(4); 1401 p = reserve_space(xdr, 4);
1432 WRITE32(OP_RESTOREFH); 1402 *p = cpu_to_be32(OP_RESTOREFH);
1433 hdr->nops++; 1403 hdr->nops++;
1434 hdr->replen += decode_restorefh_maxsz; 1404 hdr->replen += decode_restorefh_maxsz;
1435} 1405}
@@ -1439,16 +1409,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1439{ 1409{
1440 __be32 *p; 1410 __be32 *p;
1441 1411
1442 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1412 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1443 WRITE32(OP_SETATTR); 1413 *p++ = cpu_to_be32(OP_SETATTR);
1444 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1414 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
1445 RESERVE_SPACE(2*4); 1415 p = reserve_space(xdr, 2*4);
1446 WRITE32(1); 1416 *p++ = cpu_to_be32(1);
1447 WRITE32(FATTR4_WORD0_ACL); 1417 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1448 if (arg->acl_len % 4) 1418 if (arg->acl_len % 4)
1449 return -EINVAL; 1419 return -EINVAL;
1450 RESERVE_SPACE(4); 1420 p = reserve_space(xdr, 4);
1451 WRITE32(arg->acl_len); 1421 *p = cpu_to_be32(arg->acl_len);
1452 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1422 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1453 hdr->nops++; 1423 hdr->nops++;
1454 hdr->replen += decode_setacl_maxsz; 1424 hdr->replen += decode_setacl_maxsz;
@@ -1460,8 +1430,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1460{ 1430{
1461 __be32 *p; 1431 __be32 *p;
1462 1432
1463 RESERVE_SPACE(4); 1433 p = reserve_space(xdr, 4);
1464 WRITE32(OP_SAVEFH); 1434 *p = cpu_to_be32(OP_SAVEFH);
1465 hdr->nops++; 1435 hdr->nops++;
1466 hdr->replen += decode_savefh_maxsz; 1436 hdr->replen += decode_savefh_maxsz;
1467} 1437}
@@ -1470,9 +1440,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
1470{ 1440{
1471 __be32 *p; 1441 __be32 *p;
1472 1442
1473 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1443 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1474 WRITE32(OP_SETATTR); 1444 *p++ = cpu_to_be32(OP_SETATTR);
1475 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); 1445 xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
1476 hdr->nops++; 1446 hdr->nops++;
1477 hdr->replen += decode_setattr_maxsz; 1447 hdr->replen += decode_setattr_maxsz;
1478 encode_attrs(xdr, arg->iap, server); 1448 encode_attrs(xdr, arg->iap, server);
@@ -1482,17 +1452,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1482{ 1452{
1483 __be32 *p; 1453 __be32 *p;
1484 1454
1485 RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); 1455 p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
1486 WRITE32(OP_SETCLIENTID); 1456 *p++ = cpu_to_be32(OP_SETCLIENTID);
1487 WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); 1457 xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
1488 1458
1489 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); 1459 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
1490 RESERVE_SPACE(4); 1460 p = reserve_space(xdr, 4);
1491 WRITE32(setclientid->sc_prog); 1461 *p = cpu_to_be32(setclientid->sc_prog);
1492 encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); 1462 encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
1493 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); 1463 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
1494 RESERVE_SPACE(4); 1464 p = reserve_space(xdr, 4);
1495 WRITE32(setclientid->sc_cb_ident); 1465 *p = cpu_to_be32(setclientid->sc_cb_ident);
1496 hdr->nops++; 1466 hdr->nops++;
1497 hdr->replen += decode_setclientid_maxsz; 1467 hdr->replen += decode_setclientid_maxsz;
1498} 1468}
@@ -1501,10 +1471,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
1501{ 1471{
1502 __be32 *p; 1472 __be32 *p;
1503 1473
1504 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); 1474 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
1505 WRITE32(OP_SETCLIENTID_CONFIRM); 1475 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
1506 WRITE64(client_state->cl_clientid); 1476 p = xdr_encode_hyper(p, client_state->cl_clientid);
1507 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1477 xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1508 hdr->nops++; 1478 hdr->nops++;
1509 hdr->replen += decode_setclientid_confirm_maxsz; 1479 hdr->replen += decode_setclientid_confirm_maxsz;
1510} 1480}
@@ -1513,15 +1483,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1513{ 1483{
1514 __be32 *p; 1484 __be32 *p;
1515 1485
1516 RESERVE_SPACE(4); 1486 p = reserve_space(xdr, 4);
1517 WRITE32(OP_WRITE); 1487 *p = cpu_to_be32(OP_WRITE);
1518 1488
1519 encode_stateid(xdr, args->context); 1489 encode_stateid(xdr, args->context);
1520 1490
1521 RESERVE_SPACE(16); 1491 p = reserve_space(xdr, 16);
1522 WRITE64(args->offset); 1492 p = xdr_encode_hyper(p, args->offset);
1523 WRITE32(args->stable); 1493 *p++ = cpu_to_be32(args->stable);
1524 WRITE32(args->count); 1494 *p = cpu_to_be32(args->count);
1525 1495
1526 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1496 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1527 hdr->nops++; 1497 hdr->nops++;
@@ -1532,10 +1502,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
1532{ 1502{
1533 __be32 *p; 1503 __be32 *p;
1534 1504
1535 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1505 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1536 1506
1537 WRITE32(OP_DELEGRETURN); 1507 *p++ = cpu_to_be32(OP_DELEGRETURN);
1538 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1508 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
1539 hdr->nops++; 1509 hdr->nops++;
1540 hdr->replen += decode_delegreturn_maxsz; 1510 hdr->replen += decode_delegreturn_maxsz;
1541} 1511}
@@ -1548,16 +1518,16 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1548{ 1518{
1549 __be32 *p; 1519 __be32 *p;
1550 1520
1551 RESERVE_SPACE(4 + sizeof(args->verifier->data)); 1521 p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
1552 WRITE32(OP_EXCHANGE_ID); 1522 *p++ = cpu_to_be32(OP_EXCHANGE_ID);
1553 WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); 1523 xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
1554 1524
1555 encode_string(xdr, args->id_len, args->id); 1525 encode_string(xdr, args->id_len, args->id);
1556 1526
1557 RESERVE_SPACE(12); 1527 p = reserve_space(xdr, 12);
1558 WRITE32(args->flags); 1528 *p++ = cpu_to_be32(args->flags);
1559 WRITE32(0); /* zero length state_protect4_a */ 1529 *p++ = cpu_to_be32(0); /* zero length state_protect4_a */
1560 WRITE32(0); /* zero length implementation id array */ 1530 *p = cpu_to_be32(0); /* zero length implementation id array */
1561 hdr->nops++; 1531 hdr->nops++;
1562 hdr->replen += decode_exchange_id_maxsz; 1532 hdr->replen += decode_exchange_id_maxsz;
1563} 1533}
@@ -1571,55 +1541,43 @@ static void encode_create_session(struct xdr_stream *xdr,
1571 uint32_t len; 1541 uint32_t len;
1572 struct nfs_client *clp = args->client; 1542 struct nfs_client *clp = args->client;
1573 1543
1574 RESERVE_SPACE(4); 1544 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1575 WRITE32(OP_CREATE_SESSION); 1545 clp->cl_ipaddr);
1576
1577 RESERVE_SPACE(8);
1578 WRITE64(clp->cl_ex_clid);
1579 1546
1580 RESERVE_SPACE(8); 1547 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
1581 WRITE32(clp->cl_seqid); /*Sequence id */ 1548 *p++ = cpu_to_be32(OP_CREATE_SESSION);
1582 WRITE32(args->flags); /*flags */ 1549 p = xdr_encode_hyper(p, clp->cl_ex_clid);
1550 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1551 *p++ = cpu_to_be32(args->flags); /*flags */
1583 1552
1584 RESERVE_SPACE(2*28); /* 2 channel_attrs */
1585 /* Fore Channel */ 1553 /* Fore Channel */
1586 WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ 1554 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1587 WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ 1555 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
1588 WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ 1556 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
1589 WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1557 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1590 WRITE32(args->fc_attrs.max_ops); /* max operations */ 1558 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
1591 WRITE32(args->fc_attrs.max_reqs); /* max requests */ 1559 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
1592 WRITE32(0); /* rdmachannel_attrs */ 1560 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
1593 1561
1594 /* Back Channel */ 1562 /* Back Channel */
1595 WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ 1563 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1596 WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ 1564 *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */
1597 WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ 1565 *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */
1598 WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1566 *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1599 WRITE32(args->bc_attrs.max_ops); /* max operations */ 1567 *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */
1600 WRITE32(args->bc_attrs.max_reqs); /* max requests */ 1568 *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */
1601 WRITE32(0); /* rdmachannel_attrs */ 1569 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
1602 1570
1603 RESERVE_SPACE(4); 1571 *p++ = cpu_to_be32(args->cb_program); /* cb_program */
1604 WRITE32(args->cb_program); /* cb_program */ 1572 *p++ = cpu_to_be32(1);
1605 1573 *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
1606 RESERVE_SPACE(4); /* # of security flavors */
1607 WRITE32(1);
1608
1609 RESERVE_SPACE(4);
1610 WRITE32(RPC_AUTH_UNIX); /* auth_sys */
1611 1574
1612 /* authsys_parms rfc1831 */ 1575 /* authsys_parms rfc1831 */
1613 RESERVE_SPACE(4); 1576 *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */
1614 WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ 1577 p = xdr_encode_opaque(p, machine_name, len);
1615 len = scnprintf(machine_name, sizeof(machine_name), "%s", 1578 *p++ = cpu_to_be32(0); /* UID */
1616 clp->cl_ipaddr); 1579 *p++ = cpu_to_be32(0); /* GID */
1617 RESERVE_SPACE(16 + len); 1580 *p = cpu_to_be32(0); /* No more gids */
1618 WRITE32(len);
1619 WRITEMEM(machine_name, len);
1620 WRITE32(0); /* UID */
1621 WRITE32(0); /* GID */
1622 WRITE32(0); /* No more gids */
1623 hdr->nops++; 1581 hdr->nops++;
1624 hdr->replen += decode_create_session_maxsz; 1582 hdr->replen += decode_create_session_maxsz;
1625} 1583}
@@ -1629,9 +1587,9 @@ static void encode_destroy_session(struct xdr_stream *xdr,
1629 struct compound_hdr *hdr) 1587 struct compound_hdr *hdr)
1630{ 1588{
1631 __be32 *p; 1589 __be32 *p;
1632 RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); 1590 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
1633 WRITE32(OP_DESTROY_SESSION); 1591 *p++ = cpu_to_be32(OP_DESTROY_SESSION);
1634 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1592 xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1635 hdr->nops++; 1593 hdr->nops++;
1636 hdr->replen += decode_destroy_session_maxsz; 1594 hdr->replen += decode_destroy_session_maxsz;
1637} 1595}
@@ -1655,8 +1613,8 @@ static void encode_sequence(struct xdr_stream *xdr,
1655 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); 1613 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1656 slot = tp->slots + args->sa_slotid; 1614 slot = tp->slots + args->sa_slotid;
1657 1615
1658 RESERVE_SPACE(4); 1616 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16);
1659 WRITE32(OP_SEQUENCE); 1617 *p++ = cpu_to_be32(OP_SEQUENCE);
1660 1618
1661 /* 1619 /*
1662 * Sessionid + seqid + slotid + max slotid + cache_this 1620 * Sessionid + seqid + slotid + max slotid + cache_this
@@ -1670,12 +1628,11 @@ static void encode_sequence(struct xdr_stream *xdr,
1670 ((u32 *)session->sess_id.data)[3], 1628 ((u32 *)session->sess_id.data)[3],
1671 slot->seq_nr, args->sa_slotid, 1629 slot->seq_nr, args->sa_slotid,
1672 tp->highest_used_slotid, args->sa_cache_this); 1630 tp->highest_used_slotid, args->sa_cache_this);
1673 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); 1631 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1674 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1632 *p++ = cpu_to_be32(slot->seq_nr);
1675 WRITE32(slot->seq_nr); 1633 *p++ = cpu_to_be32(args->sa_slotid);
1676 WRITE32(args->sa_slotid); 1634 *p++ = cpu_to_be32(tp->highest_used_slotid);
1677 WRITE32(tp->highest_used_slotid); 1635 *p = cpu_to_be32(args->sa_cache_this);
1678 WRITE32(args->sa_cache_this);
1679 hdr->nops++; 1636 hdr->nops++;
1680 hdr->replen += decode_sequence_maxsz; 1637 hdr->replen += decode_sequence_maxsz;
1681#endif /* CONFIG_NFS_V4_1 */ 1638#endif /* CONFIG_NFS_V4_1 */
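
From this point the patch converts the decode side. The deleted READ_BUF()/READ32()/READ64()/COPYMEM() macros returned -EIO from inside the macro body; the replacement spells the control flow out: xdr_inline_decode() returns NULL when fewer bytes than requested remain in the receive buffer, and each converted decoder jumps to a local out_overflow label that logs through the new print_overflow_msg() helper and returns -EIO. The canonical shape of a converted decoder, as a sketch:

	static int decode_word(struct xdr_stream *xdr, u32 *val)
	{
		__be32 *p;

		p = xdr_inline_decode(xdr, 4);	/* NULL if the buffer is short */
		if (unlikely(!p))
			goto out_overflow;
		*val = be32_to_cpup(p);
		return 0;
	out_overflow:
		print_overflow_msg(__func__, xdr);
		return -EIO;
	}
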
@@ -2466,68 +2423,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
2466} 2423}
2467#endif /* CONFIG_NFS_V4_1 */ 2424#endif /* CONFIG_NFS_V4_1 */
2468 2425
2469/* 2426static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
2470 * START OF "GENERIC" DECODE ROUTINES. 2427{
2471 * These may look a little ugly since they are imported from a "generic" 2428 dprintk("nfs: %s: prematurely hit end of receive buffer. "
2472 * set of XDR encode/decode routines which are intended to be shared by 2429 "Remaining buffer length is %tu words.\n",
2473 * all of our NFSv4 implementations (OpenBSD, MacOS X...). 2430 func, xdr->end - xdr->p);
2474 * 2431}
2475 * If the pain of reading these is too great, it should be a straightforward
2476 * task to translate them into Linux-specific versions which are more
2477 * consistent with the style used in NFSv2/v3...
2478 */
2479#define READ32(x) (x) = ntohl(*p++)
2480#define READ64(x) do { \
2481 (x) = (u64)ntohl(*p++) << 32; \
2482 (x) |= ntohl(*p++); \
2483} while (0)
2484#define READTIME(x) do { \
2485 p++; \
2486 (x.tv_sec) = ntohl(*p++); \
2487 (x.tv_nsec) = ntohl(*p++); \
2488} while (0)
2489#define COPYMEM(x,nbytes) do { \
2490 memcpy((x), p, nbytes); \
2491 p += XDR_QUADLEN(nbytes); \
2492} while (0)
2493
2494#define READ_BUF(nbytes) do { \
2495 p = xdr_inline_decode(xdr, nbytes); \
2496 if (unlikely(!p)) { \
2497 dprintk("nfs: %s: prematurely hit end of receive" \
2498 " buffer\n", __func__); \
2499 dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
2500 __func__, xdr->p, nbytes, xdr->end); \
2501 return -EIO; \
2502 } \
2503} while (0)
2504 2432
2505static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) 2433static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
2506{ 2434{
2507 __be32 *p; 2435 __be32 *p;
2508 2436
2509 READ_BUF(4); 2437 p = xdr_inline_decode(xdr, 4);
2510 READ32(*len); 2438 if (unlikely(!p))
2511 READ_BUF(*len); 2439 goto out_overflow;
2440 *len = be32_to_cpup(p);
2441 p = xdr_inline_decode(xdr, *len);
2442 if (unlikely(!p))
2443 goto out_overflow;
2512 *string = (char *)p; 2444 *string = (char *)p;
2513 return 0; 2445 return 0;
2446out_overflow:
2447 print_overflow_msg(__func__, xdr);
2448 return -EIO;
2514} 2449}
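
decode_opaque_inline() returns a pointer into the RPC receive buffer rather than copying the string, so the result is only valid for as long as the underlying buffer is. A hypothetical caller, to make the borrow explicit:

	unsigned int len;
	char *name;

	if (decode_opaque_inline(xdr, &len, &name) == 0)
		dprintk("name=%.*s\n", (int)len, name);	/* bounded: no NUL terminator */
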
2515 2450
2516static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 2451static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2517{ 2452{
2518 __be32 *p; 2453 __be32 *p;
2519 2454
2520 READ_BUF(8); 2455 p = xdr_inline_decode(xdr, 8);
2521 READ32(hdr->status); 2456 if (unlikely(!p))
2522 READ32(hdr->taglen); 2457 goto out_overflow;
2458 hdr->status = be32_to_cpup(p++);
2459 hdr->taglen = be32_to_cpup(p);
2523 2460
2524 READ_BUF(hdr->taglen + 4); 2461 p = xdr_inline_decode(xdr, hdr->taglen + 4);
2462 if (unlikely(!p))
2463 goto out_overflow;
2525 hdr->tag = (char *)p; 2464 hdr->tag = (char *)p;
2526 p += XDR_QUADLEN(hdr->taglen); 2465 p += XDR_QUADLEN(hdr->taglen);
2527 READ32(hdr->nops); 2466 hdr->nops = be32_to_cpup(p);
2528 if (unlikely(hdr->nops < 1)) 2467 if (unlikely(hdr->nops < 1))
2529 return nfs4_stat_to_errno(hdr->status); 2468 return nfs4_stat_to_errno(hdr->status);
2530 return 0; 2469 return 0;
2470out_overflow:
2471 print_overflow_msg(__func__, xdr);
2472 return -EIO;
2531} 2473}
2532 2474
2533static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 2475static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
@@ -2536,18 +2478,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2536 uint32_t opnum; 2478 uint32_t opnum;
2537 int32_t nfserr; 2479 int32_t nfserr;
2538 2480
2539 READ_BUF(8); 2481 p = xdr_inline_decode(xdr, 8);
2540 READ32(opnum); 2482 if (unlikely(!p))
2483 goto out_overflow;
2484 opnum = be32_to_cpup(p++);
2541 if (opnum != expected) { 2485 if (opnum != expected) {
2542 dprintk("nfs: Server returned operation" 2486 dprintk("nfs: Server returned operation"
2543 " %d but we issued a request for %d\n", 2487 " %d but we issued a request for %d\n",
2544 opnum, expected); 2488 opnum, expected);
2545 return -EIO; 2489 return -EIO;
2546 } 2490 }
2547 READ32(nfserr); 2491 nfserr = be32_to_cpup(p);
2548 if (nfserr != NFS_OK) 2492 if (nfserr != NFS_OK)
2549 return nfs4_stat_to_errno(nfserr); 2493 return nfs4_stat_to_errno(nfserr);
2550 return 0; 2494 return 0;
2495out_overflow:
2496 print_overflow_msg(__func__, xdr);
2497 return -EIO;
2551} 2498}
2552 2499
2553/* Dummy routine */ 2500/* Dummy routine */
@@ -2557,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
2557 unsigned int strlen; 2504 unsigned int strlen;
2558 char *str; 2505 char *str;
2559 2506
2560 READ_BUF(12); 2507 p = xdr_inline_decode(xdr, 12);
2561 return decode_opaque_inline(xdr, &strlen, &str); 2508 if (likely(p))
2509 return decode_opaque_inline(xdr, &strlen, &str);
2510 print_overflow_msg(__func__, xdr);
2511 return -EIO;
2562} 2512}
2563 2513
2564static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) 2514static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -2566,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2566 uint32_t bmlen; 2516 uint32_t bmlen;
2567 __be32 *p; 2517 __be32 *p;
2568 2518
2569 READ_BUF(4); 2519 p = xdr_inline_decode(xdr, 4);
2570 READ32(bmlen); 2520 if (unlikely(!p))
2521 goto out_overflow;
2522 bmlen = be32_to_cpup(p);
2571 2523
2572 bitmap[0] = bitmap[1] = 0; 2524 bitmap[0] = bitmap[1] = 0;
2573 READ_BUF((bmlen << 2)); 2525 p = xdr_inline_decode(xdr, (bmlen << 2));
2526 if (unlikely(!p))
2527 goto out_overflow;
2574 if (bmlen > 0) { 2528 if (bmlen > 0) {
2575 READ32(bitmap[0]); 2529 bitmap[0] = be32_to_cpup(p++);
2576 if (bmlen > 1) 2530 if (bmlen > 1)
2577 READ32(bitmap[1]); 2531 bitmap[1] = be32_to_cpup(p);
2578 } 2532 }
2579 return 0; 2533 return 0;
2534out_overflow:
2535 print_overflow_msg(__func__, xdr);
2536 return -EIO;
2580} 2537}
2581 2538
2582static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) 2539static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
2583{ 2540{
2584 __be32 *p; 2541 __be32 *p;
2585 2542
2586 READ_BUF(4); 2543 p = xdr_inline_decode(xdr, 4);
2587 READ32(*attrlen); 2544 if (unlikely(!p))
2545 goto out_overflow;
2546 *attrlen = be32_to_cpup(p);
2588 *savep = xdr->p; 2547 *savep = xdr->p;
2589 return 0; 2548 return 0;
2549out_overflow:
2550 print_overflow_msg(__func__, xdr);
2551 return -EIO;
2590} 2552}
2591 2553
2592static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) 2554static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -2609,8 +2571,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2609 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) 2571 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
2610 return -EIO; 2572 return -EIO;
2611 if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { 2573 if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
2612 READ_BUF(4); 2574 p = xdr_inline_decode(xdr, 4);
2613 READ32(*type); 2575 if (unlikely(!p))
2576 goto out_overflow;
2577 *type = be32_to_cpup(p);
2614 if (*type < NF4REG || *type > NF4NAMEDATTR) { 2578 if (*type < NF4REG || *type > NF4NAMEDATTR) {
2615 dprintk("%s: bad type %d\n", __func__, *type); 2579 dprintk("%s: bad type %d\n", __func__, *type);
2616 return -EIO; 2580 return -EIO;
@@ -2620,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2620 } 2584 }
2621 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); 2585 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
2622 return ret; 2586 return ret;
2587out_overflow:
2588 print_overflow_msg(__func__, xdr);
2589 return -EIO;
2623} 2590}
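
Each decode_attr_*() helper below follows the same three-part contract, which the conversion now makes explicit: fail with -EIO if any lower-numbered attribute bit is still set (the server must emit attributes in bitmask order), decode the value only when this attribute's bit is present, then clear the bit to mark the attribute consumed. Folded into a single hypothetical helper for a 32-bit attribute:

	static int decode_attr_u32(struct xdr_stream *xdr, uint32_t *bitmap,
				   uint32_t bit, uint32_t *res)
	{
		__be32 *p;

		if (unlikely(bitmap[0] & (bit - 1U)))	/* out-of-order attribute */
			return -EIO;
		if (likely(bitmap[0] & bit)) {
			p = xdr_inline_decode(xdr, 4);
			if (unlikely(!p))
				goto out_overflow;
			*res = be32_to_cpup(p);
			bitmap[0] &= ~bit;		/* mark it consumed */
		}
		return 0;
	out_overflow:
		print_overflow_msg(__func__, xdr);
		return -EIO;
	}
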
2624 2591
2625static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2592static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -2631,14 +2598,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2631 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2598 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
2632 return -EIO; 2599 return -EIO;
2633 if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { 2600 if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
2634 READ_BUF(8); 2601 p = xdr_inline_decode(xdr, 8);
2635 READ64(*change); 2602 if (unlikely(!p))
2603 goto out_overflow;
2604 xdr_decode_hyper(p, change);
2636 bitmap[0] &= ~FATTR4_WORD0_CHANGE; 2605 bitmap[0] &= ~FATTR4_WORD0_CHANGE;
2637 ret = NFS_ATTR_FATTR_CHANGE; 2606 ret = NFS_ATTR_FATTR_CHANGE;
2638 } 2607 }
2639 dprintk("%s: change attribute=%Lu\n", __func__, 2608 dprintk("%s: change attribute=%Lu\n", __func__,
2640 (unsigned long long)*change); 2609 (unsigned long long)*change);
2641 return ret; 2610 return ret;
2611out_overflow:
2612 print_overflow_msg(__func__, xdr);
2613 return -EIO;
2642} 2614}
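
64-bit attributes swap the deleted READ64 macro for xdr_decode_hyper(), which folds the two big-endian words into a u64 and returns the pointer advanced past them, letting consecutive hypers chain as in decode_attr_fsid() below. Its behaviour corresponds to this model (compare the READ64 definition removed above; this is not the kernel source):

	static inline __be32 *decode_hyper_model(__be32 *p, u64 *valp)
	{
		*valp = ((u64)be32_to_cpup(p) << 32) | be32_to_cpup(p + 1);
		return p + 2;			/* advance past both words */
	}
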
2643 2615
2644static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2616static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -2650,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2650 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2622 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
2651 return -EIO; 2623 return -EIO;
2652 if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { 2624 if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
2653 READ_BUF(8); 2625 p = xdr_inline_decode(xdr, 8);
2654 READ64(*size); 2626 if (unlikely(!p))
2627 goto out_overflow;
2628 xdr_decode_hyper(p, size);
2655 bitmap[0] &= ~FATTR4_WORD0_SIZE; 2629 bitmap[0] &= ~FATTR4_WORD0_SIZE;
2656 ret = NFS_ATTR_FATTR_SIZE; 2630 ret = NFS_ATTR_FATTR_SIZE;
2657 } 2631 }
2658 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); 2632 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
2659 return ret; 2633 return ret;
2634out_overflow:
2635 print_overflow_msg(__func__, xdr);
2636 return -EIO;
2660} 2637}
2661 2638
2662static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2639static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2667,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
2667 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) 2644 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
2668 return -EIO; 2645 return -EIO;
2669 if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { 2646 if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
2670 READ_BUF(4); 2647 p = xdr_inline_decode(xdr, 4);
2671 READ32(*res); 2648 if (unlikely(!p))
2649 goto out_overflow;
2650 *res = be32_to_cpup(p);
2672 bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; 2651 bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
2673 } 2652 }
2674 dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); 2653 dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
2675 return 0; 2654 return 0;
2655out_overflow:
2656 print_overflow_msg(__func__, xdr);
2657 return -EIO;
2676} 2658}
2677 2659
2678static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2660static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2683,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2683 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) 2665 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
2684 return -EIO; 2666 return -EIO;
2685 if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { 2667 if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
2686 READ_BUF(4); 2668 p = xdr_inline_decode(xdr, 4);
2687 READ32(*res); 2669 if (unlikely(!p))
2670 goto out_overflow;
2671 *res = be32_to_cpup(p);
2688 bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; 2672 bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
2689 } 2673 }
2690 dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); 2674 dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
2691 return 0; 2675 return 0;
2676out_overflow:
2677 print_overflow_msg(__func__, xdr);
2678 return -EIO;
2692} 2679}
2693 2680
2694static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2681static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -2701,9 +2688,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2701 if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) 2688 if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
2702 return -EIO; 2689 return -EIO;
2703 if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { 2690 if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
2704 READ_BUF(16); 2691 p = xdr_inline_decode(xdr, 16);
2705 READ64(fsid->major); 2692 if (unlikely(!p))
2706 READ64(fsid->minor); 2693 goto out_overflow;
2694 p = xdr_decode_hyper(p, &fsid->major);
2695 xdr_decode_hyper(p, &fsid->minor);
2707 bitmap[0] &= ~FATTR4_WORD0_FSID; 2696 bitmap[0] &= ~FATTR4_WORD0_FSID;
2708 ret = NFS_ATTR_FATTR_FSID; 2697 ret = NFS_ATTR_FATTR_FSID;
2709 } 2698 }
@@ -2711,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2711 (unsigned long long)fsid->major, 2700 (unsigned long long)fsid->major,
2712 (unsigned long long)fsid->minor); 2701 (unsigned long long)fsid->minor);
2713 return ret; 2702 return ret;
2703out_overflow:
2704 print_overflow_msg(__func__, xdr);
2705 return -EIO;
2714} 2706}
2715 2707
2716static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2708static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2721,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
2721 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) 2713 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
2722 return -EIO; 2714 return -EIO;
2723 if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { 2715 if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
2724 READ_BUF(4); 2716 p = xdr_inline_decode(xdr, 4);
2725 READ32(*res); 2717 if (unlikely(!p))
2718 goto out_overflow;
2719 *res = be32_to_cpup(p);
2726 bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; 2720 bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
2727 } 2721 }
2728 dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); 2722 dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
2729 return 0; 2723 return 0;
2724out_overflow:
2725 print_overflow_msg(__func__, xdr);
2726 return -EIO;
2730} 2727}
2731 2728
2732static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2729static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2737,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2737 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) 2734 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
2738 return -EIO; 2735 return -EIO;
2739 if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { 2736 if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
2740 READ_BUF(4); 2737 p = xdr_inline_decode(xdr, 4);
2741 READ32(*res); 2738 if (unlikely(!p))
2739 goto out_overflow;
2740 *res = be32_to_cpup(p);
2742 bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; 2741 bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
2743 } 2742 }
2744 dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); 2743 dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
2745 return 0; 2744 return 0;
2745out_overflow:
2746 print_overflow_msg(__func__, xdr);
2747 return -EIO;
2746} 2748}
2747 2749
2748static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2750static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2754,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2754 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2756 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
2755 return -EIO; 2757 return -EIO;
2756 if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { 2758 if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
2757 READ_BUF(8); 2759 p = xdr_inline_decode(xdr, 8);
2758 READ64(*fileid); 2760 if (unlikely(!p))
2761 goto out_overflow;
2762 xdr_decode_hyper(p, fileid);
2759 bitmap[0] &= ~FATTR4_WORD0_FILEID; 2763 bitmap[0] &= ~FATTR4_WORD0_FILEID;
2760 ret = NFS_ATTR_FATTR_FILEID; 2764 ret = NFS_ATTR_FATTR_FILEID;
2761 } 2765 }
2762 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2766 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2763 return ret; 2767 return ret;
2768out_overflow:
2769 print_overflow_msg(__func__, xdr);
2770 return -EIO;
2764} 2771}
2765 2772
2766static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2773static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2772,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2772 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2779 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
2773 return -EIO; 2780 return -EIO;
2774 if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { 2781 if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
2775 READ_BUF(8); 2782 p = xdr_inline_decode(xdr, 8);
2776 READ64(*fileid); 2783 if (unlikely(!p))
2784 goto out_overflow;
2785 xdr_decode_hyper(p, fileid);
2777 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 2786 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2778 ret = NFS_ATTR_FATTR_FILEID; 2787 ret = NFS_ATTR_FATTR_FILEID;
2779 } 2788 }
2780 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2789 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2781 return ret; 2790 return ret;
2791out_overflow:
2792 print_overflow_msg(__func__, xdr);
2793 return -EIO;
2782} 2794}
2783 2795
2784static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2796static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2790,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
2790 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) 2802 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
2791 return -EIO; 2803 return -EIO;
2792 if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { 2804 if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
2793 READ_BUF(8); 2805 p = xdr_inline_decode(xdr, 8);
2794 READ64(*res); 2806 if (unlikely(!p))
2807 goto out_overflow;
2808 xdr_decode_hyper(p, res);
2795 bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; 2809 bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
2796 } 2810 }
2797 dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); 2811 dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
2798 return status; 2812 return status;
2813out_overflow:
2814 print_overflow_msg(__func__, xdr);
2815 return -EIO;
2799} 2816}
2800 2817
2801static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2818static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2807,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
2807 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) 2824 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
2808 return -EIO; 2825 return -EIO;
2809 if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { 2826 if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
2810 READ_BUF(8); 2827 p = xdr_inline_decode(xdr, 8);
2811 READ64(*res); 2828 if (unlikely(!p))
2829 goto out_overflow;
2830 xdr_decode_hyper(p, res);
2812 bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; 2831 bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
2813 } 2832 }
2814 dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); 2833 dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
2815 return status; 2834 return status;
2835out_overflow:
2836 print_overflow_msg(__func__, xdr);
2837 return -EIO;
2816} 2838}
2817 2839
2818static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2840static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2824,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2824 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) 2846 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
2825 return -EIO; 2847 return -EIO;
2826 if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { 2848 if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
2827 READ_BUF(8); 2849 p = xdr_inline_decode(xdr, 8);
2828 READ64(*res); 2850 if (unlikely(!p))
2851 goto out_overflow;
2852 xdr_decode_hyper(p, res);
2829 bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; 2853 bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
2830 } 2854 }
2831 dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); 2855 dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
2832 return status; 2856 return status;
2857out_overflow:
2858 print_overflow_msg(__func__, xdr);
2859 return -EIO;
2833} 2860}
2834 2861
2835static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) 2862static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -2838,8 +2865,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
2838 __be32 *p; 2865 __be32 *p;
2839 int status = 0; 2866 int status = 0;
2840 2867
2841 READ_BUF(4); 2868 p = xdr_inline_decode(xdr, 4);
2842 READ32(n); 2869 if (unlikely(!p))
2870 goto out_overflow;
2871 n = be32_to_cpup(p);
2843 if (n == 0) 2872 if (n == 0)
2844 goto root_path; 2873 goto root_path;
2845 dprintk("path "); 2874 dprintk("path ");
@@ -2873,6 +2902,9 @@ out_eio:
2873 dprintk(" status %d", status); 2902 dprintk(" status %d", status);
2874 status = -EIO; 2903 status = -EIO;
2875 goto out; 2904 goto out;
2905out_overflow:
2906 print_overflow_msg(__func__, xdr);
2907 return -EIO;
2876} 2908}
2877 2909
2878static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) 2910static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -2890,8 +2922,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2890 status = decode_pathname(xdr, &res->fs_path); 2922 status = decode_pathname(xdr, &res->fs_path);
2891 if (unlikely(status != 0)) 2923 if (unlikely(status != 0))
2892 goto out; 2924 goto out;
2893 READ_BUF(4); 2925 p = xdr_inline_decode(xdr, 4);
2894 READ32(n); 2926 if (unlikely(!p))
2927 goto out_overflow;
2928 n = be32_to_cpup(p);
2895 if (n <= 0) 2929 if (n <= 0)
2896 goto out_eio; 2930 goto out_eio;
2897 res->nlocations = 0; 2931 res->nlocations = 0;
@@ -2899,8 +2933,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2899 u32 m; 2933 u32 m;
2900 struct nfs4_fs_location *loc = &res->locations[res->nlocations]; 2934 struct nfs4_fs_location *loc = &res->locations[res->nlocations];
2901 2935
2902 READ_BUF(4); 2936 p = xdr_inline_decode(xdr, 4);
2903 READ32(m); 2937 if (unlikely(!p))
2938 goto out_overflow;
2939 m = be32_to_cpup(p);
2904 2940
2905 loc->nservers = 0; 2941 loc->nservers = 0;
2906 dprintk("%s: servers ", __func__); 2942 dprintk("%s: servers ", __func__);
@@ -2939,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2939out: 2975out:
2940 dprintk("%s: fs_locations done, error = %d\n", __func__, status); 2976 dprintk("%s: fs_locations done, error = %d\n", __func__, status);
2941 return status; 2977 return status;
2978out_overflow:
2979 print_overflow_msg(__func__, xdr);
2942out_eio: 2980out_eio:
2943 status = -EIO; 2981 status = -EIO;
2944 goto out; 2982 goto out;
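
Note the label layout in decode_attr_fs_locations(): out_overflow deliberately falls through into out_eio, so a short buffer both logs the diagnostic and takes the same -EIO path through the common exit as a malformed locations list:

	out_overflow:
		print_overflow_msg(__func__, xdr);
		/* deliberate fallthrough */
	out_eio:
		status = -EIO;
		goto out;	/* single exit; the "fs_locations done" dprintk fires */
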
@@ -2953,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
2953 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) 2991 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
2954 return -EIO; 2992 return -EIO;
2955 if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { 2993 if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
2956 READ_BUF(8); 2994 p = xdr_inline_decode(xdr, 8);
2957 READ64(*res); 2995 if (unlikely(!p))
2996 goto out_overflow;
2997 xdr_decode_hyper(p, res);
2958 bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; 2998 bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
2959 } 2999 }
2960 dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); 3000 dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
2961 return status; 3001 return status;
3002out_overflow:
3003 print_overflow_msg(__func__, xdr);
3004 return -EIO;
2962} 3005}
2963 3006
2964static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) 3007static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -2970,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
2970 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) 3013 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
2971 return -EIO; 3014 return -EIO;
2972 if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { 3015 if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
2973 READ_BUF(4); 3016 p = xdr_inline_decode(xdr, 4);
2974 READ32(*maxlink); 3017 if (unlikely(!p))
3018 goto out_overflow;
3019 *maxlink = be32_to_cpup(p);
2975 bitmap[0] &= ~FATTR4_WORD0_MAXLINK; 3020 bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
2976 } 3021 }
2977 dprintk("%s: maxlink=%u\n", __func__, *maxlink); 3022 dprintk("%s: maxlink=%u\n", __func__, *maxlink);
2978 return status; 3023 return status;
3024out_overflow:
3025 print_overflow_msg(__func__, xdr);
3026 return -EIO;
2979} 3027}
2980 3028
2981static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) 3029static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -2987,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
2987 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) 3035 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
2988 return -EIO; 3036 return -EIO;
2989 if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { 3037 if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
2990 READ_BUF(4); 3038 p = xdr_inline_decode(xdr, 4);
2991 READ32(*maxname); 3039 if (unlikely(!p))
3040 goto out_overflow;
3041 *maxname = be32_to_cpup(p);
2992 bitmap[0] &= ~FATTR4_WORD0_MAXNAME; 3042 bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
2993 } 3043 }
2994 dprintk("%s: maxname=%u\n", __func__, *maxname); 3044 dprintk("%s: maxname=%u\n", __func__, *maxname);
2995 return status; 3045 return status;
3046out_overflow:
3047 print_overflow_msg(__func__, xdr);
3048 return -EIO;
2996} 3049}
2997 3050
2998static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 3051static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3005,8 +3058,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
3005 return -EIO; 3058 return -EIO;
3006 if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { 3059 if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
3007 uint64_t maxread; 3060 uint64_t maxread;
3008 READ_BUF(8); 3061 p = xdr_inline_decode(xdr, 8);
3009 READ64(maxread); 3062 if (unlikely(!p))
3063 goto out_overflow;
3064 xdr_decode_hyper(p, &maxread);
3010 if (maxread > 0x7FFFFFFF) 3065 if (maxread > 0x7FFFFFFF)
3011 maxread = 0x7FFFFFFF; 3066 maxread = 0x7FFFFFFF;
3012 *res = (uint32_t)maxread; 3067 *res = (uint32_t)maxread;
@@ -3014,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
3014 } 3069 }
3015 dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); 3070 dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
3016 return status; 3071 return status;
3072out_overflow:
3073 print_overflow_msg(__func__, xdr);
3074 return -EIO;
3017} 3075}
3018 3076
3019static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 3077static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3026,8 +3084,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
3026 return -EIO; 3084 return -EIO;
3027 if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { 3085 if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
3028 uint64_t maxwrite; 3086 uint64_t maxwrite;
3029 READ_BUF(8); 3087 p = xdr_inline_decode(xdr, 8);
3030 READ64(maxwrite); 3088 if (unlikely(!p))
3089 goto out_overflow;
3090 xdr_decode_hyper(p, &maxwrite);
3031 if (maxwrite > 0x7FFFFFFF) 3091 if (maxwrite > 0x7FFFFFFF)
3032 maxwrite = 0x7FFFFFFF; 3092 maxwrite = 0x7FFFFFFF;
3033 *res = (uint32_t)maxwrite; 3093 *res = (uint32_t)maxwrite;
@@ -3035,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
3035 } 3095 }
3036 dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); 3096 dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
3037 return status; 3097 return status;
3098out_overflow:
3099 print_overflow_msg(__func__, xdr);
3100 return -EIO;
3038} 3101}
3039 3102
3040static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) 3103static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -3047,14 +3110,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
3047 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) 3110 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
3048 return -EIO; 3111 return -EIO;
3049 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { 3112 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
3050 READ_BUF(4); 3113 p = xdr_inline_decode(xdr, 4);
3051 READ32(tmp); 3114 if (unlikely(!p))
3115 goto out_overflow;
3116 tmp = be32_to_cpup(p);
3052 *mode = tmp & ~S_IFMT; 3117 *mode = tmp & ~S_IFMT;
3053 bitmap[1] &= ~FATTR4_WORD1_MODE; 3118 bitmap[1] &= ~FATTR4_WORD1_MODE;
3054 ret = NFS_ATTR_FATTR_MODE; 3119 ret = NFS_ATTR_FATTR_MODE;
3055 } 3120 }
3056 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); 3121 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
3057 return ret; 3122 return ret;
3123out_overflow:
3124 print_overflow_msg(__func__, xdr);
3125 return -EIO;
3058} 3126}
3059 3127
3060static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) 3128static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3066,16 +3134,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
3066 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) 3134 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
3067 return -EIO; 3135 return -EIO;
3068 if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { 3136 if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
3069 READ_BUF(4); 3137 p = xdr_inline_decode(xdr, 4);
3070 READ32(*nlink); 3138 if (unlikely(!p))
3139 goto out_overflow;
3140 *nlink = be32_to_cpup(p);
3071 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; 3141 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
3072 ret = NFS_ATTR_FATTR_NLINK; 3142 ret = NFS_ATTR_FATTR_NLINK;
3073 } 3143 }
3074 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); 3144 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
3075 return ret; 3145 return ret;
3146out_overflow:
3147 print_overflow_msg(__func__, xdr);
3148 return -EIO;
3076} 3149}
3077 3150
3078static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) 3151static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3152 struct nfs_client *clp, uint32_t *uid, int may_sleep)
3079{ 3153{
3080 uint32_t len; 3154 uint32_t len;
3081 __be32 *p; 3155 __be32 *p;
@@ -3085,10 +3159,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
3085 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) 3159 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
3086 return -EIO; 3160 return -EIO;
3087 if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { 3161 if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
3088 READ_BUF(4); 3162 p = xdr_inline_decode(xdr, 4);
3089 READ32(len); 3163 if (unlikely(!p))
3090 READ_BUF(len); 3164 goto out_overflow;
3091 if (len < XDR_MAX_NETOBJ) { 3165 len = be32_to_cpup(p);
3166 p = xdr_inline_decode(xdr, len);
3167 if (unlikely(!p))
3168 goto out_overflow;
3169 if (!may_sleep) {
3170 /* do nothing */
3171 } else if (len < XDR_MAX_NETOBJ) {
3092 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) 3172 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
3093 ret = NFS_ATTR_FATTR_OWNER; 3173 ret = NFS_ATTR_FATTR_OWNER;
3094 else 3174 else
@@ -3101,9 +3181,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
3101 } 3181 }
3102 dprintk("%s: uid=%d\n", __func__, (int)*uid); 3182 dprintk("%s: uid=%d\n", __func__, (int)*uid);
3103 return ret; 3183 return ret;
3184out_overflow:
3185 print_overflow_msg(__func__, xdr);
3186 return -EIO;
3104} 3187}
3105 3188
3106static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) 3189static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3190 struct nfs_client *clp, uint32_t *gid, int may_sleep)
3107{ 3191{
3108 uint32_t len; 3192 uint32_t len;
3109 __be32 *p; 3193 __be32 *p;
@@ -3113,10 +3197,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
3113 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) 3197 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
3114 return -EIO; 3198 return -EIO;
3115 if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { 3199 if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
3116 READ_BUF(4); 3200 p = xdr_inline_decode(xdr, 4);
3117 READ32(len); 3201 if (unlikely(!p))
3118 READ_BUF(len); 3202 goto out_overflow;
3119 if (len < XDR_MAX_NETOBJ) { 3203 len = be32_to_cpup(p);
3204 p = xdr_inline_decode(xdr, len);
3205 if (unlikely(!p))
3206 goto out_overflow;
3207 if (!may_sleep) {
3208 /* do nothing */
3209 } else if (len < XDR_MAX_NETOBJ) {
3120 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) 3210 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
3121 ret = NFS_ATTR_FATTR_GROUP; 3211 ret = NFS_ATTR_FATTR_GROUP;
3122 else 3212 else
@@ -3129,6 +3219,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
3129 } 3219 }
3130 dprintk("%s: gid=%d\n", __func__, (int)*gid); 3220 dprintk("%s: gid=%d\n", __func__, (int)*gid);
3131 return ret; 3221 return ret;
3222out_overflow:
3223 print_overflow_msg(__func__, xdr);
3224 return -EIO;
3132} 3225}
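
decode_attr_owner() and decode_attr_group() gain a may_sleep flag because mapping an owner string to a uid/gid may upcall to the userspace idmapper and block. When the caller cannot sleep it passes 0: the string is still consumed from the stream (keeping the decode position correct) but the mapping is skipped and the default id kept. A minimal model of the gate (the helper name is hypothetical):

	static int map_owner_model(struct nfs_client *clp, const char *name,
				   unsigned int len, uint32_t *uid, int may_sleep)
	{
		if (!may_sleep)
			return 0;	/* bytes consumed; keep the default uid */
		if (len >= XDR_MAX_NETOBJ)
			return 0;	/* too long for the idmapper */
		if (nfs_map_name_to_uid(clp, name, len, uid) == 0)
			return NFS_ATTR_FATTR_OWNER;
		return 0;		/* unmappable name keeps the default too */
	}
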
3133 3226
3134static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) 3227static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -3143,9 +3236,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
3143 if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { 3236 if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
3144 dev_t tmp; 3237 dev_t tmp;
3145 3238
3146 READ_BUF(8); 3239 p = xdr_inline_decode(xdr, 8);
3147 READ32(major); 3240 if (unlikely(!p))
3148 READ32(minor); 3241 goto out_overflow;
3242 major = be32_to_cpup(p++);
3243 minor = be32_to_cpup(p);
3149 tmp = MKDEV(major, minor); 3244 tmp = MKDEV(major, minor);
3150 if (MAJOR(tmp) == major && MINOR(tmp) == minor) 3245 if (MAJOR(tmp) == major && MINOR(tmp) == minor)
3151 *rdev = tmp; 3246 *rdev = tmp;
@@ -3154,6 +3249,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
3154 } 3249 }
3155 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); 3250 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
3156 return ret; 3251 return ret;
3252out_overflow:
3253 print_overflow_msg(__func__, xdr);
3254 return -EIO;
3157} 3255}
3158 3256
3159static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 3257static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3165,12 +3263,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
3165 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) 3263 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
3166 return -EIO; 3264 return -EIO;
3167 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { 3265 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
3168 READ_BUF(8); 3266 p = xdr_inline_decode(xdr, 8);
3169 READ64(*res); 3267 if (unlikely(!p))
3268 goto out_overflow;
3269 xdr_decode_hyper(p, res);
3170 bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; 3270 bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
3171 } 3271 }
3172 dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); 3272 dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
3173 return status; 3273 return status;
3274out_overflow:
3275 print_overflow_msg(__func__, xdr);
3276 return -EIO;
3174} 3277}
3175 3278
3176static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 3279static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3182,12 +3285,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
3182 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) 3285 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
3183 return -EIO; 3286 return -EIO;
3184 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { 3287 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
3185 READ_BUF(8); 3288 p = xdr_inline_decode(xdr, 8);
3186 READ64(*res); 3289 if (unlikely(!p))
3290 goto out_overflow;
3291 xdr_decode_hyper(p, res);
3187 bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; 3292 bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
3188 } 3293 }
3189 dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); 3294 dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
3190 return status; 3295 return status;
3296out_overflow:
3297 print_overflow_msg(__func__, xdr);
3298 return -EIO;
3191} 3299}
3192 3300
3193static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 3301static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3199,12 +3307,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
3199 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) 3307 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
3200 return -EIO; 3308 return -EIO;
3201 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { 3309 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
3202 READ_BUF(8); 3310 p = xdr_inline_decode(xdr, 8);
3203 READ64(*res); 3311 if (unlikely(!p))
3312 goto out_overflow;
3313 xdr_decode_hyper(p, res);
3204 bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; 3314 bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
3205 } 3315 }
3206 dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); 3316 dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
3207 return status; 3317 return status;
3318out_overflow:
3319 print_overflow_msg(__func__, xdr);
3320 return -EIO;
3208} 3321}
3209 3322
3210static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) 3323static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -3216,14 +3329,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
3216 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) 3329 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
3217 return -EIO; 3330 return -EIO;
3218 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { 3331 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
3219 READ_BUF(8); 3332 p = xdr_inline_decode(xdr, 8);
3220 READ64(*used); 3333 if (unlikely(!p))
3334 goto out_overflow;
3335 xdr_decode_hyper(p, used);
3221 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; 3336 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
3222 ret = NFS_ATTR_FATTR_SPACE_USED; 3337 ret = NFS_ATTR_FATTR_SPACE_USED;
3223 } 3338 }
3224 dprintk("%s: space used=%Lu\n", __func__, 3339 dprintk("%s: space used=%Lu\n", __func__,
3225 (unsigned long long)*used); 3340 (unsigned long long)*used);
3226 return ret; 3341 return ret;
3342out_overflow:
3343 print_overflow_msg(__func__, xdr);
3344 return -EIO;
3227} 3345}
3228 3346
3229static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) 3347static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -3232,12 +3350,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
3232 uint64_t sec; 3350 uint64_t sec;
3233 uint32_t nsec; 3351 uint32_t nsec;
3234 3352
3235 READ_BUF(12); 3353 p = xdr_inline_decode(xdr, 12);
3236 READ64(sec); 3354 if (unlikely(!p))
3237 READ32(nsec); 3355 goto out_overflow;
3356 p = xdr_decode_hyper(p, &sec);
3357 nsec = be32_to_cpup(p);
3238 time->tv_sec = (time_t)sec; 3358 time->tv_sec = (time_t)sec;
3239 time->tv_nsec = (long)nsec; 3359 time->tv_nsec = (long)nsec;
3240 return 0; 3360 return 0;
3361out_overflow:
3362 print_overflow_msg(__func__, xdr);
3363 return -EIO;
3241} 3364}
3242 3365
3243static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) 3366static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -3315,11 +3438,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
3315{ 3438{
3316 __be32 *p; 3439 __be32 *p;
3317 3440
3318 READ_BUF(20); 3441 p = xdr_inline_decode(xdr, 20);
3319 READ32(cinfo->atomic); 3442 if (unlikely(!p))
3320 READ64(cinfo->before); 3443 goto out_overflow;
3321 READ64(cinfo->after); 3444 cinfo->atomic = be32_to_cpup(p++);
3445 p = xdr_decode_hyper(p, &cinfo->before);
3446 xdr_decode_hyper(p, &cinfo->after);
3322 return 0; 3447 return 0;
3448out_overflow:
3449 print_overflow_msg(__func__, xdr);
3450 return -EIO;
3323} 3451}
3324 3452
3325static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) 3453static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
@@ -3331,40 +3459,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
3331 status = decode_op_hdr(xdr, OP_ACCESS); 3459 status = decode_op_hdr(xdr, OP_ACCESS);
3332 if (status) 3460 if (status)
3333 return status; 3461 return status;
3334 READ_BUF(8); 3462 p = xdr_inline_decode(xdr, 8);
3335 READ32(supp); 3463 if (unlikely(!p))
3336 READ32(acc); 3464 goto out_overflow;
3465 supp = be32_to_cpup(p++);
3466 acc = be32_to_cpup(p);
3337 access->supported = supp; 3467 access->supported = supp;
3338 access->access = acc; 3468 access->access = acc;
3339 return 0; 3469 return 0;
3470out_overflow:
3471 print_overflow_msg(__func__, xdr);
3472 return -EIO;
3340} 3473}
3341 3474
3342static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) 3475static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
3343{ 3476{
3344 __be32 *p; 3477 __be32 *p;
3478
3479 p = xdr_inline_decode(xdr, len);
3480 if (likely(p)) {
3481 memcpy(buf, p, len);
3482 return 0;
3483 }
3484 print_overflow_msg(__func__, xdr);
3485 return -EIO;
3486}
3487
3488static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
3489{
3490 return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
3491}
3492
3493static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
3494{
3345 int status; 3495 int status;
3346 3496
3347 status = decode_op_hdr(xdr, OP_CLOSE); 3497 status = decode_op_hdr(xdr, OP_CLOSE);
3348 if (status != -EIO) 3498 if (status != -EIO)
3349 nfs_increment_open_seqid(status, res->seqid); 3499 nfs_increment_open_seqid(status, res->seqid);
3350 if (status) 3500 if (!status)
3351 return status; 3501 status = decode_stateid(xdr, &res->stateid);
3352 READ_BUF(NFS4_STATEID_SIZE); 3502 return status;
3353 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3503}
3354 return 0; 3504
3505static int decode_verifier(struct xdr_stream *xdr, void *verifier)
3506{
3507 return decode_opaque_fixed(xdr, verifier, 8);
3355} 3508}
3356 3509
3357static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) 3510static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
3358{ 3511{
3359 __be32 *p;
3360 int status; 3512 int status;
3361 3513
3362 status = decode_op_hdr(xdr, OP_COMMIT); 3514 status = decode_op_hdr(xdr, OP_COMMIT);
3363 if (status) 3515 if (!status)
3364 return status; 3516 status = decode_verifier(xdr, res->verf->verifier);
3365 READ_BUF(8); 3517 return status;
3366 COPYMEM(res->verf->verifier, 8);
3367 return 0;
3368} 3518}
3369 3519
3370static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 3520static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3378,10 +3528,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3378 return status; 3528 return status;
3379 if ((status = decode_change_info(xdr, cinfo))) 3529 if ((status = decode_change_info(xdr, cinfo)))
3380 return status; 3530 return status;
3381 READ_BUF(4); 3531 p = xdr_inline_decode(xdr, 4);
3382 READ32(bmlen); 3532 if (unlikely(!p))
3383 READ_BUF(bmlen << 2); 3533 goto out_overflow;
3384 return 0; 3534 bmlen = be32_to_cpup(p);
3535 p = xdr_inline_decode(xdr, bmlen << 2);
3536 if (likely(p))
3537 return 0;
3538out_overflow:
3539 print_overflow_msg(__func__, xdr);
3540 return -EIO;
3385} 3541}
3386 3542
3387static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 3543static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
@@ -3466,7 +3622,8 @@ xdr_error:
3466 return status; 3622 return status;
3467} 3623}
3468 3624
3469static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) 3625static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3626 const struct nfs_server *server, int may_sleep)
3470{ 3627{
3471 __be32 *savep; 3628 __be32 *savep;
3472 uint32_t attrlen, 3629 uint32_t attrlen,
@@ -3538,12 +3695,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
3538 goto xdr_error; 3695 goto xdr_error;
3539 fattr->valid |= status; 3696 fattr->valid |= status;
3540 3697
3541 status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); 3698 status = decode_attr_owner(xdr, bitmap, server->nfs_client,
3699 &fattr->uid, may_sleep);
3542 if (status < 0) 3700 if (status < 0)
3543 goto xdr_error; 3701 goto xdr_error;
3544 fattr->valid |= status; 3702 fattr->valid |= status;
3545 3703
3546 status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); 3704 status = decode_attr_group(xdr, bitmap, server->nfs_client,
3705 &fattr->gid, may_sleep);
3547 if (status < 0) 3706 if (status < 0)
3548 goto xdr_error; 3707 goto xdr_error;
3549 fattr->valid |= status; 3708 fattr->valid |= status;
@@ -3633,14 +3792,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
3633 if (status) 3792 if (status)
3634 return status; 3793 return status;
3635 3794
3636 READ_BUF(4); 3795 p = xdr_inline_decode(xdr, 4);
3637 READ32(len); 3796 if (unlikely(!p))
3797 goto out_overflow;
3798 len = be32_to_cpup(p);
3638 if (len > NFS4_FHSIZE) 3799 if (len > NFS4_FHSIZE)
3639 return -EIO; 3800 return -EIO;
3640 fh->size = len; 3801 fh->size = len;
3641 READ_BUF(len); 3802 p = xdr_inline_decode(xdr, len);
3642 COPYMEM(fh->data, len); 3803 if (unlikely(!p))
3804 goto out_overflow;
3805 memcpy(fh->data, p, len);
3643 return 0; 3806 return 0;
3807out_overflow:
3808 print_overflow_msg(__func__, xdr);
3809 return -EIO;
3644} 3810}
3645 3811
3646static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 3812static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3662,10 +3828,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3662 __be32 *p; 3828 __be32 *p;
3663 uint32_t namelen, type; 3829 uint32_t namelen, type;
3664 3830
3665 READ_BUF(32); 3831 p = xdr_inline_decode(xdr, 32);
3666 READ64(offset); 3832 if (unlikely(!p))
3667 READ64(length); 3833 goto out_overflow;
3668 READ32(type); 3834 p = xdr_decode_hyper(p, &offset);
3835 p = xdr_decode_hyper(p, &length);
3836 type = be32_to_cpup(p++);
3669 if (fl != NULL) { 3837 if (fl != NULL) {
3670 fl->fl_start = (loff_t)offset; 3838 fl->fl_start = (loff_t)offset;
3671 fl->fl_end = fl->fl_start + (loff_t)length - 1; 3839 fl->fl_end = fl->fl_start + (loff_t)length - 1;
@@ -3676,23 +3844,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3676 fl->fl_type = F_RDLCK; 3844 fl->fl_type = F_RDLCK;
3677 fl->fl_pid = 0; 3845 fl->fl_pid = 0;
3678 } 3846 }
3679 READ64(clientid); 3847 p = xdr_decode_hyper(p, &clientid);
3680 READ32(namelen); 3848 namelen = be32_to_cpup(p);
3681 READ_BUF(namelen); 3849 p = xdr_inline_decode(xdr, namelen);
3682 return -NFS4ERR_DENIED; 3850 if (likely(p))
3851 return -NFS4ERR_DENIED;
3852out_overflow:
3853 print_overflow_msg(__func__, xdr);
3854 return -EIO;
3683} 3855}
3684 3856
3685static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) 3857static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
3686{ 3858{
3687 __be32 *p;
3688 int status; 3859 int status;
3689 3860
3690 status = decode_op_hdr(xdr, OP_LOCK); 3861 status = decode_op_hdr(xdr, OP_LOCK);
3691 if (status == -EIO) 3862 if (status == -EIO)
3692 goto out; 3863 goto out;
3693 if (status == 0) { 3864 if (status == 0) {
3694 READ_BUF(NFS4_STATEID_SIZE); 3865 status = decode_stateid(xdr, &res->stateid);
3695 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3866 if (unlikely(status))
3867 goto out;
3696 } else if (status == -NFS4ERR_DENIED) 3868 } else if (status == -NFS4ERR_DENIED)
3697 status = decode_lock_denied(xdr, NULL); 3869 status = decode_lock_denied(xdr, NULL);
3698 if (res->open_seqid != NULL) 3870 if (res->open_seqid != NULL)
@@ -3713,16 +3885,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
3713 3885
3714static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) 3886static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
3715{ 3887{
3716 __be32 *p;
3717 int status; 3888 int status;
3718 3889
3719 status = decode_op_hdr(xdr, OP_LOCKU); 3890 status = decode_op_hdr(xdr, OP_LOCKU);
3720 if (status != -EIO) 3891 if (status != -EIO)
3721 nfs_increment_lock_seqid(status, res->seqid); 3892 nfs_increment_lock_seqid(status, res->seqid);
3722 if (status == 0) { 3893 if (status == 0)
3723 READ_BUF(NFS4_STATEID_SIZE); 3894 status = decode_stateid(xdr, &res->stateid);
3724 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3725 }
3726 return status; 3895 return status;
3727} 3896}
3728 3897
@@ -3737,34 +3906,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
3737 __be32 *p; 3906 __be32 *p;
3738 uint32_t limit_type, nblocks, blocksize; 3907 uint32_t limit_type, nblocks, blocksize;
3739 3908
3740 READ_BUF(12); 3909 p = xdr_inline_decode(xdr, 12);
3741 READ32(limit_type); 3910 if (unlikely(!p))
3911 goto out_overflow;
3912 limit_type = be32_to_cpup(p++);
3742 switch (limit_type) { 3913 switch (limit_type) {
3743 case 1: 3914 case 1:
3744 READ64(*maxsize); 3915 xdr_decode_hyper(p, maxsize);
3745 break; 3916 break;
3746 case 2: 3917 case 2:
3747 READ32(nblocks); 3918 nblocks = be32_to_cpup(p++);
3748 READ32(blocksize); 3919 blocksize = be32_to_cpup(p);
3749 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; 3920 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
3750 } 3921 }
3751 return 0; 3922 return 0;
3923out_overflow:
3924 print_overflow_msg(__func__, xdr);
3925 return -EIO;
3752} 3926}
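
In decode_space_limit(), limit_type 1 carries an absolute byte limit while type 2 carries a (blocks, blocksize) pair; the uint64_t casts on the multiply matter because the product can exceed 32 bits. For example:

	/* limit_type == 2, nblocks = 0x200000, blocksize = 4096:
	 * (uint64_t)0x200000 * 4096 = 0x200000000 (8 GiB), which a
	 * 32-bit multiply would truncate to 0 */
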
3753 3927
3754static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) 3928static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3755{ 3929{
3756 __be32 *p; 3930 __be32 *p;
3757 uint32_t delegation_type; 3931 uint32_t delegation_type;
3932 int status;
3758 3933
3759 READ_BUF(4); 3934 p = xdr_inline_decode(xdr, 4);
3760 READ32(delegation_type); 3935 if (unlikely(!p))
3936 goto out_overflow;
3937 delegation_type = be32_to_cpup(p);
3761 if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { 3938 if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
3762 res->delegation_type = 0; 3939 res->delegation_type = 0;
3763 return 0; 3940 return 0;
3764 } 3941 }
3765 READ_BUF(NFS4_STATEID_SIZE+4); 3942 status = decode_stateid(xdr, &res->delegation);
3766 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); 3943 if (unlikely(status))
3767 READ32(res->do_recall); 3944 return status;
3945 p = xdr_inline_decode(xdr, 4);
3946 if (unlikely(!p))
3947 goto out_overflow;
3948 res->do_recall = be32_to_cpup(p);
3768 3949
3769 switch (delegation_type) { 3950 switch (delegation_type) {
3770 case NFS4_OPEN_DELEGATE_READ: 3951 case NFS4_OPEN_DELEGATE_READ:
@@ -3776,6 +3957,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3776 return -EIO; 3957 return -EIO;
3777 } 3958 }
3778 return decode_ace(xdr, NULL, res->server->nfs_client); 3959 return decode_ace(xdr, NULL, res->server->nfs_client);
3960out_overflow:
3961 print_overflow_msg(__func__, xdr);
3962 return -EIO;
3779} 3963}
3780 3964
3781static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 3965static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3787,23 +3971,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3787 status = decode_op_hdr(xdr, OP_OPEN); 3971 status = decode_op_hdr(xdr, OP_OPEN);
3788 if (status != -EIO) 3972 if (status != -EIO)
3789 nfs_increment_open_seqid(status, res->seqid); 3973 nfs_increment_open_seqid(status, res->seqid);
3790 if (status) 3974 if (!status)
3975 status = decode_stateid(xdr, &res->stateid);
3976 if (unlikely(status))
3791 return status; 3977 return status;
3792 READ_BUF(NFS4_STATEID_SIZE);
3793 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3794 3978
3795 decode_change_info(xdr, &res->cinfo); 3979 decode_change_info(xdr, &res->cinfo);
3796 3980
3797 READ_BUF(8); 3981 p = xdr_inline_decode(xdr, 8);
3798 READ32(res->rflags); 3982 if (unlikely(!p))
3799 READ32(bmlen); 3983 goto out_overflow;
3984 res->rflags = be32_to_cpup(p++);
3985 bmlen = be32_to_cpup(p);
3800 if (bmlen > 10) 3986 if (bmlen > 10)
3801 goto xdr_error; 3987 goto xdr_error;
3802 3988
3803 READ_BUF(bmlen << 2); 3989 p = xdr_inline_decode(xdr, bmlen << 2);
3990 if (unlikely(!p))
3991 goto out_overflow;
3804 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); 3992 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
3805 for (i = 0; i < savewords; ++i) 3993 for (i = 0; i < savewords; ++i)
3806 READ32(res->attrset[i]); 3994 res->attrset[i] = be32_to_cpup(p++);
3807 for (; i < NFS4_BITMAP_SIZE; i++) 3995 for (; i < NFS4_BITMAP_SIZE; i++)
3808 res->attrset[i] = 0; 3996 res->attrset[i] = 0;
3809 3997
@@ -3811,36 +3999,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3811xdr_error: 3999xdr_error:
3812 dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); 4000 dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
3813 return -EIO; 4001 return -EIO;
4002out_overflow:
4003 print_overflow_msg(__func__, xdr);
4004 return -EIO;
3814} 4005}
3815 4006
3816static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) 4007static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
3817{ 4008{
3818 __be32 *p;
3819 int status; 4009 int status;
3820 4010
3821 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); 4011 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
3822 if (status != -EIO) 4012 if (status != -EIO)
3823 nfs_increment_open_seqid(status, res->seqid); 4013 nfs_increment_open_seqid(status, res->seqid);
3824 if (status) 4014 if (!status)
3825 return status; 4015 status = decode_stateid(xdr, &res->stateid);
3826 READ_BUF(NFS4_STATEID_SIZE); 4016 return status;
3827 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3828 return 0;
3829} 4017}
3830 4018
3831static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) 4019static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
3832{ 4020{
3833 __be32 *p;
3834 int status; 4021 int status;
3835 4022
3836 status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); 4023 status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
3837 if (status != -EIO) 4024 if (status != -EIO)
3838 nfs_increment_open_seqid(status, res->seqid); 4025 nfs_increment_open_seqid(status, res->seqid);
3839 if (status) 4026 if (!status)
3840 return status; 4027 status = decode_stateid(xdr, &res->stateid);
3841 READ_BUF(NFS4_STATEID_SIZE); 4028 return status;
3842 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3843 return 0;
3844} 4029}
3845 4030
3846static int decode_putfh(struct xdr_stream *xdr) 4031static int decode_putfh(struct xdr_stream *xdr)
@@ -3863,9 +4048,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
3863 status = decode_op_hdr(xdr, OP_READ); 4048 status = decode_op_hdr(xdr, OP_READ);
3864 if (status) 4049 if (status)
3865 return status; 4050 return status;
3866 READ_BUF(8); 4051 p = xdr_inline_decode(xdr, 8);
3867 READ32(eof); 4052 if (unlikely(!p))
3868 READ32(count); 4053 goto out_overflow;
4054 eof = be32_to_cpup(p++);
4055 count = be32_to_cpup(p);
3869 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 4056 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
3870 recvd = req->rq_rcv_buf.len - hdrlen; 4057 recvd = req->rq_rcv_buf.len - hdrlen;
3871 if (count > recvd) { 4058 if (count > recvd) {
@@ -3878,6 +4065,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
3878 res->eof = eof; 4065 res->eof = eof;
3879 res->count = count; 4066 res->count = count;
3880 return 0; 4067 return 0;
4068out_overflow:
4069 print_overflow_msg(__func__, xdr);
4070 return -EIO;
3881} 4071}
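
The clamp guarded by `if (count > recvd)` (its body falls outside this hunk) is what makes the inline decode safe end-to-end: COUNT is server-controlled, while recvd is derived from bytes the transport actually delivered. A sketch of the bounds math, with the clamp itself reconstructed as an assumption:

	hdrlen = (u8 *) p - (u8 *) iov->iov_base;	/* reply-head bytes ahead of the data */
	recvd = req->rq_rcv_buf.len - hdrlen;		/* payload actually on the wire */
	if (count > recvd)				/* never trust COUNT beyond that */
		count = recvd;				/* (elided body, presumed) */
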
3882 4072
3883static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) 4073static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -3892,17 +4082,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3892 int status; 4082 int status;
3893 4083
3894 status = decode_op_hdr(xdr, OP_READDIR); 4084 status = decode_op_hdr(xdr, OP_READDIR);
3895 if (status) 4085 if (!status)
4086 status = decode_verifier(xdr, readdir->verifier.data);
4087 if (unlikely(status))
3896 return status; 4088 return status;
3897 READ_BUF(8);
3898 COPYMEM(readdir->verifier.data, 8);
3899 dprintk("%s: verifier = %08x:%08x\n", 4089 dprintk("%s: verifier = %08x:%08x\n",
3900 __func__, 4090 __func__,
3901 ((u32 *)readdir->verifier.data)[0], 4091 ((u32 *)readdir->verifier.data)[0],
3902 ((u32 *)readdir->verifier.data)[1]); 4092 ((u32 *)readdir->verifier.data)[1]);
3903 4093
3904 4094
3905 hdrlen = (char *) p - (char *) iov->iov_base; 4095 hdrlen = (char *) xdr->p - (char *) iov->iov_base;
3906 recvd = rcvbuf->len - hdrlen; 4096 recvd = rcvbuf->len - hdrlen;
3907 if (pglen > recvd) 4097 if (pglen > recvd)
3908 pglen = recvd; 4098 pglen = recvd;
@@ -3990,8 +4180,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
3990 return status; 4180 return status;
3991 4181
3992 /* Convert length of symlink */ 4182 /* Convert length of symlink */
3993 READ_BUF(4); 4183 p = xdr_inline_decode(xdr, 4);
3994 READ32(len); 4184 if (unlikely(!p))
4185 goto out_overflow;
4186 len = be32_to_cpup(p);
3995 if (len >= rcvbuf->page_len || len <= 0) { 4187 if (len >= rcvbuf->page_len || len <= 0) {
3996 dprintk("nfs: server returned giant symlink!\n"); 4188 dprintk("nfs: server returned giant symlink!\n");
3997 return -ENAMETOOLONG; 4189 return -ENAMETOOLONG;
@@ -4015,6 +4207,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
4015 kaddr[len+rcvbuf->page_base] = '\0'; 4207 kaddr[len+rcvbuf->page_base] = '\0';
4016 kunmap_atomic(kaddr, KM_USER0); 4208 kunmap_atomic(kaddr, KM_USER0);
4017 return 0; 4209 return 0;
4210out_overflow:
4211 print_overflow_msg(__func__, xdr);
4212 return -EIO;
4018} 4213}
4019 4214
4020static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 4215static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -4112,10 +4307,16 @@ static int decode_setattr(struct xdr_stream *xdr)
4112 status = decode_op_hdr(xdr, OP_SETATTR); 4307 status = decode_op_hdr(xdr, OP_SETATTR);
4113 if (status) 4308 if (status)
4114 return status; 4309 return status;
4115 READ_BUF(4); 4310 p = xdr_inline_decode(xdr, 4);
4116 READ32(bmlen); 4311 if (unlikely(!p))
4117 READ_BUF(bmlen << 2); 4312 goto out_overflow;
4118 return 0; 4313 bmlen = be32_to_cpup(p);
4314 p = xdr_inline_decode(xdr, bmlen << 2);
4315 if (likely(p))
4316 return 0;
4317out_overflow:
4318 print_overflow_msg(__func__, xdr);
4319 return -EIO;
4119} 4320}
4120 4321
4121static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) 4322static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
@@ -4124,35 +4325,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
4124 uint32_t opnum; 4325 uint32_t opnum;
4125 int32_t nfserr; 4326 int32_t nfserr;
4126 4327
4127 READ_BUF(8); 4328 p = xdr_inline_decode(xdr, 8);
4128 READ32(opnum); 4329 if (unlikely(!p))
4330 goto out_overflow;
4331 opnum = be32_to_cpup(p++);
4129 if (opnum != OP_SETCLIENTID) { 4332 if (opnum != OP_SETCLIENTID) {
4130 dprintk("nfs: decode_setclientid: Server returned operation" 4333 dprintk("nfs: decode_setclientid: Server returned operation"
4131 " %d\n", opnum); 4334 " %d\n", opnum);
4132 return -EIO; 4335 return -EIO;
4133 } 4336 }
4134 READ32(nfserr); 4337 nfserr = be32_to_cpup(p);
4135 if (nfserr == NFS_OK) { 4338 if (nfserr == NFS_OK) {
4136 READ_BUF(8 + NFS4_VERIFIER_SIZE); 4339 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
4137 READ64(clp->cl_clientid); 4340 if (unlikely(!p))
4138 COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); 4341 goto out_overflow;
4342 p = xdr_decode_hyper(p, &clp->cl_clientid);
4343 memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
4139 } else if (nfserr == NFSERR_CLID_INUSE) { 4344 } else if (nfserr == NFSERR_CLID_INUSE) {
4140 uint32_t len; 4345 uint32_t len;
4141 4346
4142 /* skip netid string */ 4347 /* skip netid string */
4143 READ_BUF(4); 4348 p = xdr_inline_decode(xdr, 4);
4144 READ32(len); 4349 if (unlikely(!p))
4145 READ_BUF(len); 4350 goto out_overflow;
4351 len = be32_to_cpup(p);
4352 p = xdr_inline_decode(xdr, len);
4353 if (unlikely(!p))
4354 goto out_overflow;
4146 4355
4147 /* skip uaddr string */ 4356 /* skip uaddr string */
4148 READ_BUF(4); 4357 p = xdr_inline_decode(xdr, 4);
4149 READ32(len); 4358 if (unlikely(!p))
4150 READ_BUF(len); 4359 goto out_overflow;
4360 len = be32_to_cpup(p);
4361 p = xdr_inline_decode(xdr, len);
4362 if (unlikely(!p))
4363 goto out_overflow;
4151 return -NFSERR_CLID_INUSE; 4364 return -NFSERR_CLID_INUSE;
4152 } else 4365 } else
4153 return nfs4_stat_to_errno(nfserr); 4366 return nfs4_stat_to_errno(nfserr);
4154 4367
4155 return 0; 4368 return 0;
4369out_overflow:
4370 print_overflow_msg(__func__, xdr);
4371 return -EIO;
4156} 4372}
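
The NFSERR_CLID_INUSE branch now skips the netid and uaddr strings with two identical length-then-payload sequences. A hypothetical helper capturing that idiom (not part of this diff; the open-coded version above is what was merged):

	/* Hypothetical: skip one counted XDR item (length word, then payload). */
	static int xdr_skip_opaque(struct xdr_stream *xdr)
	{
		__be32 *p;
		uint32_t len;

		p = xdr_inline_decode(xdr, 4);
		if (unlikely(!p))
			return -EIO;
		len = be32_to_cpup(p);
		p = xdr_inline_decode(xdr, len);
		if (unlikely(!p))
			return -EIO;
		return 0;
	}
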
4157 4373
4158static int decode_setclientid_confirm(struct xdr_stream *xdr) 4374static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -4169,11 +4385,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
4169 if (status) 4385 if (status)
4170 return status; 4386 return status;
4171 4387
4172 READ_BUF(16); 4388 p = xdr_inline_decode(xdr, 16);
4173 READ32(res->count); 4389 if (unlikely(!p))
4174 READ32(res->verf->committed); 4390 goto out_overflow;
4175 COPYMEM(res->verf->verifier, 8); 4391 res->count = be32_to_cpup(p++);
4392 res->verf->committed = be32_to_cpup(p++);
4393 memcpy(res->verf->verifier, p, 8);
4176 return 0; 4394 return 0;
4395out_overflow:
4396 print_overflow_msg(__func__, xdr);
4397 return -EIO;
4177} 4398}
4178 4399
4179static int decode_delegreturn(struct xdr_stream *xdr) 4400static int decode_delegreturn(struct xdr_stream *xdr)
@@ -4187,6 +4408,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4187{ 4408{
4188 __be32 *p; 4409 __be32 *p;
4189 uint32_t dummy; 4410 uint32_t dummy;
4411 char *dummy_str;
4190 int status; 4412 int status;
4191 struct nfs_client *clp = res->client; 4413 struct nfs_client *clp = res->client;
4192 4414
@@ -4194,36 +4416,45 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4194 if (status) 4416 if (status)
4195 return status; 4417 return status;
4196 4418
4197 READ_BUF(8); 4419 p = xdr_inline_decode(xdr, 8);
4198 READ64(clp->cl_ex_clid); 4420 if (unlikely(!p))
4199 READ_BUF(12); 4421 goto out_overflow;
4200 READ32(clp->cl_seqid); 4422 xdr_decode_hyper(p, &clp->cl_ex_clid);
4201 READ32(clp->cl_exchange_flags); 4423 p = xdr_inline_decode(xdr, 12);
4424 if (unlikely(!p))
4425 goto out_overflow;
4426 clp->cl_seqid = be32_to_cpup(p++);
4427 clp->cl_exchange_flags = be32_to_cpup(p++);
4202 4428
4203 /* We ask for SP4_NONE */ 4429 /* We ask for SP4_NONE */
4204 READ32(dummy); 4430 dummy = be32_to_cpup(p);
4205 if (dummy != SP4_NONE) 4431 if (dummy != SP4_NONE)
4206 return -EIO; 4432 return -EIO;
4207 4433
4208 /* Throw away minor_id */ 4434 /* Throw away minor_id */
4209 READ_BUF(8); 4435 p = xdr_inline_decode(xdr, 8);
4436 if (unlikely(!p))
4437 goto out_overflow;
4210 4438
4211 /* Throw away Major id */ 4439 /* Throw away Major id */
4212 READ_BUF(4); 4440 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4213 READ32(dummy); 4441 if (unlikely(status))
4214 READ_BUF(dummy); 4442 return status;
4215 4443
4216 /* Throw away server_scope */ 4444 /* Throw away server_scope */
4217 READ_BUF(4); 4445 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4218 READ32(dummy); 4446 if (unlikely(status))
4219 READ_BUF(dummy); 4447 return status;
4220 4448
4221 /* Throw away Implementation id array */ 4449 /* Throw away Implementation id array */
4222 READ_BUF(4); 4450 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4223 READ32(dummy); 4451 if (unlikely(status))
4224 READ_BUF(dummy); 4452 return status;
4225 4453
4226 return 0; 4454 return 0;
4455out_overflow:
4456 print_overflow_msg(__func__, xdr);
4457 return -EIO;
4227} 4458}
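
decode_exchange_id() now discards the major server owner, server scope and implementation-id fields through decode_opaque_inline(), which is defined earlier in the file and not shown here. From these call sites its contract is presumably:

	/* Presumed shape: return the length and an in-buffer pointer for one
	 * counted opaque, failing with -EIO on a short reply. */
	static int decode_opaque_inline(struct xdr_stream *xdr,
					unsigned int *len, char **string)
	{
		__be32 *p;

		p = xdr_inline_decode(xdr, 4);
		if (unlikely(!p))
			goto out_overflow;
		*len = be32_to_cpup(p);
		p = xdr_inline_decode(xdr, *len);
		if (unlikely(!p))
			goto out_overflow;
		*string = (char *)p;		/* points into the receive buffer */
		return 0;
	out_overflow:
		print_overflow_msg(__func__, xdr);
		return -EIO;
	}
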
4228 4459
4229static int decode_chan_attrs(struct xdr_stream *xdr, 4460static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -4232,22 +4463,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
4232 __be32 *p; 4463 __be32 *p;
4233 u32 nr_attrs; 4464 u32 nr_attrs;
4234 4465
4235 READ_BUF(28); 4466 p = xdr_inline_decode(xdr, 28);
4236 READ32(attrs->headerpadsz); 4467 if (unlikely(!p))
4237 READ32(attrs->max_rqst_sz); 4468 goto out_overflow;
4238 READ32(attrs->max_resp_sz); 4469 attrs->headerpadsz = be32_to_cpup(p++);
4239 READ32(attrs->max_resp_sz_cached); 4470 attrs->max_rqst_sz = be32_to_cpup(p++);
4240 READ32(attrs->max_ops); 4471 attrs->max_resp_sz = be32_to_cpup(p++);
4241 READ32(attrs->max_reqs); 4472 attrs->max_resp_sz_cached = be32_to_cpup(p++);
4242 READ32(nr_attrs); 4473 attrs->max_ops = be32_to_cpup(p++);
4474 attrs->max_reqs = be32_to_cpup(p++);
4475 nr_attrs = be32_to_cpup(p);
4243 if (unlikely(nr_attrs > 1)) { 4476 if (unlikely(nr_attrs > 1)) {
4244 printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", 4477 printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
4245 __func__, nr_attrs); 4478 __func__, nr_attrs);
4246 return -EINVAL; 4479 return -EINVAL;
4247 } 4480 }
4248 if (nr_attrs == 1) 4481 if (nr_attrs == 1) {
4249 READ_BUF(4); /* skip rdma_attrs */ 4482 p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
4483 if (unlikely(!p))
4484 goto out_overflow;
4485 }
4250 return 0; 4486 return 0;
4487out_overflow:
4488 print_overflow_msg(__func__, xdr);
4489 return -EIO;
4490}
4491
4492static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
4493{
4494 return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
4251} 4495}
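
decode_sessionid() leans on decode_opaque_fixed(), also defined earlier in the file. A sketch of what such a fixed-length copy helper looks like (an assumption, mirroring the READ_BUF + COPYMEM pairs it replaces):

	static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
	{
		__be32 *p;

		p = xdr_inline_decode(xdr, len);
		if (likely(p)) {
			memcpy(buf, p, len);	/* fixed-size blob, no length word */
			return 0;
		}
		print_overflow_msg(__func__, xdr);
		return -EIO;
	}
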
4252 4496
4253static int decode_create_session(struct xdr_stream *xdr, 4497static int decode_create_session(struct xdr_stream *xdr,
@@ -4259,24 +4503,26 @@ static int decode_create_session(struct xdr_stream *xdr,
4259 struct nfs4_session *session = clp->cl_session; 4503 struct nfs4_session *session = clp->cl_session;
4260 4504
4261 status = decode_op_hdr(xdr, OP_CREATE_SESSION); 4505 status = decode_op_hdr(xdr, OP_CREATE_SESSION);
4262 4506 if (!status)
4263 if (status) 4507 status = decode_sessionid(xdr, &session->sess_id);
4508 if (unlikely(status))
4264 return status; 4509 return status;
4265 4510
4266 /* sessionid */
4267 READ_BUF(NFS4_MAX_SESSIONID_LEN);
4268 COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
4269
4270 /* seqid, flags */ 4511 /* seqid, flags */
4271 READ_BUF(8); 4512 p = xdr_inline_decode(xdr, 8);
4272 READ32(clp->cl_seqid); 4513 if (unlikely(!p))
4273 READ32(session->flags); 4514 goto out_overflow;
4515 clp->cl_seqid = be32_to_cpup(p++);
4516 session->flags = be32_to_cpup(p);
4274 4517
4275 /* Channel attributes */ 4518 /* Channel attributes */
4276 status = decode_chan_attrs(xdr, &session->fc_attrs); 4519 status = decode_chan_attrs(xdr, &session->fc_attrs);
4277 if (!status) 4520 if (!status)
4278 status = decode_chan_attrs(xdr, &session->bc_attrs); 4521 status = decode_chan_attrs(xdr, &session->bc_attrs);
4279 return status; 4522 return status;
4523out_overflow:
4524 print_overflow_msg(__func__, xdr);
4525 return -EIO;
4280} 4526}
4281 4527
4282static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) 4528static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -4300,7 +4546,9 @@ static int decode_sequence(struct xdr_stream *xdr,
4300 return 0; 4546 return 0;
4301 4547
4302 status = decode_op_hdr(xdr, OP_SEQUENCE); 4548 status = decode_op_hdr(xdr, OP_SEQUENCE);
4303 if (status) 4549 if (!status)
4550 status = decode_sessionid(xdr, &id);
4551 if (unlikely(status))
4304 goto out_err; 4552 goto out_err;
4305 4553
4306 /* 4554 /*
@@ -4309,36 +4557,43 @@ static int decode_sequence(struct xdr_stream *xdr,
4309 */ 4557 */
4310 status = -ESERVERFAULT; 4558 status = -ESERVERFAULT;
4311 4559
4312 slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
4313 READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
4314 COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
4315 if (memcmp(id.data, res->sr_session->sess_id.data, 4560 if (memcmp(id.data, res->sr_session->sess_id.data,
4316 NFS4_MAX_SESSIONID_LEN)) { 4561 NFS4_MAX_SESSIONID_LEN)) {
4317 dprintk("%s Invalid session id\n", __func__); 4562 dprintk("%s Invalid session id\n", __func__);
4318 goto out_err; 4563 goto out_err;
4319 } 4564 }
4565
4566 p = xdr_inline_decode(xdr, 20);
4567 if (unlikely(!p))
4568 goto out_overflow;
4569
4320 /* seqid */ 4570 /* seqid */
4321 READ32(dummy); 4571 slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
4572 dummy = be32_to_cpup(p++);
4322 if (dummy != slot->seq_nr) { 4573 if (dummy != slot->seq_nr) {
4323 dprintk("%s Invalid sequence number\n", __func__); 4574 dprintk("%s Invalid sequence number\n", __func__);
4324 goto out_err; 4575 goto out_err;
4325 } 4576 }
4326 /* slot id */ 4577 /* slot id */
4327 READ32(dummy); 4578 dummy = be32_to_cpup(p++);
4328 if (dummy != res->sr_slotid) { 4579 if (dummy != res->sr_slotid) {
4329 dprintk("%s Invalid slot id\n", __func__); 4580 dprintk("%s Invalid slot id\n", __func__);
4330 goto out_err; 4581 goto out_err;
4331 } 4582 }
4332 /* highest slot id - currently not processed */ 4583 /* highest slot id - currently not processed */
4333 READ32(dummy); 4584 dummy = be32_to_cpup(p++);
4334 /* target highest slot id - currently not processed */ 4585 /* target highest slot id - currently not processed */
4335 READ32(dummy); 4586 dummy = be32_to_cpup(p++);
4336 /* result flags - currently not processed */ 4587 /* result flags - currently not processed */
4337 READ32(dummy); 4588 dummy = be32_to_cpup(p);
4338 status = 0; 4589 status = 0;
4339out_err: 4590out_err:
4340 res->sr_status = status; 4591 res->sr_status = status;
4341 return status; 4592 return status;
4593out_overflow:
4594 print_overflow_msg(__func__, xdr);
4595 status = -EIO;
4596 goto out_err;
4342#else /* CONFIG_NFS_V4_1 */ 4597#else /* CONFIG_NFS_V4_1 */
4343 return 0; 4598 return 0;
4344#endif /* CONFIG_NFS_V4_1 */ 4599#endif /* CONFIG_NFS_V4_1 */
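
For reference, the 20 bytes decoded after the session-id check cover the five fixed words of an NFSv4.1 SEQUENCE result (field names per the NFSv4.1 specification, RFC 5661):

	/*
	 *  word 0  sr_sequenceid              checked against slot->seq_nr
	 *  word 1  sr_slotid                  checked against res->sr_slotid
	 *  word 2  sr_highest_slotid          currently not processed
	 *  word 3  sr_target_highest_slotid   currently not processed
	 *  word 4  sr_status_flags            currently not processed
	 */
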
@@ -4370,7 +4625,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
4370 status = decode_open_downgrade(&xdr, res); 4625 status = decode_open_downgrade(&xdr, res);
4371 if (status != 0) 4626 if (status != 0)
4372 goto out; 4627 goto out;
4373 decode_getfattr(&xdr, res->fattr, res->server); 4628 decode_getfattr(&xdr, res->fattr, res->server,
4629 !RPC_IS_ASYNC(rqstp->rq_task));
4374out: 4630out:
4375 return status; 4631 return status;
4376} 4632}
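
decode_getfattr() grows a fourth argument at every call site below. Its definition is outside this section, but the pattern is consistent: synchronous RPC tasks pass true, rpciod-driven async tasks pass false, so the flag presumably tells the attribute decoder whether it may sleep (for example while waiting on an idmapper upcall). Illustrative signature only:

	static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
				   const struct nfs_server *server, int may_sleep);
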
@@ -4397,7 +4653,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
4397 status = decode_access(&xdr, res); 4653 status = decode_access(&xdr, res);
4398 if (status != 0) 4654 if (status != 0)
4399 goto out; 4655 goto out;
4400 decode_getfattr(&xdr, res->fattr, res->server); 4656 decode_getfattr(&xdr, res->fattr, res->server,
4657 !RPC_IS_ASYNC(rqstp->rq_task));
4401out: 4658out:
4402 return status; 4659 return status;
4403} 4660}
@@ -4424,7 +4681,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
4424 goto out; 4681 goto out;
4425 if ((status = decode_getfh(&xdr, res->fh)) != 0) 4682 if ((status = decode_getfh(&xdr, res->fh)) != 0)
4426 goto out; 4683 goto out;
4427 status = decode_getfattr(&xdr, res->fattr, res->server); 4684 status = decode_getfattr(&xdr, res->fattr, res->server,
4685 !RPC_IS_ASYNC(rqstp->rq_task));
4428out: 4686out:
4429 return status; 4687 return status;
4430} 4688}
@@ -4448,7 +4706,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
4448 if ((status = decode_putrootfh(&xdr)) != 0) 4706 if ((status = decode_putrootfh(&xdr)) != 0)
4449 goto out; 4707 goto out;
4450 if ((status = decode_getfh(&xdr, res->fh)) == 0) 4708 if ((status = decode_getfh(&xdr, res->fh)) == 0)
4451 status = decode_getfattr(&xdr, res->fattr, res->server); 4709 status = decode_getfattr(&xdr, res->fattr, res->server,
4710 !RPC_IS_ASYNC(rqstp->rq_task));
4452out: 4711out:
4453 return status; 4712 return status;
4454} 4713}
@@ -4473,7 +4732,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
4473 goto out; 4732 goto out;
4474 if ((status = decode_remove(&xdr, &res->cinfo)) != 0) 4733 if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
4475 goto out; 4734 goto out;
4476 decode_getfattr(&xdr, &res->dir_attr, res->server); 4735 decode_getfattr(&xdr, &res->dir_attr, res->server,
4736 !RPC_IS_ASYNC(rqstp->rq_task));
4477out: 4737out:
4478 return status; 4738 return status;
4479} 4739}
@@ -4503,11 +4763,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
4503 if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) 4763 if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
4504 goto out; 4764 goto out;
4505 /* Current FH is target directory */ 4765 /* Current FH is target directory */
4506 if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0) 4766 if (decode_getfattr(&xdr, res->new_fattr, res->server,
4767 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
4507 goto out; 4768 goto out;
4508 if ((status = decode_restorefh(&xdr)) != 0) 4769 if ((status = decode_restorefh(&xdr)) != 0)
4509 goto out; 4770 goto out;
4510 decode_getfattr(&xdr, res->old_fattr, res->server); 4771 decode_getfattr(&xdr, res->old_fattr, res->server,
4772 !RPC_IS_ASYNC(rqstp->rq_task));
4511out: 4773out:
4512 return status; 4774 return status;
4513} 4775}
@@ -4540,11 +4802,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
4540 * Note order: OP_LINK leaves the directory as the current 4802 * Note order: OP_LINK leaves the directory as the current
4541 * filehandle. 4803 * filehandle.
4542 */ 4804 */
4543 if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0) 4805 if (decode_getfattr(&xdr, res->dir_attr, res->server,
4806 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
4544 goto out; 4807 goto out;
4545 if ((status = decode_restorefh(&xdr)) != 0) 4808 if ((status = decode_restorefh(&xdr)) != 0)
4546 goto out; 4809 goto out;
4547 decode_getfattr(&xdr, res->fattr, res->server); 4810 decode_getfattr(&xdr, res->fattr, res->server,
4811 !RPC_IS_ASYNC(rqstp->rq_task));
4548out: 4812out:
4549 return status; 4813 return status;
4550} 4814}
@@ -4573,11 +4837,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
4573 goto out; 4837 goto out;
4574 if ((status = decode_getfh(&xdr, res->fh)) != 0) 4838 if ((status = decode_getfh(&xdr, res->fh)) != 0)
4575 goto out; 4839 goto out;
4576 if (decode_getfattr(&xdr, res->fattr, res->server) != 0) 4840 if (decode_getfattr(&xdr, res->fattr, res->server,
4841 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
4577 goto out; 4842 goto out;
4578 if ((status = decode_restorefh(&xdr)) != 0) 4843 if ((status = decode_restorefh(&xdr)) != 0)
4579 goto out; 4844 goto out;
4580 decode_getfattr(&xdr, res->dir_fattr, res->server); 4845 decode_getfattr(&xdr, res->dir_fattr, res->server,
4846 !RPC_IS_ASYNC(rqstp->rq_task));
4581out: 4847out:
4582 return status; 4848 return status;
4583} 4849}
@@ -4609,7 +4875,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4609 status = decode_putfh(&xdr); 4875 status = decode_putfh(&xdr);
4610 if (status) 4876 if (status)
4611 goto out; 4877 goto out;
4612 status = decode_getfattr(&xdr, res->fattr, res->server); 4878 status = decode_getfattr(&xdr, res->fattr, res->server,
4879 !RPC_IS_ASYNC(rqstp->rq_task));
4613out: 4880out:
4614 return status; 4881 return status;
4615} 4882}
@@ -4716,7 +4983,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
4716 * an ESTALE error. Shouldn't be a problem, 4983 * an ESTALE error. Shouldn't be a problem,
4717 * though, since fattr->valid will remain unset. 4984 * though, since fattr->valid will remain unset.
4718 */ 4985 */
4719 decode_getfattr(&xdr, res->fattr, res->server); 4986 decode_getfattr(&xdr, res->fattr, res->server,
4987 !RPC_IS_ASYNC(rqstp->rq_task));
4720out: 4988out:
4721 return status; 4989 return status;
4722} 4990}
@@ -4748,11 +5016,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
4748 goto out; 5016 goto out;
4749 if (decode_getfh(&xdr, &res->fh) != 0) 5017 if (decode_getfh(&xdr, &res->fh) != 0)
4750 goto out; 5018 goto out;
4751 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) 5019 if (decode_getfattr(&xdr, res->f_attr, res->server,
5020 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
4752 goto out; 5021 goto out;
4753 if (decode_restorefh(&xdr) != 0) 5022 if (decode_restorefh(&xdr) != 0)
4754 goto out; 5023 goto out;
4755 decode_getfattr(&xdr, res->dir_attr, res->server); 5024 decode_getfattr(&xdr, res->dir_attr, res->server,
5025 !RPC_IS_ASYNC(rqstp->rq_task));
4756out: 5026out:
4757 return status; 5027 return status;
4758} 5028}
@@ -4800,7 +5070,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
4800 status = decode_open(&xdr, res); 5070 status = decode_open(&xdr, res);
4801 if (status) 5071 if (status)
4802 goto out; 5072 goto out;
4803 decode_getfattr(&xdr, res->f_attr, res->server); 5073 decode_getfattr(&xdr, res->f_attr, res->server,
5074 !RPC_IS_ASYNC(rqstp->rq_task));
4804out: 5075out:
4805 return status; 5076 return status;
4806} 5077}
@@ -4827,7 +5098,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
4827 status = decode_setattr(&xdr); 5098 status = decode_setattr(&xdr);
4828 if (status) 5099 if (status)
4829 goto out; 5100 goto out;
4830 decode_getfattr(&xdr, res->fattr, res->server); 5101 decode_getfattr(&xdr, res->fattr, res->server,
5102 !RPC_IS_ASYNC(rqstp->rq_task));
4831out: 5103out:
4832 return status; 5104 return status;
4833} 5105}
@@ -5001,7 +5273,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
5001 status = decode_write(&xdr, res); 5273 status = decode_write(&xdr, res);
5002 if (status) 5274 if (status)
5003 goto out; 5275 goto out;
5004 decode_getfattr(&xdr, res->fattr, res->server); 5276 decode_getfattr(&xdr, res->fattr, res->server,
5277 !RPC_IS_ASYNC(rqstp->rq_task));
5005 if (!status) 5278 if (!status)
5006 status = res->count; 5279 status = res->count;
5007out: 5280out:
@@ -5030,7 +5303,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
5030 status = decode_commit(&xdr, res); 5303 status = decode_commit(&xdr, res);
5031 if (status) 5304 if (status)
5032 goto out; 5305 goto out;
5033 decode_getfattr(&xdr, res->fattr, res->server); 5306 decode_getfattr(&xdr, res->fattr, res->server,
5307 !RPC_IS_ASYNC(rqstp->rq_task));
5034out: 5308out:
5035 return status; 5309 return status;
5036} 5310}
@@ -5194,7 +5468,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5194 if (status != 0) 5468 if (status != 0)
5195 goto out; 5469 goto out;
5196 status = decode_delegreturn(&xdr); 5470 status = decode_delegreturn(&xdr);
5197 decode_getfattr(&xdr, res->fattr, res->server); 5471 decode_getfattr(&xdr, res->fattr, res->server,
5472 !RPC_IS_ASYNC(rqstp->rq_task));
5198out: 5473out:
5199 return status; 5474 return status;
5200} 5475}
@@ -5222,7 +5497,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
5222 goto out; 5497 goto out;
5223 xdr_enter_page(&xdr, PAGE_SIZE); 5498 xdr_enter_page(&xdr, PAGE_SIZE);
5224 status = decode_getfattr(&xdr, &res->fs_locations->fattr, 5499 status = decode_getfattr(&xdr, &res->fs_locations->fattr,
5225 res->fs_locations->server); 5500 res->fs_locations->server,
5501 !RPC_IS_ASYNC(req->rq_task));
5226out: 5502out:
5227 return status; 5503 return status;
5228} 5504}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 73ea5e8d66ce..12c9e66d3f1d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
60 return p; 60 return p;
61} 61}
62 62
63static void nfs_readdata_free(struct nfs_read_data *p) 63void nfs_readdata_free(struct nfs_read_data *p)
64{ 64{
65 if (p && (p->pagevec != &p->page_array[0])) 65 if (p && (p->pagevec != &p->page_array[0]))
66 kfree(p->pagevec); 66 kfree(p->pagevec);
67 mempool_free(p, nfs_rdata_mempool); 67 mempool_free(p, nfs_rdata_mempool);
68} 68}
69 69
70void nfs_readdata_release(void *data) 70static void nfs_readdata_release(struct nfs_read_data *rdata)
71{ 71{
72 struct nfs_read_data *rdata = data;
73
74 put_nfs_open_context(rdata->args.context); 72 put_nfs_open_context(rdata->args.context);
75 nfs_readdata_free(rdata); 73 nfs_readdata_free(rdata);
76} 74}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0b4cbdc60abd..de935692d40d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -73,7 +73,7 @@ enum {
73 Opt_cto, Opt_nocto, 73 Opt_cto, Opt_nocto,
74 Opt_ac, Opt_noac, 74 Opt_ac, Opt_noac,
75 Opt_lock, Opt_nolock, 75 Opt_lock, Opt_nolock,
76 Opt_v2, Opt_v3, 76 Opt_v2, Opt_v3, Opt_v4,
77 Opt_udp, Opt_tcp, Opt_rdma, 77 Opt_udp, Opt_tcp, Opt_rdma,
78 Opt_acl, Opt_noacl, 78 Opt_acl, Opt_noacl,
79 Opt_rdirplus, Opt_nordirplus, 79 Opt_rdirplus, Opt_nordirplus,
@@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = {
127 { Opt_nolock, "nolock" }, 127 { Opt_nolock, "nolock" },
128 { Opt_v2, "v2" }, 128 { Opt_v2, "v2" },
129 { Opt_v3, "v3" }, 129 { Opt_v3, "v3" },
130 { Opt_v4, "v4" },
130 { Opt_udp, "udp" }, 131 { Opt_udp, "udp" },
131 { Opt_tcp, "tcp" }, 132 { Opt_tcp, "tcp" },
132 { Opt_rdma, "rdma" }, 133 { Opt_rdma, "rdma" },
@@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = {
158 { Opt_mountvers, "mountvers=%s" }, 159 { Opt_mountvers, "mountvers=%s" },
159 { Opt_nfsvers, "nfsvers=%s" }, 160 { Opt_nfsvers, "nfsvers=%s" },
160 { Opt_nfsvers, "vers=%s" }, 161 { Opt_nfsvers, "vers=%s" },
161 { Opt_minorversion, "minorversion=%u" }, 162 { Opt_minorversion, "minorversion=%s" },
162 163
163 { Opt_sec, "sec=%s" }, 164 { Opt_sec, "sec=%s" },
164 { Opt_proto, "proto=%s" }, 165 { Opt_proto, "proto=%s" },
@@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = {
272}; 273};
273 274
274#ifdef CONFIG_NFS_V4 275#ifdef CONFIG_NFS_V4
276static int nfs4_validate_text_mount_data(void *options,
277 struct nfs_parsed_mount_data *args, const char *dev_name);
278static int nfs4_try_mount(int flags, const char *dev_name,
279 struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
275static int nfs4_get_sb(struct file_system_type *fs_type, 280static int nfs4_get_sb(struct file_system_type *fs_type,
276 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 281 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
277static int nfs4_remote_get_sb(struct file_system_type *fs_type, 282static int nfs4_remote_get_sb(struct file_system_type *fs_type,
@@ -742,127 +747,23 @@ static int nfs_verify_server_address(struct sockaddr *addr)
742 } 747 }
743 } 748 }
744 749
750 dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
745 return 0; 751 return 0;
746} 752}
747 753
748static void nfs_parse_ipv4_address(char *string, size_t str_len,
749 struct sockaddr *sap, size_t *addr_len)
750{
751 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
752 u8 *addr = (u8 *)&sin->sin_addr.s_addr;
753
754 if (str_len <= INET_ADDRSTRLEN) {
755 dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
756 (int)str_len, string);
757
758 sin->sin_family = AF_INET;
759 *addr_len = sizeof(*sin);
760 if (in4_pton(string, str_len, addr, '\0', NULL))
761 return;
762 }
763
764 sap->sa_family = AF_UNSPEC;
765 *addr_len = 0;
766}
767
768#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
769static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
770 const char *delim,
771 struct sockaddr_in6 *sin6)
772{
773 char *p;
774 size_t len;
775
776 if ((string + str_len) == delim)
777 return 1;
778
779 if (*delim != IPV6_SCOPE_DELIMITER)
780 return 0;
781
782 if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
783 return 0;
784
785 len = (string + str_len) - delim - 1;
786 p = kstrndup(delim + 1, len, GFP_KERNEL);
787 if (p) {
788 unsigned long scope_id = 0;
789 struct net_device *dev;
790
791 dev = dev_get_by_name(&init_net, p);
792 if (dev != NULL) {
793 scope_id = dev->ifindex;
794 dev_put(dev);
795 } else {
796 if (strict_strtoul(p, 10, &scope_id) == 0) {
797 kfree(p);
798 return 0;
799 }
800 }
801
802 kfree(p);
803
804 sin6->sin6_scope_id = scope_id;
805 dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
806 return 1;
807 }
808
809 return 0;
810}
811
812static void nfs_parse_ipv6_address(char *string, size_t str_len,
813 struct sockaddr *sap, size_t *addr_len)
814{
815 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
816 u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
817 const char *delim;
818
819 if (str_len <= INET6_ADDRSTRLEN) {
820 dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
821 (int)str_len, string);
822
823 sin6->sin6_family = AF_INET6;
824 *addr_len = sizeof(*sin6);
825 if (in6_pton(string, str_len, addr,
826 IPV6_SCOPE_DELIMITER, &delim) != 0) {
827 if (nfs_parse_ipv6_scope_id(string, str_len,
828 delim, sin6) != 0)
829 return;
830 }
831 }
832
833 sap->sa_family = AF_UNSPEC;
834 *addr_len = 0;
835}
836#else
837static void nfs_parse_ipv6_address(char *string, size_t str_len,
838 struct sockaddr *sap, size_t *addr_len)
839{
840 sap->sa_family = AF_UNSPEC;
841 *addr_len = 0;
842}
843#endif
844
845/* 754/*
846 * Construct a sockaddr based on the contents of a string that contains 755 * Select between a default port value and a user-specified port value.
847 * an IP address in presentation format. 756 * If a zero value is set, then autobind will be used.
848 *
849 * If there is a problem constructing the new sockaddr, set the address
850 * family to AF_UNSPEC.
851 */ 757 */
852void nfs_parse_ip_address(char *string, size_t str_len, 758static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port,
853 struct sockaddr *sap, size_t *addr_len) 759 const unsigned short default_port)
854{ 760{
855 unsigned int i, colons; 761 unsigned short port = default_port;
856 762
857 colons = 0; 763 if (parsed_port != NFS_UNSPEC_PORT)
858 for (i = 0; i < str_len; i++) 764 port = parsed_port;
859 if (string[i] == ':')
860 colons++;
861 765
862 if (colons >= 2) 766 rpc_set_port(sap, port);
863 nfs_parse_ipv6_address(string, str_len, sap, addr_len);
864 else
865 nfs_parse_ipv4_address(string, str_len, sap, addr_len);
866} 767}
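
nfs_set_default_port() centralizes the port fallback on top of rpc_set_port(). The two policies visible later in this diff are port 0 (rpcbind autobind) for the MNT service and NFS_PORT (2049) for NFSv4 mounts:

	nfs_set_default_port(request.sap, args->mount_server.port, 0);	/* autobind */
	nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);	/* 2049 default */
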
867 768
868/* 769/*
@@ -904,8 +805,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
904 805
905/* 806/*
906 * Parse the value of the 'sec=' option. 807 * Parse the value of the 'sec=' option.
907 *
908 * The flavor_len setting is for v4 mounts.
909 */ 808 */
910static int nfs_parse_security_flavors(char *value, 809static int nfs_parse_security_flavors(char *value,
911 struct nfs_parsed_mount_data *mnt) 810 struct nfs_parsed_mount_data *mnt)
@@ -916,53 +815,43 @@ static int nfs_parse_security_flavors(char *value,
916 815
917 switch (match_token(value, nfs_secflavor_tokens, args)) { 816 switch (match_token(value, nfs_secflavor_tokens, args)) {
918 case Opt_sec_none: 817 case Opt_sec_none:
919 mnt->auth_flavor_len = 0;
920 mnt->auth_flavors[0] = RPC_AUTH_NULL; 818 mnt->auth_flavors[0] = RPC_AUTH_NULL;
921 break; 819 break;
922 case Opt_sec_sys: 820 case Opt_sec_sys:
923 mnt->auth_flavor_len = 0;
924 mnt->auth_flavors[0] = RPC_AUTH_UNIX; 821 mnt->auth_flavors[0] = RPC_AUTH_UNIX;
925 break; 822 break;
926 case Opt_sec_krb5: 823 case Opt_sec_krb5:
927 mnt->auth_flavor_len = 1;
928 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; 824 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
929 break; 825 break;
930 case Opt_sec_krb5i: 826 case Opt_sec_krb5i:
931 mnt->auth_flavor_len = 1;
932 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; 827 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
933 break; 828 break;
934 case Opt_sec_krb5p: 829 case Opt_sec_krb5p:
935 mnt->auth_flavor_len = 1;
936 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; 830 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
937 break; 831 break;
938 case Opt_sec_lkey: 832 case Opt_sec_lkey:
939 mnt->auth_flavor_len = 1;
940 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; 833 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
941 break; 834 break;
942 case Opt_sec_lkeyi: 835 case Opt_sec_lkeyi:
943 mnt->auth_flavor_len = 1;
944 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; 836 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
945 break; 837 break;
946 case Opt_sec_lkeyp: 838 case Opt_sec_lkeyp:
947 mnt->auth_flavor_len = 1;
948 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; 839 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
949 break; 840 break;
950 case Opt_sec_spkm: 841 case Opt_sec_spkm:
951 mnt->auth_flavor_len = 1;
952 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; 842 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
953 break; 843 break;
954 case Opt_sec_spkmi: 844 case Opt_sec_spkmi:
955 mnt->auth_flavor_len = 1;
956 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; 845 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
957 break; 846 break;
958 case Opt_sec_spkmp: 847 case Opt_sec_spkmp:
959 mnt->auth_flavor_len = 1;
960 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; 848 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
961 break; 849 break;
962 default: 850 default:
963 return 0; 851 return 0;
964 } 852 }
965 853
854 mnt->auth_flavor_len = 1;
966 return 1; 855 return 1;
967} 856}
968 857
@@ -1001,7 +890,6 @@ static int nfs_parse_mount_options(char *raw,
1001 while ((p = strsep(&raw, ",")) != NULL) { 890 while ((p = strsep(&raw, ",")) != NULL) {
1002 substring_t args[MAX_OPT_ARGS]; 891 substring_t args[MAX_OPT_ARGS];
1003 unsigned long option; 892 unsigned long option;
1004 int int_option;
1005 int token; 893 int token;
1006 894
1007 if (!*p) 895 if (!*p)
@@ -1047,10 +935,18 @@ static int nfs_parse_mount_options(char *raw,
1047 break; 935 break;
1048 case Opt_v2: 936 case Opt_v2:
1049 mnt->flags &= ~NFS_MOUNT_VER3; 937 mnt->flags &= ~NFS_MOUNT_VER3;
938 mnt->version = 2;
1050 break; 939 break;
1051 case Opt_v3: 940 case Opt_v3:
1052 mnt->flags |= NFS_MOUNT_VER3; 941 mnt->flags |= NFS_MOUNT_VER3;
942 mnt->version = 3;
1053 break; 943 break;
944#ifdef CONFIG_NFS_V4
945 case Opt_v4:
946 mnt->flags &= ~NFS_MOUNT_VER3;
947 mnt->version = 4;
948 break;
949#endif
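/*
 * Taken together, the three version keywords now keep the legacy
 * NFS_MOUNT_VER3 flag and the new mnt->version field in step:
 *   v2  clears NFS_MOUNT_VER3, mnt->version = 2
 *   v3  sets   NFS_MOUNT_VER3, mnt->version = 3
 *   v4  clears NFS_MOUNT_VER3, mnt->version = 4 (CONFIG_NFS_V4 only)
 */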
1054 case Opt_udp: 950 case Opt_udp:
1055 mnt->flags &= ~NFS_MOUNT_TCP; 951 mnt->flags &= ~NFS_MOUNT_TCP;
1056 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 952 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1264,20 +1160,33 @@ static int nfs_parse_mount_options(char *raw,
1264 switch (option) { 1160 switch (option) {
1265 case NFS2_VERSION: 1161 case NFS2_VERSION:
1266 mnt->flags &= ~NFS_MOUNT_VER3; 1162 mnt->flags &= ~NFS_MOUNT_VER3;
1163 mnt->version = 2;
1267 break; 1164 break;
1268 case NFS3_VERSION: 1165 case NFS3_VERSION:
1269 mnt->flags |= NFS_MOUNT_VER3; 1166 mnt->flags |= NFS_MOUNT_VER3;
1167 mnt->version = 3;
1270 break; 1168 break;
1169#ifdef CONFIG_NFS_V4
1170 case NFS4_VERSION:
1171 mnt->flags &= ~NFS_MOUNT_VER3;
1172 mnt->version = 4;
1173 break;
1174#endif
1271 default: 1175 default:
1272 goto out_invalid_value; 1176 goto out_invalid_value;
1273 } 1177 }
1274 break; 1178 break;
1275 case Opt_minorversion: 1179 case Opt_minorversion:
1276 if (match_int(args, &int_option)) 1180 string = match_strdup(args);
1277 return 0; 1181 if (string == NULL)
1278 if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION) 1182 goto out_nomem;
1279 return 0; 1183 rc = strict_strtoul(string, 10, &option);
1280 mnt->minorversion = int_option; 1184 kfree(string);
1185 if (rc != 0)
1186 goto out_invalid_value;
1187 if (option > NFS4_MAX_MINOR_VERSION)
1188 goto out_invalid_value;
1189 mnt->minorversion = option;
1281 break; 1190 break;
1282 1191
1283 /* 1192 /*
@@ -1352,11 +1261,14 @@ static int nfs_parse_mount_options(char *raw,
1352 string = match_strdup(args); 1261 string = match_strdup(args);
1353 if (string == NULL) 1262 if (string == NULL)
1354 goto out_nomem; 1263 goto out_nomem;
1355 nfs_parse_ip_address(string, strlen(string), 1264 mnt->nfs_server.addrlen =
1356 (struct sockaddr *) 1265 rpc_pton(string, strlen(string),
1357 &mnt->nfs_server.address, 1266 (struct sockaddr *)
1358 &mnt->nfs_server.addrlen); 1267 &mnt->nfs_server.address,
1268 sizeof(mnt->nfs_server.address));
1359 kfree(string); 1269 kfree(string);
1270 if (mnt->nfs_server.addrlen == 0)
1271 goto out_invalid_address;
1360 break; 1272 break;
1361 case Opt_clientaddr: 1273 case Opt_clientaddr:
1362 string = match_strdup(args); 1274 string = match_strdup(args);
@@ -1376,11 +1288,14 @@ static int nfs_parse_mount_options(char *raw,
1376 string = match_strdup(args); 1288 string = match_strdup(args);
1377 if (string == NULL) 1289 if (string == NULL)
1378 goto out_nomem; 1290 goto out_nomem;
1379 nfs_parse_ip_address(string, strlen(string), 1291 mnt->mount_server.addrlen =
1380 (struct sockaddr *) 1292 rpc_pton(string, strlen(string),
1381 &mnt->mount_server.address, 1293 (struct sockaddr *)
1382 &mnt->mount_server.addrlen); 1294 &mnt->mount_server.address,
1295 sizeof(mnt->mount_server.address));
1383 kfree(string); 1296 kfree(string);
1297 if (mnt->mount_server.addrlen == 0)
1298 goto out_invalid_address;
1384 break; 1299 break;
1385 case Opt_lookupcache: 1300 case Opt_lookupcache:
1386 string = match_strdup(args); 1301 string = match_strdup(args);
@@ -1432,8 +1347,11 @@ static int nfs_parse_mount_options(char *raw,
1432 1347
1433 return 1; 1348 return 1;
1434 1349
1350out_invalid_address:
1351 printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
1352 return 0;
1435out_invalid_value: 1353out_invalid_value:
1436 printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p); 1354 printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
1437 return 0; 1355 return 0;
1438out_nomem: 1356out_nomem:
1439 printk(KERN_INFO "NFS: not enough memory to parse option\n"); 1357 printk(KERN_INFO "NFS: not enough memory to parse option\n");
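
The Opt_addr and Opt_mountaddr cases above replace the removed nfs_parse_ip_address()/nfs_parse_ipv4_address()/nfs_parse_ipv6_address() trio with the shared sunrpc helper rpc_pton(). Its definition is not in this diff; from the call sites its contract is presumably:

	/* Presumed contract: parse an IPv4/IPv6 presentation address (IPv6 may
	 * carry a %scope suffix) into *sap, returning the sockaddr length used,
	 * or 0 if the string is not a valid address. */
	size_t rpc_pton(const char *string, size_t len,
			struct sockaddr *sap, size_t salen);
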
@@ -1445,13 +1363,60 @@ out_security_failure:
1445} 1363}
1446 1364
1447/* 1365/*
1366 * Match the requested auth flavors with the list returned by
1367 * the server. Returns zero and sets the mount's authentication
1368 * flavor on success; returns -EACCES if server does not support
1369 * the requested flavor.
1370 */
1371static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
1372 struct nfs_mount_request *request)
1373{
1374 unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
1375
1376 /*
1377 * Certain releases of Linux's mountd return an empty
1378 * flavor list. To prevent behavioral regression with
1379 * these servers (i.e., rejecting mounts that used to
1380 * succeed), revert to pre-2.6.32 behavior (no checking)
1381 * if the returned flavor list is empty.
1382 */
1383 if (server_authlist_len == 0)
1384 return 0;
1385
1386 /*
1387 * We avoid sophisticated negotiating here, as there are
1388 * plenty of cases where we can get it wrong, providing
1389 * either too little or too much security.
1390 *
1391 * RFC 2623, section 2.7 suggests we SHOULD prefer the
1392 * flavor listed first. However, some servers list
1393 * AUTH_NULL first. Our caller plants AUTH_SYS, the
1394 * preferred default, in args->auth_flavors[0] if the user
1395 * didn't specify a sec= mount option.
1396 */
1397 for (i = 0; i < args->auth_flavor_len; i++)
1398 for (j = 0; j < server_authlist_len; j++)
1399 if (args->auth_flavors[i] == request->auth_flavs[j]) {
1400 dfprintk(MOUNT, "NFS: using auth flavor %d\n",
1401 request->auth_flavs[j]);
1402 args->auth_flavors[0] = request->auth_flavs[j];
1403 return 0;
1404 }
1405
1406 dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n");
1407 nfs_umount(request);
1408 return -EACCES;
1409}
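
A worked example of the double loop: with no sec= option the caller plants RPC_AUTH_UNIX in args->auth_flavors[0], so even a server that lists AUTH_NULL first still negotiates AUTH_UNIX:

	/*
	 * args->auth_flavors  = { RPC_AUTH_UNIX }           (caller's default)
	 * request->auth_flavs = { AUTH_NULL, AUTH_UNIX, ... }  (server's list)
	 *   i = 0, j = 0: UNIX != NULL
	 *   i = 0, j = 1: UNIX == UNIX  ->  use RPC_AUTH_UNIX, return 0
	 */
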
1410
1411/*
1448 * Use the remote server's MOUNT service to request the NFS file handle 1412 * Use the remote server's MOUNT service to request the NFS file handle
1449 * corresponding to the provided path. 1413 * corresponding to the provided path.
1450 */ 1414 */
1451static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1415static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1452 struct nfs_fh *root_fh) 1416 struct nfs_fh *root_fh)
1453{ 1417{
1454 unsigned int auth_flavor_len = 0; 1418 rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
1419 unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
1455 struct nfs_mount_request request = { 1420 struct nfs_mount_request request = {
1456 .sap = (struct sockaddr *) 1421 .sap = (struct sockaddr *)
1457 &args->mount_server.address, 1422 &args->mount_server.address,
@@ -1459,7 +1424,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1459 .protocol = args->mount_server.protocol, 1424 .protocol = args->mount_server.protocol,
1460 .fh = root_fh, 1425 .fh = root_fh,
1461 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1426 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1462 .auth_flav_len = &auth_flavor_len, 1427 .auth_flav_len = &server_authlist_len,
1428 .auth_flavs = server_authlist,
1463 }; 1429 };
1464 int status; 1430 int status;
1465 1431
@@ -1485,23 +1451,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1485 args->mount_server.addrlen = args->nfs_server.addrlen; 1451 args->mount_server.addrlen = args->nfs_server.addrlen;
1486 } 1452 }
1487 request.salen = args->mount_server.addrlen; 1453 request.salen = args->mount_server.addrlen;
1488 1454 nfs_set_default_port(request.sap, args->mount_server.port, 0);
1489 /*
1490 * autobind will be used if mount_server.port == 0
1491 */
1492 nfs_set_port(request.sap, args->mount_server.port);
1493 1455
1494 /* 1456 /*
1495 * Now ask the mount server to map our export path 1457 * Now ask the mount server to map our export path
1496 * to a file handle. 1458 * to a file handle.
1497 */ 1459 */
1498 status = nfs_mount(&request); 1460 status = nfs_mount(&request);
1499 if (status == 0) 1461 if (status != 0) {
1500 return 0; 1462 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
1463 request.hostname, status);
1464 return status;
1465 }
1501 1466
1502 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", 1467 /*
1503 request.hostname, status); 1468 * MNTv1 (NFSv2) does not support auth flavor negotiation.
1504 return status; 1469 */
1470 if (args->mount_server.version != NFS_MNT3_VERSION)
1471 return 0;
1472 return nfs_walk_authlist(args, &request);
1505} 1473}
1506 1474
1507static int nfs_parse_simple_hostname(const char *dev_name, 1475static int nfs_parse_simple_hostname(const char *dev_name,
@@ -1661,6 +1629,7 @@ static int nfs_validate_mount_data(void *options,
1661 const char *dev_name) 1629 const char *dev_name)
1662{ 1630{
1663 struct nfs_mount_data *data = (struct nfs_mount_data *)options; 1631 struct nfs_mount_data *data = (struct nfs_mount_data *)options;
1632 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
1664 1633
1665 if (data == NULL) 1634 if (data == NULL)
1666 goto out_no_data; 1635 goto out_no_data;
@@ -1672,10 +1641,12 @@ static int nfs_validate_mount_data(void *options,
1672 args->acregmax = NFS_DEF_ACREGMAX; 1641 args->acregmax = NFS_DEF_ACREGMAX;
1673 args->acdirmin = NFS_DEF_ACDIRMIN; 1642 args->acdirmin = NFS_DEF_ACDIRMIN;
1674 args->acdirmax = NFS_DEF_ACDIRMAX; 1643 args->acdirmax = NFS_DEF_ACDIRMAX;
1675 args->mount_server.port = 0; /* autobind unless user sets port */ 1644 args->mount_server.port = NFS_UNSPEC_PORT;
1676 args->nfs_server.port = 0; /* autobind unless user sets port */ 1645 args->nfs_server.port = NFS_UNSPEC_PORT;
1677 args->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1646 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1678 args->auth_flavors[0] = RPC_AUTH_UNIX; 1647 args->auth_flavors[0] = RPC_AUTH_UNIX;
1648 args->auth_flavor_len = 1;
1649 args->minorversion = 0;
1679 1650
1680 switch (data->version) { 1651 switch (data->version) {
1681 case 1: 1652 case 1:
@@ -1697,8 +1668,11 @@ static int nfs_validate_mount_data(void *options,
1697 if (data->root.size > NFS3_FHSIZE || data->root.size == 0) 1668 if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
1698 goto out_invalid_fh; 1669 goto out_invalid_fh;
1699 mntfh->size = data->root.size; 1670 mntfh->size = data->root.size;
1700 } else 1671 args->version = 3;
1672 } else {
1701 mntfh->size = NFS2_FHSIZE; 1673 mntfh->size = NFS2_FHSIZE;
1674 args->version = 2;
1675 }
1702 1676
1703 1677
1704 memcpy(mntfh->data, data->root.data, mntfh->size); 1678 memcpy(mntfh->data, data->root.data, mntfh->size);
@@ -1720,11 +1694,9 @@ static int nfs_validate_mount_data(void *options,
1720 args->acdirmin = data->acdirmin; 1694 args->acdirmin = data->acdirmin;
1721 args->acdirmax = data->acdirmax; 1695 args->acdirmax = data->acdirmax;
1722 1696
1723 memcpy(&args->nfs_server.address, &data->addr, 1697 memcpy(sap, &data->addr, sizeof(data->addr));
1724 sizeof(data->addr));
1725 args->nfs_server.addrlen = sizeof(data->addr); 1698 args->nfs_server.addrlen = sizeof(data->addr);
1726 if (!nfs_verify_server_address((struct sockaddr *) 1699 if (!nfs_verify_server_address(sap))
1727 &args->nfs_server.address))
1728 goto out_no_address; 1700 goto out_no_address;
1729 1701
1730 if (!(data->flags & NFS_MOUNT_TCP)) 1702 if (!(data->flags & NFS_MOUNT_TCP))
@@ -1772,12 +1744,18 @@ static int nfs_validate_mount_data(void *options,
1772 if (nfs_parse_mount_options((char *)options, args) == 0) 1744 if (nfs_parse_mount_options((char *)options, args) == 0)
1773 return -EINVAL; 1745 return -EINVAL;
1774 1746
1775 if (!nfs_verify_server_address((struct sockaddr *) 1747 if (!nfs_verify_server_address(sap))
1776 &args->nfs_server.address))
1777 goto out_no_address; 1748 goto out_no_address;
1778 1749
1779 nfs_set_port((struct sockaddr *)&args->nfs_server.address, 1750 if (args->version == 4)
1780 args->nfs_server.port); 1751#ifdef CONFIG_NFS_V4
1752 return nfs4_validate_text_mount_data(options,
1753 args, dev_name);
1754#else
1755 goto out_v4_not_compiled;
1756#endif
1757
1758 nfs_set_default_port(sap, args->nfs_server.port, 0);
1781 1759
1782 nfs_set_mount_transport_protocol(args); 1760 nfs_set_mount_transport_protocol(args);
1783 1761
@@ -1825,6 +1803,12 @@ out_v3_not_compiled:
1825 return -EPROTONOSUPPORT; 1803 return -EPROTONOSUPPORT;
1826#endif /* !CONFIG_NFS_V3 */ 1804#endif /* !CONFIG_NFS_V3 */
1827 1805
1806#ifndef CONFIG_NFS_V4
1807out_v4_not_compiled:
1808 dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
1809 return -EPROTONOSUPPORT;
1810#endif /* !CONFIG_NFS_V4 */
1811
1828out_nomem: 1812out_nomem:
1829 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); 1813 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
1830 return -ENOMEM; 1814 return -ENOMEM;
@@ -1934,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
1934 if (server->flags & NFS_MOUNT_NOAC) 1918 if (server->flags & NFS_MOUNT_NOAC)
1935 sb->s_flags |= MS_SYNCHRONOUS; 1919 sb->s_flags |= MS_SYNCHRONOUS;
1936 1920
1921 sb->s_bdi = &server->backing_dev_info;
1922
1937 nfs_super_set_maxbytes(sb, server->maxfilesize); 1923 nfs_super_set_maxbytes(sb, server->maxfilesize);
1938} 1924}
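
Setting sb->s_bdi here hooks NFS superblocks into per-bdi writeback: the flusher threads key their work on a backing_dev_info, and NFS embeds one per server. A note on the linkage (the surrounding definitions are outside this hunk, so this is an assumption):

	/* inode->i_sb->s_bdi now resolves to &server->backing_dev_info,
	 * which is the bdi the per-bdi flusher threads operate on. */
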
1939 1925
@@ -2120,6 +2106,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2120 if (error < 0) 2106 if (error < 0)
2121 goto out; 2107 goto out;
2122 2108
2109#ifdef CONFIG_NFS_V4
2110 if (data->version == 4) {
2111 error = nfs4_try_mount(flags, dev_name, data, mnt);
2112 kfree(data->client_address);
2113 goto out;
2114 }
2115#endif /* CONFIG_NFS_V4 */
2116
2123 /* Get a volume representation */ 2117 /* Get a volume representation */
2124 server = nfs_create_server(data, mntfh); 2118 server = nfs_create_server(data, mntfh);
2125 if (IS_ERR(server)) { 2119 if (IS_ERR(server)) {
@@ -2317,6 +2311,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
2317 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); 2311 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
2318} 2312}
2319 2313
2314static int nfs4_validate_text_mount_data(void *options,
2315 struct nfs_parsed_mount_data *args,
2316 const char *dev_name)
2317{
2318 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
2319
2320 nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
2321
2322 nfs_validate_transport_protocol(args);
2323
2324 nfs4_validate_mount_flags(args);
2325
2326 if (args->version != 4) {
2327 dfprintk(MOUNT,
2328 "NFS4: Illegal mount version\n");
2329 return -EINVAL;
2330 }
2331
2332 if (args->auth_flavor_len > 1) {
2333 dfprintk(MOUNT,
2334 "NFS4: Too many RPC auth flavours specified\n");
2335 return -EINVAL;
2336 }
2337
2338 if (args->client_address == NULL) {
2339 dfprintk(MOUNT,
2340 "NFS4: mount program didn't pass callback address\n");
2341 return -EINVAL;
2342 }
2343
2344 return nfs_parse_devname(dev_name,
2345 &args->nfs_server.hostname,
2346 NFS4_MAXNAMLEN,
2347 &args->nfs_server.export_path,
2348 NFS4_MAXPATHLEN);
2349}
2350
2320/* 2351/*
2321 * Validate NFSv4 mount options 2352 * Validate NFSv4 mount options
2322 */ 2353 */
@@ -2324,7 +2355,7 @@ static int nfs4_validate_mount_data(void *options,
2324 struct nfs_parsed_mount_data *args, 2355 struct nfs_parsed_mount_data *args,
2325 const char *dev_name) 2356 const char *dev_name)
2326{ 2357{
2327 struct sockaddr_in *ap; 2358 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
2328 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; 2359 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
2329 char *c; 2360 char *c;
2330 2361
@@ -2337,23 +2368,22 @@ static int nfs4_validate_mount_data(void *options,
2337 args->acregmax = NFS_DEF_ACREGMAX; 2368 args->acregmax = NFS_DEF_ACREGMAX;
2338 args->acdirmin = NFS_DEF_ACDIRMIN; 2369 args->acdirmin = NFS_DEF_ACDIRMIN;
2339 args->acdirmax = NFS_DEF_ACDIRMAX; 2370 args->acdirmax = NFS_DEF_ACDIRMAX;
2340 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ 2371 args->nfs_server.port = NFS_UNSPEC_PORT;
2341 args->auth_flavors[0] = RPC_AUTH_UNIX; 2372 args->auth_flavors[0] = RPC_AUTH_UNIX;
2342 args->auth_flavor_len = 0; 2373 args->auth_flavor_len = 1;
2374 args->version = 4;
2343 args->minorversion = 0; 2375 args->minorversion = 0;
2344 2376
2345 switch (data->version) { 2377 switch (data->version) {
2346 case 1: 2378 case 1:
2347 ap = (struct sockaddr_in *)&args->nfs_server.address;
2348 if (data->host_addrlen > sizeof(args->nfs_server.address)) 2379 if (data->host_addrlen > sizeof(args->nfs_server.address))
2349 goto out_no_address; 2380 goto out_no_address;
2350 if (data->host_addrlen == 0) 2381 if (data->host_addrlen == 0)
2351 goto out_no_address; 2382 goto out_no_address;
2352 args->nfs_server.addrlen = data->host_addrlen; 2383 args->nfs_server.addrlen = data->host_addrlen;
2353 if (copy_from_user(ap, data->host_addr, data->host_addrlen)) 2384 if (copy_from_user(sap, data->host_addr, data->host_addrlen))
2354 return -EFAULT; 2385 return -EFAULT;
2355 if (!nfs_verify_server_address((struct sockaddr *) 2386 if (!nfs_verify_server_address(sap))
2356 &args->nfs_server.address))
2357 goto out_no_address; 2387 goto out_no_address;
2358 2388
2359 if (data->auth_flavourlen) { 2389 if (data->auth_flavourlen) {
@@ -2399,39 +2429,14 @@ static int nfs4_validate_mount_data(void *options,
2399 nfs_validate_transport_protocol(args); 2429 nfs_validate_transport_protocol(args);
2400 2430
2401 break; 2431 break;
2402 default: { 2432 default:
2403 int status;
2404
2405 if (nfs_parse_mount_options((char *)options, args) == 0) 2433 if (nfs_parse_mount_options((char *)options, args) == 0)
2406 return -EINVAL; 2434 return -EINVAL;
2407 2435
2408 if (!nfs_verify_server_address((struct sockaddr *) 2436 if (!nfs_verify_server_address(sap))
2409 &args->nfs_server.address))
2410 return -EINVAL; 2437 return -EINVAL;
2411 2438
2412 nfs_set_port((struct sockaddr *)&args->nfs_server.address, 2439 return nfs4_validate_text_mount_data(options, args, dev_name);
2413 args->nfs_server.port);
2414
2415 nfs_validate_transport_protocol(args);
2416
2417 nfs4_validate_mount_flags(args);
2418
2419 if (args->auth_flavor_len > 1)
2420 goto out_inval_auth;
2421
2422 if (args->client_address == NULL)
2423 goto out_no_client_address;
2424
2425 status = nfs_parse_devname(dev_name,
2426 &args->nfs_server.hostname,
2427 NFS4_MAXNAMLEN,
2428 &args->nfs_server.export_path,
2429 NFS4_MAXPATHLEN);
2430 if (status < 0)
2431 return status;
2432
2433 break;
2434 }
2435 } 2440 }
2436 2441
2437 return 0; 2442 return 0;
@@ -2448,10 +2453,6 @@ out_inval_auth:
2448out_no_address: 2453out_no_address:
2449 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); 2454 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
2450 return -EINVAL; 2455 return -EINVAL;
2451
2452out_no_client_address:
2453 dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n");
2454 return -EINVAL;
2455} 2456}
2456 2457
2457/* 2458/*
@@ -2618,6 +2619,34 @@ out_err:
2618 return ret; 2619 return ret;
2619} 2620}
2620 2621
2622static int nfs4_try_mount(int flags, const char *dev_name,
2623 struct nfs_parsed_mount_data *data,
2624 struct vfsmount *mnt)
2625{
2626 char *export_path;
2627 struct vfsmount *root_mnt;
2628 int error;
2629
2630 dfprintk(MOUNT, "--> nfs4_try_mount()\n");
2631
2632 export_path = data->nfs_server.export_path;
2633 data->nfs_server.export_path = "/";
2634 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
2635 data->nfs_server.hostname);
2636 data->nfs_server.export_path = export_path;
2637
2638 error = PTR_ERR(root_mnt);
2639 if (IS_ERR(root_mnt))
2640 goto out;
2641
2642 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2643
2644out:
2645 dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error,
2646 error != 0 ? " [error]" : "");
2647 return error;
2648}
2649
2621/* 2650/*
2622 * Get the superblock for an NFS4 mountpoint 2651 * Get the superblock for an NFS4 mountpoint
2623 */ 2652 */
@@ -2625,8 +2654,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2625 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2654 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
2626{ 2655{
2627 struct nfs_parsed_mount_data *data; 2656 struct nfs_parsed_mount_data *data;
2628 char *export_path;
2629 struct vfsmount *root_mnt;
2630 int error = -ENOMEM; 2657 int error = -ENOMEM;
2631 2658
2632 data = kzalloc(sizeof(*data), GFP_KERNEL); 2659 data = kzalloc(sizeof(*data), GFP_KERNEL);
@@ -2638,17 +2665,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2638 if (error < 0) 2665 if (error < 0)
2639 goto out; 2666 goto out;
2640 2667
2641 export_path = data->nfs_server.export_path; 2668 error = nfs4_try_mount(flags, dev_name, data, mnt);
2642 data->nfs_server.export_path = "/";
2643 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
2644 data->nfs_server.hostname);
2645 data->nfs_server.export_path = export_path;
2646
2647 error = PTR_ERR(root_mnt);
2648 if (IS_ERR(root_mnt))
2649 goto out;
2650
2651 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2652 2669
2653out: 2670out:
2654 kfree(data->client_address); 2671 kfree(data->client_address);
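
[Annotation] Taken together, the hunks above move the root-mount-then-follow sequence out of nfs4_get_sb() into the new nfs4_try_mount() helper. The load-bearing detail is the save/substitute/restore dance on export_path: the server root is mounted at "/" first, then the caller's export path is walked. A compilable userspace sketch of that shape, with hypothetical stand-ins for the kernel functions:

#include <stdio.h>

struct mount_data {
    const char *export_path;
    const char *hostname;
};

/* Stand-in for nfs_do_root_mount(): mounts whatever path is set. */
static int do_root_mount(const struct mount_data *data)
{
    printf("mounting %s:%s\n", data->hostname, data->export_path);
    return 0;
}

/* The extracted-helper shape: temporarily point export_path at "/",
 * mount the server root, restore the caller's path, then walk to it. */
static int try_mount(struct mount_data *data)
{
    const char *saved = data->export_path;
    int err;

    data->export_path = "/";
    err = do_root_mount(data);
    data->export_path = saved; /* restore before any other use */
    if (err)
        return err;
    printf("following remote path %s\n", saved);
    return 0;
}

int main(void)
{
    struct mount_data d = { "/export/home", "server" };
    return try_mount(&d);
}
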
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0a0a2ff767c3..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/migrate.h>
16 17
17#include <linux/sunrpc/clnt.h> 18#include <linux/sunrpc/clnt.h>
18#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
@@ -26,6 +27,7 @@
26#include "internal.h" 27#include "internal.h"
27#include "iostat.h" 28#include "iostat.h"
28#include "nfs4_fs.h" 29#include "nfs4_fs.h"
30#include "fscache.h"
29 31
30#define NFSDBG_FACILITY NFSDBG_PAGECACHE 32#define NFSDBG_FACILITY NFSDBG_PAGECACHE
31 33
@@ -87,17 +89,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
87 return p; 89 return p;
88} 90}
89 91
90static void nfs_writedata_free(struct nfs_write_data *p) 92void nfs_writedata_free(struct nfs_write_data *p)
91{ 93{
92 if (p && (p->pagevec != &p->page_array[0])) 94 if (p && (p->pagevec != &p->page_array[0]))
93 kfree(p->pagevec); 95 kfree(p->pagevec);
94 mempool_free(p, nfs_wdata_mempool); 96 mempool_free(p, nfs_wdata_mempool);
95} 97}
96 98
97void nfs_writedata_release(void *data) 99static void nfs_writedata_release(struct nfs_write_data *wdata)
98{ 100{
99 struct nfs_write_data *wdata = data;
100
101 put_nfs_open_context(wdata->args.context); 101 put_nfs_open_context(wdata->args.context);
102 nfs_writedata_free(wdata); 102 nfs_writedata_free(wdata);
103} 103}
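
[Annotation] This hunk swaps the linkage of the alloc/free pair: nfs_writedata_free() becomes the shared symbol while nfs_writedata_release() turns static and takes a typed struct nfs_write_data * instead of void *, so the compiler checks callers rather than a cast hiding them. A tiny sketch of the same typed-callback cleanup (names illustrative):

#include <stdlib.h>

struct write_data { char *pagevec; };

/* Shared worker: anyone may free a write_data. */
void write_data_free(struct write_data *p)
{
    if (p) {
        free(p->pagevec);
        free(p);
    }
}

/* File-local release hook now takes the real type, so every caller
 * is type-checked instead of funnelled through a void * cast. */
static void write_data_release(struct write_data *wdata)
{
    /* ...drop whatever context the request pinned... */
    write_data_free(wdata);
}

int main(void)
{
    struct write_data *p = calloc(1, sizeof(*p));
    write_data_release(p);
    return 0;
}
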
@@ -220,24 +220,17 @@ static void nfs_end_page_writeback(struct page *page)
220 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 220 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
221} 221}
222 222
223/* 223static struct nfs_page *nfs_find_and_lock_request(struct page *page)
224 * Find an associated nfs write request, and prepare to flush it out
225 * May return an error if the user signalled nfs_wait_on_request().
226 */
227static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
228 struct page *page)
229{ 224{
230 struct inode *inode = page->mapping->host; 225 struct inode *inode = page->mapping->host;
231 struct nfs_page *req; 226 struct nfs_page *req;
232 int ret; 227 int ret;
233 228
234 spin_lock(&inode->i_lock); 229 spin_lock(&inode->i_lock);
235 for(;;) { 230 for (;;) {
236 req = nfs_page_find_request_locked(page); 231 req = nfs_page_find_request_locked(page);
237 if (req == NULL) { 232 if (req == NULL)
238 spin_unlock(&inode->i_lock); 233 break;
239 return 0;
240 }
241 if (nfs_set_page_tag_locked(req)) 234 if (nfs_set_page_tag_locked(req))
242 break; 235 break;
243 /* Note: If we hold the page lock, as is the case in nfs_writepage, 236 /* Note: If we hold the page lock, as is the case in nfs_writepage,
@@ -249,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
249 ret = nfs_wait_on_request(req); 242 ret = nfs_wait_on_request(req);
250 nfs_release_request(req); 243 nfs_release_request(req);
251 if (ret != 0) 244 if (ret != 0)
252 return ret; 245 return ERR_PTR(ret);
253 spin_lock(&inode->i_lock); 246 spin_lock(&inode->i_lock);
254 } 247 }
255 if (test_bit(PG_CLEAN, &req->wb_flags)) {
256 spin_unlock(&inode->i_lock);
257 BUG();
258 }
259 if (nfs_set_page_writeback(page) != 0) {
260 spin_unlock(&inode->i_lock);
261 BUG();
262 }
263 spin_unlock(&inode->i_lock); 248 spin_unlock(&inode->i_lock);
249 return req;
250}
251
252/*
253 * Find an associated nfs write request, and prepare to flush it out
254 * May return an error if the user signalled nfs_wait_on_request().
255 */
256static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
257 struct page *page)
258{
259 struct nfs_page *req;
260 int ret = 0;
261
262 req = nfs_find_and_lock_request(page);
263 if (!req)
264 goto out;
265 ret = PTR_ERR(req);
266 if (IS_ERR(req))
267 goto out;
268
269 ret = nfs_set_page_writeback(page);
270 BUG_ON(ret != 0);
271 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
272
264 if (!nfs_pageio_add_request(pgio, req)) { 273 if (!nfs_pageio_add_request(pgio, req)) {
265 nfs_redirty_request(req); 274 nfs_redirty_request(req);
266 return pgio->pg_error; 275 ret = pgio->pg_error;
267 } 276 }
268 return 0; 277out:
278 return ret;
269} 279}
270 280
271static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) 281static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
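
[Annotation] The refactor above extracts nfs_find_and_lock_request(), whose return value is deliberately three-way: NULL (no request attached to the page), ERR_PTR(-err) (the wait was interrupted), or a locked request. Callers must test NULL before IS_ERR, exactly as the new nfs_page_async_flush() does. A self-contained userspace sketch of that idiom, with minimal ERR_PTR()/IS_ERR() stand-ins (the real ones live in linux/err.h):

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

/* Userspace stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR(). */
static inline void *ERR_PTR(long err) { return (void *)err; }
static inline int IS_ERR(const void *p)
{
    return (uintptr_t)p >= (uintptr_t)-4095;
}
static inline long PTR_ERR(const void *p) { return (long)(intptr_t)p; }

struct request { int id; };

/* Three-way result: NULL = nothing to do, ERR_PTR = failure,
 * otherwise a locked request the caller must process. */
static struct request *find_and_lock(int which)
{
    static struct request r = { 42 };
    if (which == 0)
        return NULL;
    if (which < 0)
        return ERR_PTR(-EINTR);
    return &r;
}

static int flush_one(int which)
{
    struct request *req = find_and_lock(which);
    int ret = 0;

    if (!req)          /* no request: success, nothing queued */
        goto out;
    ret = PTR_ERR(req);
    if (IS_ERR(req))   /* interrupted or failed: propagate */
        goto out;
    ret = 0;
    printf("flushing request %d\n", req->id);
out:
    return ret;
}

int main(void)
{
    printf("%d %d %d\n", flush_one(0), flush_one(-1), flush_one(1));
    return 0;
}
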
@@ -1480,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1480 .nr_to_write = LONG_MAX, 1490 .nr_to_write = LONG_MAX,
1481 .range_start = 0, 1491 .range_start = 0,
1482 .range_end = LLONG_MAX, 1492 .range_end = LLONG_MAX,
1483 .for_writepages = 1,
1484 }; 1493 };
1485 1494
1486 return __nfs_write_mapping(mapping, &wbc, how); 1495 return __nfs_write_mapping(mapping, &wbc, how);
@@ -1582,6 +1591,41 @@ int nfs_wb_page(struct inode *inode, struct page* page)
1582 return nfs_wb_page_priority(inode, page, FLUSH_STABLE); 1591 return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
1583} 1592}
1584 1593
1594#ifdef CONFIG_MIGRATION
1595int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1596 struct page *page)
1597{
1598 struct nfs_page *req;
1599 int ret;
1600
1601 if (PageFsCache(page))
1602 nfs_fscache_release_page(page, GFP_KERNEL);
1603
1604 req = nfs_find_and_lock_request(page);
1605 ret = PTR_ERR(req);
1606 if (IS_ERR(req))
1607 goto out;
1608
1609 ret = migrate_page(mapping, newpage, page);
1610 if (!req)
1611 goto out;
1612 if (ret)
1613 goto out_unlock;
1614 page_cache_get(newpage);
1615 req->wb_page = newpage;
1616 SetPagePrivate(newpage);
1617 set_page_private(newpage, page_private(page));
1618 ClearPagePrivate(page);
1619 set_page_private(page, 0);
1620 page_cache_release(page);
1621out_unlock:
1622 nfs_clear_page_tag_locked(req);
1623 nfs_release_request(req);
1624out:
1625 return ret;
1626}
1627#endif
1628
1585int __init nfs_init_writepagecache(void) 1629int __init nfs_init_writepagecache(void)
1586{ 1630{
1587 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1631 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
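
[Annotation] In the new nfs_migrate_page() above, the page_private bookkeeping is handed from page to newpage only after page_cache_get(newpage) pins the destination, and the source reference is dropped last, so the nfs_page never belongs to an unpinned page. A sketch of that pin-new-before-unpin-old ordering (illustrative types, not the kernel's):

#include <stdio.h>

struct page { int refs; void *private; };

static void page_get(struct page *p) { p->refs++; }
static void page_put(struct page *p) { p->refs--; }

/* Hand ownership of 'private' from old to new: pin the new holder
 * first, transfer the pointer, and only then drop the old pin, so
 * there is no window where the data is owned by an unpinned page. */
static void migrate_private(struct page *newp, struct page *oldp)
{
    page_get(newp);              /* pin destination first */
    newp->private = oldp->private;
    oldp->private = NULL;
    page_put(oldp);              /* old page no longer owns it */
}

int main(void)
{
    int payload = 7;
    struct page oldp = { 1, &payload }, newp = { 1, NULL };
    migrate_private(&newp, &oldp);
    printf("new.refs=%d old.refs=%d moved=%d\n",
           newp.refs, oldp.refs, *(int *)newp.private);
    return 0;
}
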
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 5573508f707f..36fcabbf5186 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
34 int flags = nfsexp_flags(rqstp, exp); 34 int flags = nfsexp_flags(rqstp, exp);
35 int ret; 35 int ret;
36 36
37 validate_process_creds();
38
37 /* discard any old override before preparing the new set */ 39 /* discard any old override before preparing the new set */
38 revert_creds(get_cred(current->real_cred)); 40 revert_creds(get_cred(current->real_cred));
39 new = prepare_creds(); 41 new = prepare_creds();
@@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
86 else 88 else
87 new->cap_effective = cap_raise_nfsd_set(new->cap_effective, 89 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
88 new->cap_permitted); 90 new->cap_permitted);
91 validate_process_creds();
89 put_cred(override_creds(new)); 92 put_cred(override_creds(new));
90 put_cred(new); 93 put_cred(new);
94 validate_process_creds();
91 return 0; 95 return 0;
92 96
93oom: 97oom:
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b92a27629fb7..d9462643155c 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd,
85 (*bpp)[-1] = '\n'; 85 (*bpp)[-1] = '\n';
86} 86}
87 87
88static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
89{
90 return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
91}
92
88static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); 93static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old);
89static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); 94static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *);
90static struct cache_detail svc_expkey_cache; 95static struct cache_detail svc_expkey_cache;
@@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = {
259 .hash_table = expkey_table, 264 .hash_table = expkey_table,
260 .name = "nfsd.fh", 265 .name = "nfsd.fh",
261 .cache_put = expkey_put, 266 .cache_put = expkey_put,
262 .cache_request = expkey_request, 267 .cache_upcall = expkey_upcall,
263 .cache_parse = expkey_parse, 268 .cache_parse = expkey_parse,
264 .cache_show = expkey_show, 269 .cache_show = expkey_show,
265 .match = expkey_match, 270 .match = expkey_match,
@@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd,
355 (*bpp)[-1] = '\n'; 360 (*bpp)[-1] = '\n';
356} 361}
357 362
363static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
364{
365 return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
366}
367
358static struct svc_export *svc_export_update(struct svc_export *new, 368static struct svc_export *svc_export_update(struct svc_export *new,
359 struct svc_export *old); 369 struct svc_export *old);
360static struct svc_export *svc_export_lookup(struct svc_export *); 370static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = {
724 .hash_table = export_table, 734 .hash_table = export_table,
725 .name = "nfsd.export", 735 .name = "nfsd.export",
726 .cache_put = svc_export_put, 736 .cache_put = svc_export_put,
727 .cache_request = svc_export_request, 737 .cache_upcall = svc_export_upcall,
728 .cache_parse = svc_export_parse, 738 .cache_parse = svc_export_parse,
729 .cache_show = svc_export_show, 739 .cache_show = svc_export_show,
730 .match = svc_export_match, 740 .match = svc_export_match,
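
[Annotation] Both caches in this file now publish a .cache_upcall adapter instead of a raw .cache_request formatter: the generic cache layer invokes the upcall without knowing that sunrpc_cache_pipe_upcall() is the transport, and each one-line wrapper binds its own request formatter to it. A compilable sketch of the adapter-binding pattern (hypothetical names throughout):

#include <stdio.h>

struct cache_head { const char *key; };

struct cache_detail {
    const char *name;
    /* Generic layer calls this; it no longer knows *how* the
     * upcall is delivered, only that the cache provides one. */
    int (*cache_upcall)(struct cache_detail *, struct cache_head *);
};

/* Shared transport, parameterized by a per-cache formatter. */
static int pipe_upcall(struct cache_detail *cd, struct cache_head *h,
                       void (*fmt)(struct cache_head *))
{
    printf("upcall via %s: ", cd->name);
    fmt(h);
    return 0;
}

static void export_request(struct cache_head *h)
{
    printf("export %s\n", h->key);
}

/* One-line adapter binding this cache's formatter to the transport. */
static int export_upcall(struct cache_detail *cd, struct cache_head *h)
{
    return pipe_upcall(cd, h, export_request);
}

static struct cache_detail export_cache = {
    .name = "nfsd.export",
    .cache_upcall = export_upcall,
};

int main(void)
{
    struct cache_head h = { "/srv" };
    return export_cache.cache_upcall(&export_cache, &h);
}
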
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5b398421b051..cdfa86fa1471 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -146,6 +146,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
146} 146}
147 147
148static int 148static int
149idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
150{
151 return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
152}
153
154static int
149idtoname_match(struct cache_head *ca, struct cache_head *cb) 155idtoname_match(struct cache_head *ca, struct cache_head *cb)
150{ 156{
151 struct ent *a = container_of(ca, struct ent, h); 157 struct ent *a = container_of(ca, struct ent, h);
@@ -175,10 +181,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
175} 181}
176 182
177static void 183static void
178warn_no_idmapd(struct cache_detail *detail) 184warn_no_idmapd(struct cache_detail *detail, int has_died)
179{ 185{
180 printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", 186 printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n",
181 detail->last_close? "died" : "not been started"); 187 has_died ? "died" : "not been started");
182} 188}
183 189
184 190
@@ -192,7 +198,7 @@ static struct cache_detail idtoname_cache = {
192 .hash_table = idtoname_table, 198 .hash_table = idtoname_table,
193 .name = "nfs4.idtoname", 199 .name = "nfs4.idtoname",
194 .cache_put = ent_put, 200 .cache_put = ent_put,
195 .cache_request = idtoname_request, 201 .cache_upcall = idtoname_upcall,
196 .cache_parse = idtoname_parse, 202 .cache_parse = idtoname_parse,
197 .cache_show = idtoname_show, 203 .cache_show = idtoname_show,
198 .warn_no_listener = warn_no_idmapd, 204 .warn_no_listener = warn_no_idmapd,
@@ -325,6 +331,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
325} 331}
326 332
327static int 333static int
334nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
335{
336 return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
337}
338
339static int
328nametoid_match(struct cache_head *ca, struct cache_head *cb) 340nametoid_match(struct cache_head *ca, struct cache_head *cb)
329{ 341{
330 struct ent *a = container_of(ca, struct ent, h); 342 struct ent *a = container_of(ca, struct ent, h);
@@ -363,7 +375,7 @@ static struct cache_detail nametoid_cache = {
363 .hash_table = nametoid_table, 375 .hash_table = nametoid_table,
364 .name = "nfs4.nametoid", 376 .name = "nfs4.nametoid",
365 .cache_put = ent_put, 377 .cache_put = ent_put,
366 .cache_request = nametoid_request, 378 .cache_upcall = nametoid_upcall,
367 .cache_parse = nametoid_parse, 379 .cache_parse = nametoid_parse,
368 .cache_show = nametoid_show, 380 .cache_show = nametoid_show,
369 .warn_no_listener = warn_no_idmapd, 381 .warn_no_listener = warn_no_idmapd,
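
[Annotation] A smaller interface change rides along here: warn_no_idmapd() now receives the has_died condition as a parameter instead of reading detail->last_close itself, so the hook no longer depends on cache_detail internals. Sketch (illustrative):

#include <stdio.h>

/* The listener-warning hook now receives the condition it needs
 * (has_died) instead of poking at cache-detail internals itself. */
static void warn_no_listener(const char *what, int has_died)
{
    printf("%s failing: has the daemon %s?\n",
           what, has_died ? "died" : "not been started");
}

int main(void)
{
    warn_no_listener("idmapping", 0);
    warn_no_listener("idmapping", 1);
    return 0;
}
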
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6d0847562d87..7e906c5b7671 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -37,6 +37,7 @@
37#include <linux/nfsd/xdr.h> 37#include <linux/nfsd/xdr.h>
38#include <linux/nfsd/syscall.h> 38#include <linux/nfsd/syscall.h>
39#include <linux/lockd/lockd.h> 39#include <linux/lockd/lockd.h>
40#include <linux/sunrpc/clnt.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <net/ipv6.h> 43#include <net/ipv6.h>
@@ -490,22 +491,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
490 * 491 *
491 * Input: 492 * Input:
492 * buf: '\n'-terminated C string containing a 493 * buf: '\n'-terminated C string containing a
493 * presentation format IPv4 address 494 * presentation format IP address
494 * size: length of C string in @buf 495 * size: length of C string in @buf
495 * Output: 496 * Output:
496 * On success: returns zero if all specified locks were released; 497 * On success: returns zero if all specified locks were released;
497 * returns one if one or more locks were not released 498 * returns one if one or more locks were not released
498 * On error: return code is negative errno value 499 * On error: return code is negative errno value
499 *
500 * Note: Only AF_INET client addresses are passed in
501 */ 500 */
502static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) 501static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
503{ 502{
504 struct sockaddr_in sin = { 503 struct sockaddr_storage address;
505 .sin_family = AF_INET, 504 struct sockaddr *sap = (struct sockaddr *)&address;
506 }; 505 size_t salen = sizeof(address);
507 int b1, b2, b3, b4;
508 char c;
509 char *fo_path; 506 char *fo_path;
510 507
511 /* sanity check */ 508 /* sanity check */
@@ -519,14 +516,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
519 if (qword_get(&buf, fo_path, size) < 0) 516 if (qword_get(&buf, fo_path, size) < 0)
520 return -EINVAL; 517 return -EINVAL;
521 518
522 /* get ipv4 address */ 519 if (rpc_pton(fo_path, size, sap, salen) == 0)
523 if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
524 return -EINVAL;
525 if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
526 return -EINVAL; 520 return -EINVAL;
527 sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
528 521
529 return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); 522 return nlmsvc_unlock_all_by_ip(sap);
530} 523}
531 524
532/** 525/**
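
[Annotation] The hunk above retires the hand-rolled sscanf parser (four octets plus range checks) in favour of rpc_pton(), which accepts both IPv4 and IPv6 presentation addresses into a sockaddr_storage. The userspace analogue of that one-call-per-family parse is inet_pton(); a minimal sketch follows -- rpc_pton() itself is sunrpc's kernel helper and is not used below:

#include <stdio.h>
#include <arpa/inet.h>

/* Parse either address family in one place, the way the hunk above
 * swaps a per-octet sscanf for rpc_pton(). */
static int parse_ip(const char *s)
{
    struct in_addr v4;
    struct in6_addr v6;

    if (inet_pton(AF_INET, s, &v4) == 1)
        return 4;
    if (inet_pton(AF_INET6, s, &v6) == 1)
        return 6;
    return -1; /* not a valid presentation address */
}

int main(void)
{
    printf("%d %d %d\n", parse_ip("192.0.2.1"),
           parse_ip("2001:db8::1"), parse_ip("999.1.1.1"));
    return 0;
}
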
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 492c79b7800b..24d58adfe5fd 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -496,7 +496,9 @@ nfsd(void *vrqstp)
496 /* Lock the export hash tables for reading. */ 496 /* Lock the export hash tables for reading. */
497 exp_readlock(); 497 exp_readlock();
498 498
499 validate_process_creds();
499 svc_process(rqstp); 500 svc_process(rqstp);
501 validate_process_creds();
500 502
501 /* Unlock export hash tables */ 503 /* Unlock export hash tables */
502 exp_readunlock(); 504 exp_readunlock();
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23341c1063bc..8fa09bfbcba7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
684 __be32 err; 684 __be32 err;
685 int host_err; 685 int host_err;
686 686
687 validate_process_creds();
688
687 /* 689 /*
688 * If we get here, then the client has already done an "open", 690 * If we get here, then the client has already done an "open",
689 * and (hopefully) checked permission - so allow OWNER_OVERRIDE 691 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
740out_nfserr: 742out_nfserr:
741 err = nfserrno(host_err); 743 err = nfserrno(host_err);
742out: 744out:
745 validate_process_creds();
743 return err; 746 return err;
744} 747}
745 748
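
[Annotation] The validate_process_creds() calls added across nfsd in this series bracket the places where credentials are overridden and reverted, so corruption trips an assertion at the boundary where it happened instead of surfacing later. A sketch of that bracketed-invariant shape (the struct and magic value are invented for illustration):

#include <assert.h>
#include <stdio.h>

struct creds { int magic; };
#define CRED_MAGIC 0x43524544

/* Cheap invariant check, called on both sides of any operation that
 * overrides and restores credentials, so corruption is caught at the
 * boundary where it happened rather than much later. */
static void validate_creds(const struct creds *c)
{
    assert(c->magic == CRED_MAGIC);
}

static void serve_request(struct creds *c)
{
    validate_creds(c);   /* before: caller handed us sane creds */
    /* ... override creds, do work, revert creds ... */
    validate_creds(c);   /* after: we put everything back */
}

int main(void)
{
    struct creds c = { CRED_MAGIC };
    serve_request(&c);
    puts("creds survived the request");
    return 0;
}
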
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 72da095d4009..251da07b2a1d 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,6 @@
1config NILFS2_FS 1config NILFS2_FS
2 tristate "NILFS2 file system support (EXPERIMENTAL)" 2 tristate "NILFS2 file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on EXPERIMENTAL
4 select CRC32 4 select CRC32
5 help 5 help
6 NILFS2 is a log-structured file system (LFS) supporting continuous 6 NILFS2 is a log-structured file system (LFS) supporting continuous
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 99d58a028b94..08834df6ec68 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
36 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); 36 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
37} 37}
38 38
39/**
40 * nilfs_bmap_lookup_at_level - find a data block or node block
41 * @bmap: bmap
42 * @key: key
43 * @level: level
44 * @ptrp: place to store the value associated to @key
45 *
46 * Description: nilfs_bmap_lookup_at_level() finds a record whose key
47 * matches @key in the block at @level of the bmap.
48 *
49 * Return Value: On success, 0 is returned and the record associated with @key
50 * is stored in the place pointed by @ptrp. On error, one of the following
51 * negative error codes is returned.
52 *
53 * %-EIO - I/O error.
54 *
55 * %-ENOMEM - Insufficient amount of memory available.
56 *
57 * %-ENOENT - A record associated with @key does not exist.
58 */
39int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, 59int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
40 __u64 *ptrp) 60 __u64 *ptrp)
41{ 61{
@@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
69 return ret; 89 return ret;
70} 90}
71 91
72/**
73 * nilfs_bmap_lookup - find a record
74 * @bmap: bmap
75 * @key: key
76 * @recp: pointer to record
77 *
78 * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
79 * @bmap.
80 *
81 * Return Value: On success, 0 is returned and the record associated with @key
82 * is stored in the place pointed by @recp. On error, one of the following
83 * negative error codes is returned.
84 *
85 * %-EIO - I/O error.
86 *
87 * %-ENOMEM - Insufficient amount of memory available.
88 *
89 * %-ENOENT - A record associated with @key does not exist.
90 */
91int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
92 unsigned long key,
93 unsigned long *recp)
94{
95 __u64 ptr;
96 int ret;
97
98 /* XXX: use macro for level 1 */
99 ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
100 if (recp != NULL)
101 *recp = ptr;
102 return ret;
103}
104
105static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 92static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
106{ 93{
107 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; 94 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
@@ -469,104 +456,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
469 (entries_per_group / NILFS_BMAP_GROUP_DIV); 456 (entries_per_group / NILFS_BMAP_GROUP_DIV);
470} 457}
471 458
472int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
473 union nilfs_bmap_ptr_req *req)
474{
475 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
476}
477
478void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
479 union nilfs_bmap_ptr_req *req)
480{
481 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
482}
483
484void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
485 union nilfs_bmap_ptr_req *req)
486{
487 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
488}
489
490int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
491 sector_t blocknr)
492{
493 struct inode *dat = nilfs_bmap_get_dat(bmap);
494 int ret;
495
496 ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
497 if (likely(!ret))
498 nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
499 return ret;
500}
501
502int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req)
504{
505 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506}
507
508void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req)
510{
511 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
512 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
513}
514
515void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
516 union nilfs_bmap_ptr_req *req)
517{
518 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
519}
520
521int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
522 sector_t blocknr)
523{
524 return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
525}
526
527int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
528{
529 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
530}
531
532int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
533 union nilfs_bmap_ptr_req *oldreq,
534 union nilfs_bmap_ptr_req *newreq)
535{
536 struct inode *dat = nilfs_bmap_get_dat(bmap);
537 int ret;
538
539 ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
540 if (ret < 0)
541 return ret;
542 ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
543 if (ret < 0)
544 nilfs_dat_abort_end(dat, &oldreq->bpr_req);
545
546 return ret;
547}
548
549void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
550 union nilfs_bmap_ptr_req *oldreq,
551 union nilfs_bmap_ptr_req *newreq)
552{
553 struct inode *dat = nilfs_bmap_get_dat(bmap);
554
555 nilfs_dat_commit_end(dat, &oldreq->bpr_req,
556 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
557 nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
558}
559
560void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
561 union nilfs_bmap_ptr_req *oldreq,
562 union nilfs_bmap_ptr_req *newreq)
563{
564 struct inode *dat = nilfs_bmap_get_dat(bmap);
565
566 nilfs_dat_abort_end(dat, &oldreq->bpr_req);
567 nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
568}
569
570static struct lock_class_key nilfs_bmap_dat_lock_key; 459static struct lock_class_key nilfs_bmap_dat_lock_key;
571static struct lock_class_key nilfs_bmap_mdt_lock_key; 460static struct lock_class_key nilfs_bmap_mdt_lock_key;
572 461
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b2890cdcef12..9980d7dbab91 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -28,6 +28,7 @@
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "alloc.h" 30#include "alloc.h"
31#include "dat.h"
31 32
32#define NILFS_BMAP_INVALID_PTR 0 33#define NILFS_BMAP_INVALID_PTR 0
33 34
@@ -141,7 +142,6 @@ struct nilfs_bmap {
141int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); 142int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
142int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); 143int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
143void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); 144void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
144int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
145int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); 145int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
146int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); 146int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
147int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); 147int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
@@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
160void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); 160void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
161 161
162 162
163static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
164 __u64 *ptr)
165{
166 return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr);
167}
168
163/* 169/*
164 * Internal use only 170 * Internal use only
165 */ 171 */
166struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); 172struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
167int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
168 union nilfs_bmap_ptr_req *);
169void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
170 union nilfs_bmap_ptr_req *);
171void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
172 union nilfs_bmap_ptr_req *);
173 173
174static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, 174static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
175 union nilfs_bmap_ptr_req *req) 175 union nilfs_bmap_ptr_req *req,
176 struct inode *dat)
176{ 177{
177 if (NILFS_BMAP_USE_VBN(bmap)) 178 if (dat)
178 return nilfs_bmap_prepare_alloc_v(bmap, req); 179 return nilfs_dat_prepare_alloc(dat, &req->bpr_req);
179 /* ignore target ptr */ 180 /* ignore target ptr */
180 req->bpr_ptr = bmap->b_last_allocated_ptr++; 181 req->bpr_ptr = bmap->b_last_allocated_ptr++;
181 return 0; 182 return 0;
182} 183}
183 184
184static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, 185static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
185 union nilfs_bmap_ptr_req *req) 186 union nilfs_bmap_ptr_req *req,
187 struct inode *dat)
186{ 188{
187 if (NILFS_BMAP_USE_VBN(bmap)) 189 if (dat)
188 nilfs_bmap_commit_alloc_v(bmap, req); 190 nilfs_dat_commit_alloc(dat, &req->bpr_req);
189} 191}
190 192
191static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, 193static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
192 union nilfs_bmap_ptr_req *req) 194 union nilfs_bmap_ptr_req *req,
195 struct inode *dat)
193{ 196{
194 if (NILFS_BMAP_USE_VBN(bmap)) 197 if (dat)
195 nilfs_bmap_abort_alloc_v(bmap, req); 198 nilfs_dat_abort_alloc(dat, &req->bpr_req);
196 else 199 else
197 bmap->b_last_allocated_ptr--; 200 bmap->b_last_allocated_ptr--;
198} 201}
199 202
200int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
201void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
202void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
203
204static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, 203static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
205 union nilfs_bmap_ptr_req *req) 204 union nilfs_bmap_ptr_req *req,
205 struct inode *dat)
206{ 206{
207 return NILFS_BMAP_USE_VBN(bmap) ? 207 return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0;
208 nilfs_bmap_prepare_end_v(bmap, req) : 0;
209} 208}
210 209
211static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, 210static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
212 union nilfs_bmap_ptr_req *req) 211 union nilfs_bmap_ptr_req *req,
212 struct inode *dat)
213{ 213{
214 if (NILFS_BMAP_USE_VBN(bmap)) 214 if (dat)
215 nilfs_bmap_commit_end_v(bmap, req); 215 nilfs_dat_commit_end(dat, &req->bpr_req,
216 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
216} 217}
217 218
218static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, 219static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
219 union nilfs_bmap_ptr_req *req) 220 union nilfs_bmap_ptr_req *req,
221 struct inode *dat)
220{ 222{
221 if (NILFS_BMAP_USE_VBN(bmap)) 223 if (dat)
222 nilfs_bmap_abort_end_v(bmap, req); 224 nilfs_dat_abort_end(dat, &req->bpr_req);
223} 225}
224 226
225int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
226 sector_t);
227int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
228int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
229
230
231__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, 227__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
232 const struct buffer_head *); 228 const struct buffer_head *);
233 229
234__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); 230__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
235__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); 231__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
236 232
237int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
238 union nilfs_bmap_ptr_req *,
239 union nilfs_bmap_ptr_req *);
240void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
241 union nilfs_bmap_ptr_req *,
242 union nilfs_bmap_ptr_req *);
243void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
244 union nilfs_bmap_ptr_req *,
245 union nilfs_bmap_ptr_req *);
246
247void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); 233void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
248void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); 234void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
249 235
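
[Annotation] The bmap hunks above collapse the whole *_v helper family into the generic *_ptr helpers by threading a struct inode *dat parameter through: a non-NULL dat selects the DAT-backed virtual-block path, NULL the direct path, replacing the NILFS_BMAP_USE_VBN() test previously buried in each variant. A sketch of strategy-selection-by-nullable-argument (names illustrative):

#include <stdio.h>

struct dat { int next_vblock; };

/* One helper, two behaviours: a non-NULL dat selects the virtual
 * block path, NULL selects the direct path -- replacing a flag test
 * buried inside every per-strategy variant. */
static int prepare_alloc_ptr(struct dat *dat, int *last_direct)
{
    if (dat)
        return dat->next_vblock++;   /* virtual: ask the DAT */
    return (*last_direct)++;         /* direct: bump a counter */
}

int main(void)
{
    struct dat d = { 100 };
    int last = 7;
    printf("vbn=%d vbn=%d direct=%d\n",
           prepare_alloc_ptr(&d, &last),
           prepare_alloc_ptr(&d, &last),
           prepare_alloc_ptr(NULL, &last));
    return 0;
}
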
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 7e0b61be212e..c668bca579c1 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
209 * We cannot call radix_tree_preload for the kernels older 209 * We cannot call radix_tree_preload for the kernels older
210 * than 2.6.23, because it is not exported for modules. 210 * than 2.6.23, because it is not exported for modules.
211 */ 211 */
212retry:
212 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 213 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
213 if (err) 214 if (err)
214 goto failed_unlock; 215 goto failed_unlock;
@@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
219 (unsigned long long)oldkey, 220 (unsigned long long)oldkey,
220 (unsigned long long)newkey); 221 (unsigned long long)newkey);
221 222
222retry:
223 spin_lock_irq(&btnc->tree_lock); 223 spin_lock_irq(&btnc->tree_lock);
224 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); 224 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
225 spin_unlock_irq(&btnc->tree_lock); 225 spin_unlock_irq(&btnc->tree_lock);
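
[Annotation] Moving the retry: label above radix_tree_preload() is a correctness fix, not a tidy-up: the retry path previously jumped back in between preload and insert, so a second attempt ran radix_tree_insert() against an already-consumed preload pool. With the label hoisted, every attempt re-arms the pool first. A sketch of the fixed control flow (stand-in functions, not the radix-tree API):

#include <stdio.h>

static int preloaded; /* stand-in for the per-CPU preload pool */

static int preload(void) { preloaded = 1; return 0; }

/* First attempt "fails" so the retry path is exercised. */
static int try_insert(void)
{
    static int attempts;
    if (!preloaded)
        return -2;       /* would be the bug: no pool armed */
    preloaded = 0;       /* insert consumes the preload either way */
    return ++attempts < 2 ? -1 : 0;
}

int main(void)
{
    int err;

    /* The fixed shape: the retry label sits above preload(), so
     * every attempt re-arms the pool before insert runs. Jumping
     * back between preload and insert would reuse an empty pool. */
retry:
    if (preload())
        return 1;
    err = try_insert();
    if (err) {
        puts("insert failed, retrying with fresh preload");
        goto retry;
    }
    puts("inserted");
    return 0;
}
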
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index aa412724b64e..e25b507a474f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void)
71 kmem_cache_destroy(nilfs_btree_path_cache); 71 kmem_cache_destroy(nilfs_btree_path_cache);
72} 72}
73 73
74static inline struct nilfs_btree_path * 74static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
75nilfs_btree_alloc_path(const struct nilfs_btree *btree)
76{ 75{
77 return (struct nilfs_btree_path *) 76 return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
78 kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
79} 77}
80 78
81static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, 79static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
82 struct nilfs_btree_path *path)
83{ 80{
84 kmem_cache_free(nilfs_btree_path_cache, path); 81 kmem_cache_free(nilfs_btree_path_cache, path);
85} 82}
86 83
87static void nilfs_btree_init_path(const struct nilfs_btree *btree, 84static void nilfs_btree_init_path(struct nilfs_btree_path *path)
88 struct nilfs_btree_path *path)
89{ 85{
90 int level; 86 int level;
91 87
@@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree,
101 } 97 }
102} 98}
103 99
104static void nilfs_btree_clear_path(const struct nilfs_btree *btree, 100static void nilfs_btree_release_path(struct nilfs_btree_path *path)
105 struct nilfs_btree_path *path)
106{ 101{
107 int level; 102 int level;
108 103
109 for (level = NILFS_BTREE_LEVEL_DATA; 104 for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
110 level < NILFS_BTREE_LEVEL_MAX; 105 level++)
111 level++) { 106 brelse(path[level].bp_bh);
112 if (path[level].bp_bh != NULL) {
113 brelse(path[level].bp_bh);
114 path[level].bp_bh = NULL;
115 }
116 /* sib_bh is released or deleted by prepare or commit
117 * operations. */
118 path[level].bp_sib_bh = NULL;
119 path[level].bp_index = 0;
120 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
121 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
122 path[level].bp_op = NULL;
123 }
124} 107}
125 108
126/* 109/*
@@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
148} 131}
149 132
150static inline int 133static inline int
151nilfs_btree_node_get_flags(const struct nilfs_btree *btree, 134nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
152 const struct nilfs_btree_node *node)
153{ 135{
154 return node->bn_flags; 136 return node->bn_flags;
155} 137}
156 138
157static inline void 139static inline void
158nilfs_btree_node_set_flags(struct nilfs_btree *btree, 140nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
159 struct nilfs_btree_node *node,
160 int flags)
161{ 141{
162 node->bn_flags = flags; 142 node->bn_flags = flags;
163} 143}
164 144
165static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, 145static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
166 const struct nilfs_btree_node *node)
167{ 146{
168 return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; 147 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
169} 148}
170 149
171static inline int 150static inline int
172nilfs_btree_node_get_level(const struct nilfs_btree *btree, 151nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
173 const struct nilfs_btree_node *node)
174{ 152{
175 return node->bn_level; 153 return node->bn_level;
176} 154}
177 155
178static inline void 156static inline void
179nilfs_btree_node_set_level(struct nilfs_btree *btree, 157nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
180 struct nilfs_btree_node *node,
181 int level)
182{ 158{
183 node->bn_level = level; 159 node->bn_level = level;
184} 160}
185 161
186static inline int 162static inline int
187nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, 163nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
188 const struct nilfs_btree_node *node)
189{ 164{
190 return le16_to_cpu(node->bn_nchildren); 165 return le16_to_cpu(node->bn_nchildren);
191} 166}
192 167
193static inline void 168static inline void
194nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, 169nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
195 struct nilfs_btree_node *node,
196 int nchildren)
197{ 170{
198 node->bn_nchildren = cpu_to_le16(nchildren); 171 node->bn_nchildren = cpu_to_le16(nchildren);
199} 172}
200 173
201static inline int 174static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
202nilfs_btree_node_size(const struct nilfs_btree *btree)
203{ 175{
204 return 1 << btree->bt_bmap.b_inode->i_blkbits; 176 return 1 << btree->bt_bmap.b_inode->i_blkbits;
205} 177}
206 178
207static inline int 179static inline int
208nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, 180nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
209 const struct nilfs_btree_node *node) 181 const struct nilfs_btree *btree)
210{ 182{
211 return nilfs_btree_node_root(btree, node) ? 183 return nilfs_btree_node_root(node) ?
212 NILFS_BTREE_ROOT_NCHILDREN_MIN : 184 NILFS_BTREE_ROOT_NCHILDREN_MIN :
213 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); 185 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
214} 186}
215 187
216static inline int 188static inline int
217nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, 189nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
218 const struct nilfs_btree_node *node) 190 const struct nilfs_btree *btree)
219{ 191{
220 return nilfs_btree_node_root(btree, node) ? 192 return nilfs_btree_node_root(node) ?
221 NILFS_BTREE_ROOT_NCHILDREN_MAX : 193 NILFS_BTREE_ROOT_NCHILDREN_MAX :
222 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); 194 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
223} 195}
224 196
225static inline __le64 * 197static inline __le64 *
226nilfs_btree_node_dkeys(const struct nilfs_btree *btree, 198nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
227 const struct nilfs_btree_node *node)
228{ 199{
229 return (__le64 *)((char *)(node + 1) + 200 return (__le64 *)((char *)(node + 1) +
230 (nilfs_btree_node_root(btree, node) ? 201 (nilfs_btree_node_root(node) ?
231 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); 202 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
232} 203}
233 204
234static inline __le64 * 205static inline __le64 *
235nilfs_btree_node_dptrs(const struct nilfs_btree *btree, 206nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
236 const struct nilfs_btree_node *node) 207 const struct nilfs_btree *btree)
237{ 208{
238 return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + 209 return (__le64 *)(nilfs_btree_node_dkeys(node) +
239 nilfs_btree_node_nchildren_max(btree, node)); 210 nilfs_btree_node_nchildren_max(node, btree));
240} 211}
241 212
242static inline __u64 213static inline __u64
243nilfs_btree_node_get_key(const struct nilfs_btree *btree, 214nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
244 const struct nilfs_btree_node *node, int index)
245{ 215{
246 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + 216 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
247 index));
248} 217}
249 218
250static inline void 219static inline void
251nilfs_btree_node_set_key(struct nilfs_btree *btree, 220nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
252 struct nilfs_btree_node *node, int index, __u64 key)
253{ 221{
254 *(nilfs_btree_node_dkeys(btree, node) + index) = 222 *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
255 nilfs_bmap_key_to_dkey(key);
256} 223}
257 224
258static inline __u64 225static inline __u64
259nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, 226nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
260 const struct nilfs_btree_node *node, 227 const struct nilfs_btree_node *node, int index)
261 int index)
262{ 228{
263 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + 229 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
264 index)); 230 index));
265} 231}
266 232
267static inline void 233static inline void
268nilfs_btree_node_set_ptr(struct nilfs_btree *btree, 234nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
269 struct nilfs_btree_node *node, 235 struct nilfs_btree_node *node, int index, __u64 ptr)
270 int index,
271 __u64 ptr)
272{ 236{
273 *(nilfs_btree_node_dptrs(btree, node) + index) = 237 *(nilfs_btree_node_dptrs(node, btree) + index) =
274 nilfs_bmap_ptr_to_dptr(ptr); 238 nilfs_bmap_ptr_to_dptr(ptr);
275} 239}
276 240
@@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
283 __le64 *dptrs; 247 __le64 *dptrs;
284 int i; 248 int i;
285 249
286 nilfs_btree_node_set_flags(btree, node, flags); 250 nilfs_btree_node_set_flags(node, flags);
287 nilfs_btree_node_set_level(btree, node, level); 251 nilfs_btree_node_set_level(node, level);
288 nilfs_btree_node_set_nchildren(btree, node, nchildren); 252 nilfs_btree_node_set_nchildren(node, nchildren);
289 253
290 dkeys = nilfs_btree_node_dkeys(btree, node); 254 dkeys = nilfs_btree_node_dkeys(node);
291 dptrs = nilfs_btree_node_dptrs(btree, node); 255 dptrs = nilfs_btree_node_dptrs(node, btree);
292 for (i = 0; i < nchildren; i++) { 256 for (i = 0; i < nchildren; i++) {
293 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); 257 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
294 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); 258 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
@@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
305 __le64 *ldptrs, *rdptrs; 269 __le64 *ldptrs, *rdptrs;
306 int lnchildren, rnchildren; 270 int lnchildren, rnchildren;
307 271
308 ldkeys = nilfs_btree_node_dkeys(btree, left); 272 ldkeys = nilfs_btree_node_dkeys(left);
309 ldptrs = nilfs_btree_node_dptrs(btree, left); 273 ldptrs = nilfs_btree_node_dptrs(left, btree);
310 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 274 lnchildren = nilfs_btree_node_get_nchildren(left);
311 275
312 rdkeys = nilfs_btree_node_dkeys(btree, right); 276 rdkeys = nilfs_btree_node_dkeys(right);
313 rdptrs = nilfs_btree_node_dptrs(btree, right); 277 rdptrs = nilfs_btree_node_dptrs(right, btree);
314 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 278 rnchildren = nilfs_btree_node_get_nchildren(right);
315 279
316 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); 280 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
317 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); 281 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
@@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
320 284
321 lnchildren += n; 285 lnchildren += n;
322 rnchildren -= n; 286 rnchildren -= n;
323 nilfs_btree_node_set_nchildren(btree, left, lnchildren); 287 nilfs_btree_node_set_nchildren(left, lnchildren);
324 nilfs_btree_node_set_nchildren(btree, right, rnchildren); 288 nilfs_btree_node_set_nchildren(right, rnchildren);
325} 289}
326 290
327/* Assume that the buffer heads corresponding to left and right are locked. */ 291/* Assume that the buffer heads corresponding to left and right are locked. */
@@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
334 __le64 *ldptrs, *rdptrs; 298 __le64 *ldptrs, *rdptrs;
335 int lnchildren, rnchildren; 299 int lnchildren, rnchildren;
336 300
337 ldkeys = nilfs_btree_node_dkeys(btree, left); 301 ldkeys = nilfs_btree_node_dkeys(left);
338 ldptrs = nilfs_btree_node_dptrs(btree, left); 302 ldptrs = nilfs_btree_node_dptrs(left, btree);
339 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 303 lnchildren = nilfs_btree_node_get_nchildren(left);
340 304
341 rdkeys = nilfs_btree_node_dkeys(btree, right); 305 rdkeys = nilfs_btree_node_dkeys(right);
342 rdptrs = nilfs_btree_node_dptrs(btree, right); 306 rdptrs = nilfs_btree_node_dptrs(right, btree);
343 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 307 rnchildren = nilfs_btree_node_get_nchildren(right);
344 308
345 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); 309 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
346 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); 310 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
@@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
349 313
350 lnchildren -= n; 314 lnchildren -= n;
351 rnchildren += n; 315 rnchildren += n;
352 nilfs_btree_node_set_nchildren(btree, left, lnchildren); 316 nilfs_btree_node_set_nchildren(left, lnchildren);
353 nilfs_btree_node_set_nchildren(btree, right, rnchildren); 317 nilfs_btree_node_set_nchildren(right, rnchildren);
354} 318}
355 319
356/* Assume that the buffer head corresponding to node is locked. */ 320/* Assume that the buffer head corresponding to node is locked. */
@@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
362 __le64 *dptrs; 326 __le64 *dptrs;
363 int nchildren; 327 int nchildren;
364 328
365 dkeys = nilfs_btree_node_dkeys(btree, node); 329 dkeys = nilfs_btree_node_dkeys(node);
366 dptrs = nilfs_btree_node_dptrs(btree, node); 330 dptrs = nilfs_btree_node_dptrs(node, btree);
367 nchildren = nilfs_btree_node_get_nchildren(btree, node); 331 nchildren = nilfs_btree_node_get_nchildren(node);
368 if (index < nchildren) { 332 if (index < nchildren) {
369 memmove(dkeys + index + 1, dkeys + index, 333 memmove(dkeys + index + 1, dkeys + index,
370 (nchildren - index) * sizeof(*dkeys)); 334 (nchildren - index) * sizeof(*dkeys));
@@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
374 dkeys[index] = nilfs_bmap_key_to_dkey(key); 338 dkeys[index] = nilfs_bmap_key_to_dkey(key);
375 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); 339 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
376 nchildren++; 340 nchildren++;
377 nilfs_btree_node_set_nchildren(btree, node, nchildren); 341 nilfs_btree_node_set_nchildren(node, nchildren);
378} 342}
379 343
380/* Assume that the buffer head corresponding to node is locked. */ 344/* Assume that the buffer head corresponding to node is locked. */
@@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
388 __le64 *dptrs; 352 __le64 *dptrs;
389 int nchildren; 353 int nchildren;
390 354
391 dkeys = nilfs_btree_node_dkeys(btree, node); 355 dkeys = nilfs_btree_node_dkeys(node);
392 dptrs = nilfs_btree_node_dptrs(btree, node); 356 dptrs = nilfs_btree_node_dptrs(node, btree);
393 key = nilfs_bmap_dkey_to_key(dkeys[index]); 357 key = nilfs_bmap_dkey_to_key(dkeys[index]);
394 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); 358 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
395 nchildren = nilfs_btree_node_get_nchildren(btree, node); 359 nchildren = nilfs_btree_node_get_nchildren(node);
396 if (keyp != NULL) 360 if (keyp != NULL)
397 *keyp = key; 361 *keyp = key;
398 if (ptrp != NULL) 362 if (ptrp != NULL)
@@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
405 (nchildren - index - 1) * sizeof(*dptrs)); 369 (nchildren - index - 1) * sizeof(*dptrs));
406 } 370 }
407 nchildren--; 371 nchildren--;
408 nilfs_btree_node_set_nchildren(btree, node, nchildren); 372 nilfs_btree_node_set_nchildren(node, nchildren);
409} 373}
410 374
411static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, 375static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
412 const struct nilfs_btree_node *node,
413 __u64 key, int *indexp) 376 __u64 key, int *indexp)
414{ 377{
415 __u64 nkey; 378 __u64 nkey;
@@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
417 380
418 /* binary search */ 381 /* binary search */
419 low = 0; 382 low = 0;
420 high = nilfs_btree_node_get_nchildren(btree, node) - 1; 383 high = nilfs_btree_node_get_nchildren(node) - 1;
421 index = 0; 384 index = 0;
422 s = 0; 385 s = 0;
423 while (low <= high) { 386 while (low <= high) {
424 index = (low + high) / 2; 387 index = (low + high) / 2;
425 nkey = nilfs_btree_node_get_key(btree, node, index); 388 nkey = nilfs_btree_node_get_key(node, index);
426 if (nkey == key) { 389 if (nkey == key) {
427 s = 0; 390 s = 0;
428 goto out; 391 goto out;
@@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
436 } 399 }
437 400
438 /* adjust index */ 401 /* adjust index */
439 if (nilfs_btree_node_get_level(btree, node) > 402 if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
440 NILFS_BTREE_LEVEL_NODE_MIN) { 403 if (s > 0 && index > 0)
441 if ((s > 0) && (index > 0))
442 index--; 404 index--;
443 } else if (s < 0) 405 } else if (s < 0)
444 index++; 406 index++;
@@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree)
456} 418}
457 419
458static inline struct nilfs_btree_node * 420static inline struct nilfs_btree_node *
459nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, 421nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
460 const struct nilfs_btree_path *path,
461 int level)
462{ 422{
463 return (struct nilfs_btree_node *)path[level].bp_bh->b_data; 423 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
464} 424}
465 425
466static inline struct nilfs_btree_node * 426static inline struct nilfs_btree_node *
467nilfs_btree_get_sib_node(const struct nilfs_btree *btree, 427nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
468 const struct nilfs_btree_path *path,
469 int level)
470{ 428{
471 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; 429 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
472} 430}
473 431
474static inline int nilfs_btree_height(const struct nilfs_btree *btree) 432static inline int nilfs_btree_height(const struct nilfs_btree *btree)
475{ 433{
476 return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) 434 return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
477 + 1;
478} 435}
479 436
480static inline struct nilfs_btree_node * 437static inline struct nilfs_btree_node *
@@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
484{ 441{
485 return (level == nilfs_btree_height(btree) - 1) ? 442 return (level == nilfs_btree_height(btree) - 1) ?
486 nilfs_btree_get_root(btree) : 443 nilfs_btree_get_root(btree) :
487 nilfs_btree_get_nonroot_node(btree, path, level); 444 nilfs_btree_get_nonroot_node(path, level);
488} 445}
489 446
490static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, 447static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
@@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
496 int level, index, found, ret; 453 int level, index, found, ret;
497 454
498 node = nilfs_btree_get_root(btree); 455 node = nilfs_btree_get_root(btree);
499 level = nilfs_btree_node_get_level(btree, node); 456 level = nilfs_btree_node_get_level(node);
500 if ((level < minlevel) || 457 if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
501 (nilfs_btree_node_get_nchildren(btree, node) <= 0))
502 return -ENOENT; 458 return -ENOENT;
503 459
504 found = nilfs_btree_node_lookup(btree, node, key, &index); 460 found = nilfs_btree_node_lookup(node, key, &index);
505 ptr = nilfs_btree_node_get_ptr(btree, node, index); 461 ptr = nilfs_btree_node_get_ptr(btree, node, index);
506 path[level].bp_bh = NULL; 462 path[level].bp_bh = NULL;
507 path[level].bp_index = index; 463 path[level].bp_index = index;
@@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
510 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 466 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
511 if (ret < 0) 467 if (ret < 0)
512 return ret; 468 return ret;
513 node = nilfs_btree_get_nonroot_node(btree, path, level); 469 node = nilfs_btree_get_nonroot_node(path, level);
514 BUG_ON(level != nilfs_btree_node_get_level(btree, node)); 470 BUG_ON(level != nilfs_btree_node_get_level(node));
515 if (!found) 471 if (!found)
516 found = nilfs_btree_node_lookup(btree, node, key, 472 found = nilfs_btree_node_lookup(node, key, &index);
517 &index);
518 else 473 else
519 index = 0; 474 index = 0;
520 if (index < nilfs_btree_node_nchildren_max(btree, node)) 475 if (index < nilfs_btree_node_nchildren_max(node, btree))
521 ptr = nilfs_btree_node_get_ptr(btree, node, index); 476 ptr = nilfs_btree_node_get_ptr(btree, node, index);
522 else { 477 else {
523 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); 478 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
@@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
544 int index, level, ret; 499 int index, level, ret;
545 500
546 node = nilfs_btree_get_root(btree); 501 node = nilfs_btree_get_root(btree);
547 index = nilfs_btree_node_get_nchildren(btree, node) - 1; 502 index = nilfs_btree_node_get_nchildren(node) - 1;
548 if (index < 0) 503 if (index < 0)
549 return -ENOENT; 504 return -ENOENT;
550 level = nilfs_btree_node_get_level(btree, node); 505 level = nilfs_btree_node_get_level(node);
551 ptr = nilfs_btree_node_get_ptr(btree, node, index); 506 ptr = nilfs_btree_node_get_ptr(btree, node, index);
552 path[level].bp_bh = NULL; 507 path[level].bp_bh = NULL;
553 path[level].bp_index = index; 508 path[level].bp_index = index;
@@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
556 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 511 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
557 if (ret < 0) 512 if (ret < 0)
558 return ret; 513 return ret;
559 node = nilfs_btree_get_nonroot_node(btree, path, level); 514 node = nilfs_btree_get_nonroot_node(path, level);
560 BUG_ON(level != nilfs_btree_node_get_level(btree, node)); 515 BUG_ON(level != nilfs_btree_node_get_level(node));
561 index = nilfs_btree_node_get_nchildren(btree, node) - 1; 516 index = nilfs_btree_node_get_nchildren(node) - 1;
562 ptr = nilfs_btree_node_get_ptr(btree, node, index); 517 ptr = nilfs_btree_node_get_ptr(btree, node, index);
563 path[level].bp_index = index; 518 path[level].bp_index = index;
564 } 519 }
565 520
566 if (keyp != NULL) 521 if (keyp != NULL)
567 *keyp = nilfs_btree_node_get_key(btree, node, index); 522 *keyp = nilfs_btree_node_get_key(node, index);
568 if (ptrp != NULL) 523 if (ptrp != NULL)
569 *ptrp = ptr; 524 *ptrp = ptr;
570 525
@@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
580 int ret; 535 int ret;
581 536
582 btree = (struct nilfs_btree *)bmap; 537 btree = (struct nilfs_btree *)bmap;
583 path = nilfs_btree_alloc_path(btree); 538 path = nilfs_btree_alloc_path();
584 if (path == NULL) 539 if (path == NULL)
585 return -ENOMEM; 540 return -ENOMEM;
586 nilfs_btree_init_path(btree, path); 541 nilfs_btree_init_path(path);
587 542
588 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 543 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
589 544
590 if (ptrp != NULL) 545 if (ptrp != NULL)
591 *ptrp = ptr; 546 *ptrp = ptr;
592 547
593 nilfs_btree_clear_path(btree, path); 548 nilfs_btree_release_path(path);
594 nilfs_btree_free_path(btree, path); 549 nilfs_btree_free_path(path);
595 550
596 return ret; 551 return ret;
597} 552}
@@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
608 int level = NILFS_BTREE_LEVEL_NODE_MIN; 563 int level = NILFS_BTREE_LEVEL_NODE_MIN;
609 int ret, cnt, index, maxlevel; 564 int ret, cnt, index, maxlevel;
610 565
611 path = nilfs_btree_alloc_path(btree); 566 path = nilfs_btree_alloc_path();
612 if (path == NULL) 567 if (path == NULL)
613 return -ENOMEM; 568 return -ENOMEM;
614 nilfs_btree_init_path(btree, path); 569 nilfs_btree_init_path(path);
615 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 570 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
616 if (ret < 0) 571 if (ret < 0)
617 goto out; 572 goto out;
@@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	node = nilfs_btree_get_node(btree, path, level);
 	index = path[level].bp_index + 1;
 	for (;;) {
-		while (index < nilfs_btree_node_get_nchildren(btree, node)) {
-			if (nilfs_btree_node_get_key(btree, node, index) !=
+		while (index < nilfs_btree_node_get_nchildren(node)) {
+			if (nilfs_btree_node_get_key(node, index) !=
 			    key + cnt)
 				goto end;
 			ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
@@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		/* look-up right sibling node */
 		node = nilfs_btree_get_node(btree, path, level + 1);
 		index = path[level + 1].bp_index + 1;
-		if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
-		    nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+		if (index >= nilfs_btree_node_get_nchildren(node) ||
+		    nilfs_btree_node_get_key(node, index) != key + cnt)
 			break;
 		ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level + 1].bp_index = index;
@@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
 		if (ret < 0)
 			goto out;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		index = 0;
 		path[level].bp_index = index;
 	}
@@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	*ptrp = ptr;
 	ret = cnt;
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_release_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
@@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 	do {
 		lock_buffer(path[level].bp_bh);
 		nilfs_btree_node_set_key(
-			btree,
-			nilfs_btree_get_nonroot_node(
-				btree, path, level),
+			nilfs_btree_get_nonroot_node(path, level),
 			path[level].bp_index, key);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 
 	/* root */
 	if (level == nilfs_btree_height(btree) - 1) {
-		nilfs_btree_node_set_key(btree,
-					 nilfs_btree_get_root(btree),
+		nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
 					 path[level].bp_index, key);
 	}
 }
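
Across these hunks the nilfs_btree_node accessors (get_key, set_key, get_level, get_nchildren and friends) lose the btree parameter, presumably because everything they read lives in the node header itself; only helpers that also need per-tree geometry, such as nchildren_max/min, keep the btree, now as a trailing argument. A small sketch of the resulting call shape (example_node_summary is a hypothetical helper, not part of the patch):

	/* Sketch: node accessors after the signature change. */
	static void example_node_summary(const struct nilfs_btree_node *node)
	{
		int level = nilfs_btree_node_get_level(node);
		int n = nilfs_btree_node_get_nchildren(node);

		if (n > 0)
			printk(KERN_DEBUG "level %d, first key %llu\n",
			       level, (unsigned long long)
			       nilfs_btree_node_get_key(node, 0));
	}
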
@@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		lock_buffer(path[level].bp_bh);
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
@@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
-						nilfs_btree_node_get_key(
-							btree, node, 0));
+						nilfs_btree_node_get_key(node,
+									 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
@@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 	move = 0;
 
 	n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_sib_bh);
 
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 
 	if (move) {
 		brelse(path[level].bp_bh);
@@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 	move = 0;
 
 	n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, right, 0));
+				nilfs_btree_node_get_key(right, 0));
 	path[level + 1].bp_index--;
 
 	if (move) {
 		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
-		path[level].bp_index -=
-			nilfs_btree_node_get_nchildren(btree, node);
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
 		path[level + 1].bp_index++;
 	} else {
 		brelse(path[level].bp_sib_bh);
@@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	move = 0;
 
 	n = (nchildren + 1) / 2;
@@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	newkey = nilfs_btree_node_get_key(btree, right, 0);
+	newkey = nilfs_btree_node_get_key(right, 0);
 	newptr = path[level].bp_newreq.bpr_ptr;
 
 	if (move) {
-		path[level].bp_index -=
-			nilfs_btree_node_get_nchildren(btree, node);
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
 		nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
 					path[level].bp_index);
 
-		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
 		brelse(path[level].bp_bh);
@@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	} else {
 		nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 
-		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
 		brelse(path[level].bp_sib_bh);
@@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_sib_bh);
 
 	root = nilfs_btree_get_root(btree);
-	child = nilfs_btree_get_sib_node(btree, path, level);
+	child = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, root);
+	n = nilfs_btree_node_get_nchildren(root);
 
 	nilfs_btree_node_move_right(btree, root, child, n);
-	nilfs_btree_node_set_level(btree, root, level + 1);
+	nilfs_btree_node_set_level(root, level + 1);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 
 	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 
-	*keyp = nilfs_btree_node_get_key(btree, child, 0);
+	*keyp = nilfs_btree_node_get_key(child, 0);
 	*ptrp = path[level].bp_newreq.bpr_ptr;
 }
 
@@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *parent, *sib;
 	__u64 sibptr;
 	int pindex, level, ret;
+	struct inode *dat = NULL;
 
 	stats->bs_nblocks = 0;
 	level = NILFS_BTREE_LEVEL_DATA;
 
 	/* allocate a new ptr for data block */
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
 		path[level].bp_newreq.bpr_ptr =
 			nilfs_btree_find_target_v(btree, path, key);
+		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	}
 
 	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_data;
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		if (nilfs_btree_node_get_nchildren(btree, node) <
-		    nilfs_btree_node_nchildren_max(btree, node)) {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		if (nilfs_btree_node_get_nchildren(node) <
+		    nilfs_btree_node_nchildren_max(node, btree)) {
 			path[level].bp_op = nilfs_btree_do_insert;
 			stats->bs_nblocks++;
 			goto out;
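
The hunk above also introduces the second refactoring in this file: the bmap pointer operations gain a struct inode *dat parameter, and each caller resolves the DAT inode once, only when the bmap uses virtual block numbers, then passes it (or NULL) down to every prepare/commit/abort call. A minimal sketch of that pattern as these hunks apply it (example_prepare_ptr is a hypothetical wrapper, not kernel source):

	/* Sketch: resolve the DAT inode once per operation. */
	static int example_prepare_ptr(struct nilfs_btree *btree,
				       union nilfs_bmap_ptr_req *req)
	{
		struct inode *dat = NULL;

		if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
			dat = nilfs_bmap_get_dat(&btree->bt_bmap);

		/* dat stays NULL for bmaps that use raw block numbers */
		return nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, req, dat);
	}
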
@@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) <
-			    nilfs_btree_node_nchildren_max(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) <
+			    nilfs_btree_node_nchildren_max(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_left;
 				stats->bs_nblocks++;
@@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 		/* right sibling */
 		if (pindex <
-		    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+		    nilfs_btree_node_get_nchildren(parent) - 1) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) <
-			    nilfs_btree_node_nchildren_max(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) <
+			    nilfs_btree_node_nchildren_max(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_right;
 				stats->bs_nblocks++;
@@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		path[level].bp_newreq.bpr_ptr =
 			path[level - 1].bp_newreq.bpr_ptr + 1;
 		ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-						   &path[level].bp_newreq);
+						   &path[level].bp_newreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
 		ret = nilfs_btree_get_new_block(btree,
@@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* root */
 	node = nilfs_btree_get_root(btree);
-	if (nilfs_btree_node_get_nchildren(btree, node) <
-	    nilfs_btree_node_nchildren_max(btree, node)) {
+	if (nilfs_btree_node_get_nchildren(node) <
+	    nilfs_btree_node_nchildren_max(node, btree)) {
 		path[level].bp_op = nilfs_btree_do_insert;
 		stats->bs_nblocks++;
 		goto out;
@@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	/* grow */
 	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
 	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+				   dat);
  err_out_child_node:
 	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
 		nilfs_btnode_delete(path[level].bp_sib_bh);
 		nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 
 	}
 
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+				   dat);
  err_out_data:
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
 				      int maxlevel, __u64 key, __u64 ptr)
 {
+	struct inode *dat = NULL;
 	int level;
 
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
 		nilfs_btree_set_target_v(btree, key, ptr);
+		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	}
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
 		nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
-					    &path[level - 1].bp_newreq);
+					    &path[level - 1].bp_newreq, dat);
 		path[level].bp_op(btree, path, level, &key, &ptr);
 	}
 
@@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_release_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
@@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		lock_buffer(path[level].bp_bh);
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_delete(btree, node, keyp, ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
@@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
 		unlock_buffer(path[level].bp_bh);
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
 		nilfs_btree_node_delete(btree, node, keyp, ptrp,
@@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 
 	n = (nchildren + lnchildren) / 2 - nchildren;
 
@@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_sib_bh);
 
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 
 	brelse(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
@@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	n = (nchildren + rnchildren) / 2 - nchildren;
 
@@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, right, 0));
+				nilfs_btree_node_get_key(right, 0));
 	path[level + 1].bp_index--;
 
 	brelse(path[level].bp_sib_bh);
@@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, node);
+	n = nilfs_btree_node_get_nchildren(node);
 
 	nilfs_btree_node_move_left(btree, left, node, n);
 
@@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
-	path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
+	path[level].bp_index += nilfs_btree_node_get_nchildren(left);
 }
 
 static void nilfs_btree_concat_right(struct nilfs_btree *btree,
@@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, right);
+	n = nilfs_btree_node_get_nchildren(right);
 
 	nilfs_btree_node_move_left(btree, node, right, n);
 
@@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 
 	lock_buffer(path[level].bp_bh);
 	root = nilfs_btree_get_root(btree);
-	child = nilfs_btree_get_nonroot_node(btree, path, level);
+	child = nilfs_btree_get_nonroot_node(path, level);
 
 	nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
-	nilfs_btree_node_set_level(btree, root, level);
-	n = nilfs_btree_node_get_nchildren(btree, child);
+	nilfs_btree_node_set_level(root, level);
+	n = nilfs_btree_node_get_nchildren(child);
 	nilfs_btree_node_move_left(btree, root, child, n);
 	unlock_buffer(path[level].bp_bh);
 
@@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
 				      int *levelp,
-				      struct nilfs_bmap_stats *stats)
+				      struct nilfs_bmap_stats *stats,
+				      struct inode *dat)
 {
 	struct buffer_head *bh;
 	struct nilfs_btree_node *node, *parent, *sib;
@@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		path[level].bp_oldreq.bpr_ptr =
 			nilfs_btree_node_get_ptr(btree, node,
 						 path[level].bp_index);
 		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-						 &path[level].bp_oldreq);
+						 &path[level].bp_oldreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
 
-		if (nilfs_btree_node_get_nchildren(btree, node) >
-		    nilfs_btree_node_nchildren_min(btree, node)) {
+		if (nilfs_btree_node_get_nchildren(node) >
+		    nilfs_btree_node_nchildren_min(node, btree)) {
 			path[level].bp_op = nilfs_btree_do_delete;
 			stats->bs_nblocks++;
 			goto out;
@@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) >
-			    nilfs_btree_node_nchildren_min(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) >
+			    nilfs_btree_node_nchildren_min(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_left;
 				stats->bs_nblocks++;
@@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 				/* continue; */
 			}
 		} else if (pindex <
-			   nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+			   nilfs_btree_node_get_nchildren(parent) - 1) {
 			/* right sibling */
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
@@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) >
-			    nilfs_btree_node_nchildren_min(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) >
+			    nilfs_btree_node_nchildren_min(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_right;
 				stats->bs_nblocks++;
@@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			/* no siblings */
 			/* the only child of the root node */
 			WARN_ON(level != nilfs_btree_height(btree) - 2);
-			if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
+			if (nilfs_btree_node_get_nchildren(node) - 1 <=
 			    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
 				path[level].bp_op = nilfs_btree_shrink;
 				stats->bs_nblocks += 2;
@@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
 
 	ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-					 &path[level].bp_oldreq);
+					 &path[level].bp_oldreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 
@@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
+	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
  err_out_child_node:
 	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
 		brelse(path[level].bp_sib_bh);
 		nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
-					 &path[level].bp_oldreq);
+					 &path[level].bp_oldreq, dat);
 	}
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
-				      int maxlevel)
+				      int maxlevel, struct inode *dat)
 {
 	int level;
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
 		nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
-					  &path[level].bp_oldreq);
+					  &path[level].bp_oldreq, dat);
 		path[level].bp_op(btree, path, level, NULL, NULL);
 	}
 
@@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	struct nilfs_bmap_stats stats;
+	struct inode *dat;
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
 	if (ret < 0)
 		goto out;
 
-	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
+
+	dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
+		nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
+
+	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
 	if (ret < 0)
 		goto out;
-	nilfs_btree_commit_delete(btree, path, level);
+	nilfs_btree_commit_delete(btree, path, level, dat);
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_release_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
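
Deletion keeps a strict two-phase shape after this change: prepare_delete is the only step that can fail, and its error labels roll back every bp_oldreq it claimed, while commit_delete cannot fail. The dat inode is computed once up front with the ternary visible above. Isolated as a control-flow sketch (an excerpt-style illustration, not the full function):

	dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
		nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;

	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
	if (ret < 0)
		goto out;		/* prepare already rolled back */
	nilfs_btree_commit_delete(btree, path, level, dat);
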
@@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
 	int ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
 
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_release_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 		node = root;
 		break;
 	case 3:
-		nchildren = nilfs_btree_node_get_nchildren(btree, root);
+		nchildren = nilfs_btree_node_get_nchildren(root);
 		if (nchildren > 1)
 			return 0;
 		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
@@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 		return 0;
 	}
 
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
 	nextmaxkey = (nchildren > 1) ?
-		nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
+		nilfs_btree_node_get_key(node, nchildren - 2) : 0;
 	if (bh != NULL)
 		brelse(bh);
 
@@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 		node = root;
 		break;
 	case 3:
-		nchildren = nilfs_btree_node_get_nchildren(btree, root);
+		nchildren = nilfs_btree_node_get_nchildren(root);
 		WARN_ON(nchildren > 1);
 		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
 		ret = nilfs_btree_get_block(btree, ptr, &bh);
@@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 		return -EINVAL;
 	}
 
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (nchildren < nitems)
 		nitems = nchildren;
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
 	for (i = 0; i < nitems; i++) {
 		keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
 		ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
@@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 				       struct nilfs_bmap_stats *stats)
 {
 	struct buffer_head *bh;
-	struct nilfs_btree *btree;
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct inode *dat = NULL;
 	int ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	stats->bs_nblocks = 0;
 
 	/* for data */
 	/* cannot find near ptr */
-	if (NILFS_BMAP_USE_VBN(bmap))
+	if (NILFS_BMAP_USE_VBN(bmap)) {
 		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
+		dat = nilfs_bmap_get_dat(bmap);
+	}
 
-	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
 	if (ret < 0)
 		return ret;
 
@@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 	stats->bs_nblocks++;
 	if (nreq != NULL) {
 		nreq->bpr_ptr = dreq->bpr_ptr + 1;
-		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
+		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
 		if (ret < 0)
 			goto err_out_dreq;
 
@@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* error */
  err_out_nreq:
-	nilfs_bmap_abort_alloc_ptr(bmap, nreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
  err_out_dreq:
-	nilfs_bmap_abort_alloc_ptr(bmap, dreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
 	stats->bs_nblocks = 0;
 	return ret;
 
@@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 				      union nilfs_bmap_ptr_req *nreq,
 				      struct buffer_head *bh)
 {
-	struct nilfs_btree *btree;
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
 	struct nilfs_btree_node *node;
+	struct inode *dat;
 	__u64 tmpptr;
 
 	/* free resources */
@@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 
 	/* convert and insert */
-	btree = (struct nilfs_btree *)bmap;
+	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
 	nilfs_btree_init(bmap);
 	if (nreq != NULL) {
-		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
-		nilfs_bmap_commit_alloc_ptr(bmap, nreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
+		nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
 
 		/* create child node at level 1 */
 		lock_buffer(bh);
@@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
 				      2, 1, &keys[0], &tmpptr);
 	} else {
-		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
 
 		/* create root node at level 1 */
 		node = nilfs_btree_get_root(btree);
@@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
 
 static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 					struct nilfs_btree_path *path,
-					int level)
+					int level, struct inode *dat)
 {
 	struct nilfs_btree_node *parent;
 	int ret;
@@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 		nilfs_btree_node_get_ptr(btree, parent,
 					 path[level + 1].bp_index);
 	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
-	ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
-					  &path[level].bp_oldreq,
-					  &path[level].bp_newreq);
+	ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
+				       &path[level].bp_newreq.bpr_req);
 	if (ret < 0)
 		return ret;
 
@@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		if (ret < 0) {
-			nilfs_bmap_abort_update_v(&btree->bt_bmap,
-						  &path[level].bp_oldreq,
-						  &path[level].bp_newreq);
+			nilfs_dat_abort_update(dat,
+					       &path[level].bp_oldreq.bpr_req,
+					       &path[level].bp_newreq.bpr_req);
 			return ret;
 		}
 	}
@@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 
 static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 					struct nilfs_btree_path *path,
-					int level)
+					int level, struct inode *dat)
 {
 	struct nilfs_btree_node *parent;
 
-	nilfs_bmap_commit_update_v(&btree->bt_bmap,
-				   &path[level].bp_oldreq,
-				   &path[level].bp_newreq);
+	nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
+				&path[level].bp_newreq.bpr_req,
+				btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
 
 	if (buffer_nilfs_node(path[level].bp_bh)) {
 		nilfs_btnode_commit_change_key(
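
One detail worth flagging in the hunk above: nilfs_dat_commit_update() gains a boolean argument, btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS, which the removed nilfs_bmap_commit_update_v() wrapper must previously have derived internally. The diff does not show what the DAT does with it, so the reading that it selects behaviour specific to NILFS_BMAP_PTR_VS bmaps is an assumption. In comment form:

	/*
	 * Assumed mapping (the wrapper bodies are not part of this diff):
	 *   nilfs_bmap_commit_update_v(bmap, old, new)
	 *     -> nilfs_dat_commit_update(dat, &old->bpr_req, &new->bpr_req,
	 *                                bmap->b_ptr_type == NILFS_BMAP_PTR_VS)
	 */
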
@@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 
 static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 				       struct nilfs_btree_path *path,
-				       int level)
+				       int level, struct inode *dat)
 {
-	nilfs_bmap_abort_update_v(&btree->bt_bmap,
-				  &path[level].bp_oldreq,
-				  &path[level].bp_newreq);
+	nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
+			       &path[level].bp_newreq.bpr_req);
 	if (buffer_nilfs_node(path[level].bp_bh))
 		nilfs_btnode_abort_change_key(
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 
 static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 					   struct nilfs_btree_path *path,
-					   int minlevel,
-					   int *maxlevelp)
+					   int minlevel, int *maxlevelp,
+					   struct inode *dat)
 {
 	int level, ret;
 
 	level = minlevel;
 	if (!buffer_nilfs_volatile(path[level].bp_bh)) {
-		ret = nilfs_btree_prepare_update_v(btree, path, level);
+		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
 		if (ret < 0)
 			return ret;
 	}
@@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 	       !buffer_dirty(path[level].bp_bh)) {
 
 		WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
-		ret = nilfs_btree_prepare_update_v(btree, path, level);
+		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
 		if (ret < 0)
 			goto out;
 	}
@@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 	/* error */
  out:
 	while (--level > minlevel)
-		nilfs_btree_abort_update_v(btree, path, level);
+		nilfs_btree_abort_update_v(btree, path, level, dat);
 	if (!buffer_nilfs_volatile(path[level].bp_bh))
-		nilfs_btree_abort_update_v(btree, path, level);
+		nilfs_btree_abort_update_v(btree, path, level, dat);
 	return ret;
 }
 
 static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
 					   struct nilfs_btree_path *path,
-					   int minlevel,
-					   int maxlevel,
-					   struct buffer_head *bh)
+					   int minlevel, int maxlevel,
+					   struct buffer_head *bh,
+					   struct inode *dat)
 {
 	int level;
 
 	if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
-		nilfs_btree_commit_update_v(btree, path, minlevel);
+		nilfs_btree_commit_update_v(btree, path, minlevel, dat);
 
 	for (level = minlevel + 1; level <= maxlevel; level++)
-		nilfs_btree_commit_update_v(btree, path, level);
+		nilfs_btree_commit_update_v(btree, path, level, dat);
 }
 
 static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 				   struct nilfs_btree_path *path,
-				   int level,
-				   struct buffer_head *bh)
+				   int level, struct buffer_head *bh)
 {
 	int maxlevel, ret;
 	struct nilfs_btree_node *parent;
+	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
 	__u64 ptr;
 
 	get_bh(bh);
 	path[level].bp_bh = bh;
-	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
+	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
+					      dat);
 	if (ret < 0)
 		goto out;
 
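
Here the update path stops going through the nilfs_bmap_*_update_v() wrappers and talks to the DAT inode directly. Note that nilfs_btree_propagate_v() fetches dat unconditionally, which only makes sense if that function is reached solely for bmaps using virtual block numbers; that is an assumption this diff does not itself prove. The one-for-one correspondence applied in these hunks, in comment form:

	/*
	 * Wrapper-to-DAT mapping used above (wrapper bodies not shown here):
	 *   nilfs_bmap_prepare_update_v() -> nilfs_dat_prepare_update()
	 *   nilfs_bmap_commit_update_v()  -> nilfs_dat_commit_update()
	 *   nilfs_bmap_abort_update_v()   -> nilfs_dat_abort_update()
	 *   nilfs_bmap_mark_dirty()       -> nilfs_dat_mark_dirty()
	 * with each union nilfs_bmap_ptr_req unwrapped to its .bpr_req member.
	 */
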
@@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 		parent = nilfs_btree_get_node(btree, path, level + 1);
 		ptr = nilfs_btree_node_get_ptr(btree, parent,
 					       path[level + 1].bp_index);
-		ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
+		ret = nilfs_dat_mark_dirty(dat, ptr);
 		if (ret < 0)
 			goto out;
 	}
 
-	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
+	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
 
  out:
 	brelse(path[level].bp_bh);
@@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 	WARN_ON(!buffer_dirty(bh));
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(bh)) {
 		node = (struct nilfs_btree_node *)bh->b_data;
-		key = nilfs_btree_node_get_key(btree, node, 0);
-		level = nilfs_btree_node_get_level(btree, node);
+		key = nilfs_btree_node_get_key(node, 0);
+		level = nilfs_btree_node_get_level(node);
 	} else {
 		key = nilfs_bmap_data_get_key(bmap, bh);
 		level = NILFS_BTREE_LEVEL_DATA;
@@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_release_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
 				    struct buffer_head *bh)
 {
-	return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
+	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
 }
 
 static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
@@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
 
 	get_bh(bh);
 	node = (struct nilfs_btree_node *)bh->b_data;
-	key = nilfs_btree_node_get_key(btree, node, 0);
-	level = nilfs_btree_node_get_level(btree, node);
+	key = nilfs_btree_node_get_key(node, 0);
+	level = nilfs_btree_node_get_level(node);
 	list_for_each(head, &lists[level]) {
 		cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
 		cnode = (struct nilfs_btree_node *)cbh->b_data;
-		ckey = nilfs_btree_node_get_key(btree, cnode, 0);
+		ckey = nilfs_btree_node_get_key(cnode, 0);
 		if (key < ckey)
 			break;
 	}
@@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
 	nilfs_btree_node_set_ptr(btree, parent,
 				 path[level + 1].bp_index, blocknr);
 
-	key = nilfs_btree_node_get_key(btree, parent,
-				       path[level + 1].bp_index);
+	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
 	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
 	binfo->bi_dat.bi_level = level;
@@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 				union nilfs_binfo *binfo)
 {
 	struct nilfs_btree_node *parent;
+	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
 	__u64 key;
 	__u64 ptr;
 	union nilfs_bmap_ptr_req req;
@@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 	ptr = nilfs_btree_node_get_ptr(btree, parent,
 				       path[level + 1].bp_index);
 	req.bpr_ptr = ptr;
-	ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
-	if (unlikely(ret < 0))
+	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+	if (ret < 0)
 		return ret;
+	nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
 
-	key = nilfs_btree_node_get_key(btree, parent,
-				       path[level + 1].bp_index);
+	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
 	binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
 	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
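
In the hunk above the single nilfs_bmap_start_v() wrapper is replaced by the DAT's explicit two-phase pair. Read as a pattern (a sketch; whether the wrapper did anything beyond this pair is not shown in the diff), prepare is the only step that can fail, and it fails before any state is committed:

	req.bpr_ptr = ptr;
	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
	if (ret < 0)
		return ret;	/* nothing committed yet, nothing to undo */
	nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
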
@@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(*bh)) {
 		node = (struct nilfs_btree_node *)(*bh)->b_data;
-		key = nilfs_btree_node_get_key(btree, node, 0);
-		level = nilfs_btree_node_get_level(btree, node);
+		key = nilfs_btree_node_get_key(node, 0);
+		level = nilfs_btree_node_get_level(node);
 	} else {
 		key = nilfs_bmap_data_get_key(bmap, *bh);
 		level = NILFS_BTREE_LEVEL_DATA;
@@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2196 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2162 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2197 2163
2198 out: 2164 out:
2199 nilfs_btree_clear_path(btree, path); 2165 nilfs_btree_release_path(path);
2200 nilfs_btree_free_path(btree, path); 2166 nilfs_btree_free_path(path);
2201 2167
2202 return ret; 2168 return ret;
2203} 2169}
@@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2207 sector_t blocknr, 2173 sector_t blocknr,
2208 union nilfs_binfo *binfo) 2174 union nilfs_binfo *binfo)
2209{ 2175{
2210 struct nilfs_btree *btree;
2211 struct nilfs_btree_node *node; 2176 struct nilfs_btree_node *node;
2212 __u64 key; 2177 __u64 key;
2213 int ret; 2178 int ret;
2214 2179
2215 btree = (struct nilfs_btree *)bmap; 2180 ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
2216 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); 2181 blocknr);
2217 if (ret < 0) 2182 if (ret < 0)
2218 return ret; 2183 return ret;
2219 2184
2220 if (buffer_nilfs_node(*bh)) { 2185 if (buffer_nilfs_node(*bh)) {
2221 node = (struct nilfs_btree_node *)(*bh)->b_data; 2186 node = (struct nilfs_btree_node *)(*bh)->b_data;
2222 key = nilfs_btree_node_get_key(btree, node, 0); 2187 key = nilfs_btree_node_get_key(node, 0);
2223 } else 2188 } else
2224 key = nilfs_bmap_data_get_key(bmap, *bh); 2189 key = nilfs_bmap_data_get_key(bmap, *bh);
2225 2190
@@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2239 int ret; 2204 int ret;
2240 2205
2241 btree = (struct nilfs_btree *)bmap; 2206 btree = (struct nilfs_btree *)bmap;
2242 path = nilfs_btree_alloc_path(btree); 2207 path = nilfs_btree_alloc_path();
2243 if (path == NULL) 2208 if (path == NULL)
2244 return -ENOMEM; 2209 return -ENOMEM;
2245 nilfs_btree_init_path(btree, path); 2210 nilfs_btree_init_path(path);
2246 2211
2247 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2212 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2248 if (ret < 0) { 2213 if (ret < 0) {
@@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2262 nilfs_bmap_set_dirty(&btree->bt_bmap); 2227 nilfs_bmap_set_dirty(&btree->bt_bmap);
2263 2228
2264 out: 2229 out:
2265 nilfs_btree_clear_path(btree, path); 2230 nilfs_btree_release_path(path);
2266 nilfs_btree_free_path(btree, path); 2231 nilfs_btree_free_path(path);
2267 return ret; 2232 return ret;
2268} 2233}
2269 2234
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index aec942cf79e3..1c6cfb59128d 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
815 void *kaddr; 815 void *kaddr;
816 int ret; 816 int ret;
817 817
818 if (cno == 0) 818 /* CP number is invalid if it's zero or larger than the
819 return -ENOENT; /* checkpoint number 0 is invalid */ 819 largest existing one. */
820 if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
821 return -ENOENT;
820 down_read(&NILFS_MDT(cpfile)->mi_sem); 822 down_read(&NILFS_MDT(cpfile)->mi_sem);
821 823
822 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); 824 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
@@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
824 goto out; 826 goto out;
825 kaddr = kmap_atomic(bh->b_page, KM_USER0); 827 kaddr = kmap_atomic(bh->b_page, KM_USER0);
826 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); 828 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
827 ret = nilfs_checkpoint_snapshot(cp); 829 if (nilfs_checkpoint_invalid(cp))
830 ret = -ENOENT;
831 else
832 ret = nilfs_checkpoint_snapshot(cp);
828 kunmap_atomic(kaddr, KM_USER0); 833 kunmap_atomic(kaddr, KM_USER0);
829 brelse(bh); 834 brelse(bh);
830 835
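
The two changes above tighten nilfs_cpfile_is_snapshot() in the same direction: reject a checkpoint number outside the valid range before touching the disk, then reject entries that exist on disk but are flagged invalid. A minimal user-space sketch of that guard order follows; nilfs_mdt_cno() is modeled by a plain next_cno argument, and the table layout is invented purely for illustration.

#include <errno.h>
#include <stdio.h>

struct checkpoint { int invalid; int snapshot; };

/* next_cno models nilfs_mdt_cno(): one past the largest checkpoint
 * number that can exist, so cno == 0 and cno >= next_cno are both bogus. */
static int is_snapshot(const struct checkpoint *table,
		       unsigned long long next_cno, unsigned long long cno)
{
	if (cno == 0 || cno >= next_cno)	/* range check first */
		return -ENOENT;
	if (table[cno].invalid)			/* then per-entry validity */
		return -ENOENT;
	return table[cno].snapshot;
}

int main(void)
{
	struct checkpoint t[4] = { {0, 0}, {0, 1}, {1, 0}, {0, 0} };

	printf("%d\n", is_snapshot(t, 4, 1));	/* 1: valid snapshot */
	printf("%d\n", is_snapshot(t, 4, 2));	/* -ENOENT: invalidated entry */
	printf("%d\n", is_snapshot(t, 4, 7));	/* -ENOENT: out of range */
	return 0;
}
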
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 788a45950197..debea896e701 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -27,8 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h> 28#include <linux/nilfs2_fs.h>
29 29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32 30
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, 31int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **, 32 struct nilfs_checkpoint **,
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8927ca27e6f7..1ff8e15bd36b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
109 nilfs_palloc_commit_free_entry(dat, req); 109 nilfs_palloc_commit_free_entry(dat, req);
110} 110}
111 111
112void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
113{
114 nilfs_dat_abort_entry(dat, req);
115 nilfs_palloc_abort_free_entry(dat, req);
116}
117
118int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) 112int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
119{ 113{
120 int ret; 114 int ret;
@@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
140 nilfs_dat_commit_entry(dat, req); 134 nilfs_dat_commit_entry(dat, req);
141} 135}
142 136
143void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
144{
145 nilfs_dat_abort_entry(dat, req);
146}
147
148int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) 137int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
149{ 138{
150 struct nilfs_dat_entry *entry; 139 struct nilfs_dat_entry *entry;
@@ -222,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
222 nilfs_dat_abort_entry(dat, req); 211 nilfs_dat_abort_entry(dat, req);
223} 212}
224 213
214int nilfs_dat_prepare_update(struct inode *dat,
215 struct nilfs_palloc_req *oldreq,
216 struct nilfs_palloc_req *newreq)
217{
218 int ret;
219
220 ret = nilfs_dat_prepare_end(dat, oldreq);
221 if (!ret) {
222 ret = nilfs_dat_prepare_alloc(dat, newreq);
223 if (ret < 0)
224 nilfs_dat_abort_end(dat, oldreq);
225 }
226 return ret;
227}
228
229void nilfs_dat_commit_update(struct inode *dat,
230 struct nilfs_palloc_req *oldreq,
231 struct nilfs_palloc_req *newreq, int dead)
232{
233 nilfs_dat_commit_end(dat, oldreq, dead);
234 nilfs_dat_commit_alloc(dat, newreq);
235}
236
237void nilfs_dat_abort_update(struct inode *dat,
238 struct nilfs_palloc_req *oldreq,
239 struct nilfs_palloc_req *newreq)
240{
241 nilfs_dat_abort_end(dat, oldreq);
242 nilfs_dat_abort_alloc(dat, newreq);
243}
244
225/** 245/**
226 * nilfs_dat_mark_dirty - 246 * nilfs_dat_mark_dirty -
227 * @dat: DAT file inode 247 * @dat: DAT file inode
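
The new nilfs_dat_prepare_update()/commit_update()/abort_update() trio packages two existing prepare/commit pairs (ending the old entry, allocating the new one) into one transactional step, so callers such as the direct-mapping propagate path further down no longer hand-roll the rollback. A user-space sketch of the composition rule, with the real DAT operations replaced by stubs (one failing on purpose):

#include <stdio.h>

static int prepare_end(int id)   { printf("prepare end of %d\n", id); return 0; }
static void abort_end(int id)    { printf("abort end of %d\n", id); }
static int prepare_alloc(int id) { printf("prepare alloc of %d\n", id); return -1; }

/* Mirrors nilfs_dat_prepare_update(): if the second prepare fails,
 * the first one is unwound so no half-prepared state leaks out. */
static int prepare_update(int oldid, int newid)
{
	int ret = prepare_end(oldid);

	if (!ret) {
		ret = prepare_alloc(newid);
		if (ret < 0)
			abort_end(oldid);
	}
	return ret;
}

int main(void)
{
	printf("update -> %d\n", prepare_update(1, 2));	/* -1, after rollback */
	return 0;
}
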
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d328b81eead4..406070d3ff49 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -27,7 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31 30
32struct nilfs_palloc_req; 31struct nilfs_palloc_req;
33 32
@@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); 38int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, 39void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t); 40 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); 41int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); 42void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); 43void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
44int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *,
45 struct nilfs_palloc_req *);
46void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *,
47 struct nilfs_palloc_req *, int);
48void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
49 struct nilfs_palloc_req *);
46 50
47int nilfs_dat_mark_dirty(struct inode *, __u64); 51int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t); 52int nilfs_dat_freev(struct inode *, __u64 *, size_t);
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 342d9765df8d..d369ac718277 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
125 direct->d_bmap.b_last_allocated_ptr = ptr; 125 direct->d_bmap.b_last_allocated_ptr = ptr;
126} 126}
127 127
128static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
129 __u64 key,
130 union nilfs_bmap_ptr_req *req,
131 struct nilfs_bmap_stats *stats)
132{
133 int ret;
134
135 if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
136 req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
137 ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
138 if (ret < 0)
139 return ret;
140
141 stats->bs_nblocks = 1;
142 return 0;
143}
144
145static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
146 union nilfs_bmap_ptr_req *req,
147 __u64 key, __u64 ptr)
148{
149 struct buffer_head *bh;
150
151 /* ptr must be a pointer to a buffer head. */
152 bh = (struct buffer_head *)((unsigned long)ptr);
153 set_buffer_nilfs_volatile(bh);
154
155 nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
156 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
157
158 if (!nilfs_bmap_dirty(&direct->d_bmap))
159 nilfs_bmap_set_dirty(&direct->d_bmap);
160
161 if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
162 nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
163}
164
165static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 128static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
166{ 129{
167 struct nilfs_direct *direct; 130 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
168 union nilfs_bmap_ptr_req req; 131 union nilfs_bmap_ptr_req req;
169 struct nilfs_bmap_stats stats; 132 struct inode *dat = NULL;
133 struct buffer_head *bh;
170 int ret; 134 int ret;
171 135
172 direct = (struct nilfs_direct *)bmap;
173 if (key > NILFS_DIRECT_KEY_MAX) 136 if (key > NILFS_DIRECT_KEY_MAX)
174 return -ENOENT; 137 return -ENOENT;
175 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) 138 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
176 return -EEXIST; 139 return -EEXIST;
177 140
178 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); 141 if (NILFS_BMAP_USE_VBN(bmap)) {
179 if (ret < 0) 142 req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
180 return ret; 143 dat = nilfs_bmap_get_dat(bmap);
181 nilfs_direct_commit_insert(direct, &req, key, ptr); 144 }
182 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 145 ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
146 if (!ret) {
147 /* ptr must be a pointer to a buffer head. */
148 bh = (struct buffer_head *)((unsigned long)ptr);
149 set_buffer_nilfs_volatile(bh);
183 150
184 return 0; 151 nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
185} 152 nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
186 153
187static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, 154 if (!nilfs_bmap_dirty(bmap))
188 union nilfs_bmap_ptr_req *req, 155 nilfs_bmap_set_dirty(bmap);
189 __u64 key,
190 struct nilfs_bmap_stats *stats)
191{
192 int ret;
193 156
194 req->bpr_ptr = nilfs_direct_get_ptr(direct, key); 157 if (NILFS_BMAP_USE_VBN(bmap))
195 ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); 158 nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
196 if (!ret)
197 stats->bs_nblocks = 1;
198 return ret;
199}
200 159
201static void nilfs_direct_commit_delete(struct nilfs_direct *direct, 160 nilfs_bmap_add_blocks(bmap, 1);
202 union nilfs_bmap_ptr_req *req, 161 }
203 __u64 key) 162 return ret;
204{
205 nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
206 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
207} 163}
208 164
209static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) 165static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
210{ 166{
211 struct nilfs_direct *direct; 167 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
212 union nilfs_bmap_ptr_req req; 168 union nilfs_bmap_ptr_req req;
213 struct nilfs_bmap_stats stats; 169 struct inode *dat;
214 int ret; 170 int ret;
215 171
216 direct = (struct nilfs_direct *)bmap; 172 if (key > NILFS_DIRECT_KEY_MAX ||
217 if ((key > NILFS_DIRECT_KEY_MAX) ||
218 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) 173 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
219 return -ENOENT; 174 return -ENOENT;
220 175
221 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); 176 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
222 if (ret < 0) 177 req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
223 return ret;
224 nilfs_direct_commit_delete(direct, &req, key);
225 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
226 178
227 return 0; 179 ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
180 if (!ret) {
181 nilfs_bmap_commit_end_ptr(bmap, &req, dat);
182 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
183 nilfs_bmap_sub_blocks(bmap, 1);
184 }
185 return ret;
228} 186}
229 187
230static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) 188static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
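
The rewritten insert and delete paths above drop the prepare_insert/commit_insert helper pairs and branch once on NILFS_BMAP_USE_VBN(): a DAT inode is passed down only when virtual block numbers are in use, and a NULL dat means there is nothing to reserve. A small user-space model of that shape (the dat struct and messages are stand-ins, not the kernel types):

#include <stddef.h>
#include <stdio.h>

struct dat { const char *name; };

/* Stand-ins for nilfs_bmap_prepare_alloc_ptr()/commit_alloc_ptr():
 * with dat == NULL (physical block numbers) they degenerate to no-ops. */
static int prepare_alloc_ptr(struct dat *dat)
{
	if (dat)
		printf("reserve virtual block in %s\n", dat->name);
	return 0;
}

static void commit_alloc_ptr(struct dat *dat)
{
	if (dat)
		printf("commit reservation in %s\n", dat->name);
}

static int insert(int use_vbn)	/* use_vbn models NILFS_BMAP_USE_VBN() */
{
	struct dat d = { "DAT" };
	struct dat *dat = use_vbn ? &d : NULL;
	int ret = prepare_alloc_ptr(dat);

	if (!ret)
		commit_alloc_ptr(dat);
	return ret;
}

int main(void)
{
	return insert(1) || insert(0);	/* VBN path reserves, physical path doesn't */
}
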
@@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
310 return 0; 268 return 0;
311} 269}
312 270
313static int nilfs_direct_propagate_v(struct nilfs_direct *direct, 271static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
314 struct buffer_head *bh) 272 struct buffer_head *bh)
315{ 273{
316 union nilfs_bmap_ptr_req oldreq, newreq; 274 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
275 struct nilfs_palloc_req oldreq, newreq;
276 struct inode *dat;
317 __u64 key; 277 __u64 key;
318 __u64 ptr; 278 __u64 ptr;
319 int ret; 279 int ret;
320 280
321 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); 281 if (!NILFS_BMAP_USE_VBN(bmap))
282 return 0;
283
284 dat = nilfs_bmap_get_dat(bmap);
285 key = nilfs_bmap_data_get_key(bmap, bh);
322 ptr = nilfs_direct_get_ptr(direct, key); 286 ptr = nilfs_direct_get_ptr(direct, key);
323 if (!buffer_nilfs_volatile(bh)) { 287 if (!buffer_nilfs_volatile(bh)) {
324 oldreq.bpr_ptr = ptr; 288 oldreq.pr_entry_nr = ptr;
325 newreq.bpr_ptr = ptr; 289 newreq.pr_entry_nr = ptr;
326 ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, 290 ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
327 &newreq);
328 if (ret < 0) 291 if (ret < 0)
329 return ret; 292 return ret;
330 nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); 293 nilfs_dat_commit_update(dat, &oldreq, &newreq,
294 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
331 set_buffer_nilfs_volatile(bh); 295 set_buffer_nilfs_volatile(bh);
332 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); 296 nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
333 } else 297 } else
334 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); 298 ret = nilfs_dat_mark_dirty(dat, ptr);
335 299
336 return ret; 300 return ret;
337} 301}
338 302
339static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
340 struct buffer_head *bh)
341{
342 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
343
344 return NILFS_BMAP_USE_VBN(bmap) ?
345 nilfs_direct_propagate_v(direct, bh) : 0;
346}
347
348static int nilfs_direct_assign_v(struct nilfs_direct *direct, 303static int nilfs_direct_assign_v(struct nilfs_direct *direct,
349 __u64 key, __u64 ptr, 304 __u64 key, __u64 ptr,
350 struct buffer_head **bh, 305 struct buffer_head **bh,
351 sector_t blocknr, 306 sector_t blocknr,
352 union nilfs_binfo *binfo) 307 union nilfs_binfo *binfo)
353{ 308{
309 struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
354 union nilfs_bmap_ptr_req req; 310 union nilfs_bmap_ptr_req req;
355 int ret; 311 int ret;
356 312
357 req.bpr_ptr = ptr; 313 req.bpr_ptr = ptr;
358 ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); 314 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
359 if (unlikely(ret < 0)) 315 if (!ret) {
360 return ret; 316 nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
361 317 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
362 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 318 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
363 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 319 }
364 320 return ret;
365 return 0;
366} 321}
367 322
368static int nilfs_direct_assign_p(struct nilfs_direct *direct, 323static int nilfs_direct_assign_p(struct nilfs_direct *direct,
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 5d30a35679b5..ecc3ba76db47 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -31,7 +31,6 @@
31#include "mdt.h" 31#include "mdt.h"
32#include "alloc.h" 32#include "alloc.h"
33 33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35 34
36static inline struct nilfs_inode * 35static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) 36nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index fe9d8f2a13f8..807e584b163d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
430 430
431 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 431 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
432 432
433 if (nilfs_read_inode_common(inode, raw_inode)) 433 err = nilfs_read_inode_common(inode, raw_inode);
434 if (err)
434 goto failed_unmap; 435 goto failed_unmap;
435 436
436 if (S_ISREG(inode->i_mode)) { 437 if (S_ISREG(inode->i_mode)) {
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6ea5f872e2de..6572ea4bc4df 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
442 const char *msg; 442 const char *msg;
443 int ret; 443 int ret;
444 444
445 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
446 if (ret < 0) {
447 msg = "cannot read source blocks";
448 goto failed;
449 }
450
451 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); 445 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
452 if (ret < 0) { 446 if (ret < 0) {
453 /* 447 /*
@@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
548 } 542 }
549 } 543 }
550 544
551 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 545 /*
546 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
547 * which operates on an inode list without blocking.
548 * To protect the list from concurrent operations,
549 * nilfs_ioctl_move_blocks() must be an atomic operation.
550 */
551 if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) {
552 ret = -EBUSY;
553 goto out_free;
554 }
555
556 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
557 if (ret < 0)
558 printk(KERN_ERR "NILFS: GC failed during preparation: "
559 "cannot read source blocks: err=%d\n", ret);
560 else
561 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
562
563 clear_nilfs_gc_running(nilfs);
552 564
553 out_free: 565 out_free:
554 while (--n >= 0) 566 while (--n >= 0)
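
The comment explains why the move-blocks phase was pulled inside a THE_NILFS_GC_RUNNING guard: test_and_set_bit() admits exactly one cleaner at a time, and late arrivals bounce with -EBUSY instead of racing on the GC inode list. The same shape, modeled in user space with a C11 atomic_flag in place of the kernel bitops:

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag gc_running = ATOMIC_FLAG_INIT;

static int clean_segments(void)
{
	/* returns the previous value: nonzero means a pass already runs */
	if (atomic_flag_test_and_set(&gc_running))
		return -EBUSY;
	puts("move blocks, then clean segments");
	atomic_flag_clear(&gc_running);	/* clear_nilfs_gc_running() analogue */
	return 0;
}

int main(void)
{
	atomic_flag_test_and_set(&gc_running);	/* pretend GC is already active */
	printf("%d\n", clean_segments());	/* -EBUSY */
	atomic_flag_clear(&gc_running);
	printf("%d\n", clean_segments());	/* 0 */
	return 0;
}
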
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 3d3ddb3f5177..156bf6091a96 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
103 goto failed_unlock; 103 goto failed_unlock;
104 104
105 err = -EEXIST; 105 err = -EEXIST;
106 if (buffer_uptodate(bh) || buffer_mapped(bh)) 106 if (buffer_uptodate(bh))
107 goto failed_bh; 107 goto failed_bh;
108#if 0 108
109 /* The uptodate flag is not protected by the page lock, but
110 the mapped flag is. Thus, we don't have to wait the buffer. */
111 wait_on_buffer(bh); 109 wait_on_buffer(bh);
112 if (buffer_uptodate(bh)) 110 if (buffer_uptodate(bh))
113 goto failed_bh; 111 goto failed_bh;
114#endif
115 112
116 bh->b_bdev = nilfs->ns_bdev; 113 bh->b_bdev = nilfs->ns_bdev;
117 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); 114 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
@@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
139 int mode, struct buffer_head **out_bh) 136 int mode, struct buffer_head **out_bh)
140{ 137{
141 struct buffer_head *bh; 138 struct buffer_head *bh;
142 unsigned long blknum = 0; 139 __u64 blknum = 0;
143 int ret = -ENOMEM; 140 int ret = -ENOMEM;
144 141
145 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); 142 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
@@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
162 unlock_buffer(bh); 159 unlock_buffer(bh);
163 goto out; 160 goto out;
164 } 161 }
165 if (!buffer_mapped(bh)) { /* unused buffer */ 162
166 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, 163 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
167 &blknum); 164 if (unlikely(ret)) {
168 if (unlikely(ret)) { 165 unlock_buffer(bh);
169 unlock_buffer(bh); 166 goto failed_bh;
170 goto failed_bh;
171 }
172 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
173 bh->b_blocknr = blknum;
174 set_buffer_mapped(bh);
175 } 167 }
168 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
169 bh->b_blocknr = (sector_t)blknum;
170 set_buffer_mapped(bh);
176 171
177 bh->b_end_io = end_buffer_read_sync; 172 bh->b_end_io = end_buffer_read_sync;
178 get_bh(bh); 173 get_bh(bh);
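
Two things change in this hunk: the block number becomes a __u64 so it is not silently truncated on large devices, and the buffer is re-mapped from a fresh bmap lookup on every read-out instead of trusting a previously set mapped flag. A compact user-space model of the simplified path (the lookup translation is fake):

#include <stdint.h>
#include <stdio.h>

struct buf { uint64_t blocknr; int mapped; };

static int bmap_lookup(uint64_t key, uint64_t *blknum)
{
	*blknum = key + 1000;	/* fake file-offset -> disk-block translation */
	return 0;
}

static int submit_block(struct buf *bh, uint64_t blkoff)
{
	uint64_t blknum = 0;
	int ret = bmap_lookup(blkoff, &blknum);

	if (ret)
		return ret;
	bh->blocknr = blknum;	/* unconditionally (re)map the buffer */
	bh->mapped = 1;
	return 0;
}

int main(void)
{
	struct buf b = { 0, 0 };

	submit_block(&b, 42);
	printf("mapped to block %llu\n", (unsigned long long)b.blocknr);
	return 0;
}
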
@@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
402 struct inode *inode = container_of(page->mapping, 397 struct inode *inode = container_of(page->mapping,
403 struct inode, i_data); 398 struct inode, i_data);
404 struct super_block *sb = inode->i_sb; 399 struct super_block *sb = inode->i_sb;
400 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
405 struct nilfs_sb_info *writer = NULL; 401 struct nilfs_sb_info *writer = NULL;
406 int err = 0; 402 int err = 0;
407 403
@@ -411,9 +407,12 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
411 if (page->mapping->assoc_mapping) 407 if (page->mapping->assoc_mapping)
412 return 0; /* Do not request flush for shadow page cache */ 408 return 0; /* Do not request flush for shadow page cache */
413 if (!sb) { 409 if (!sb) {
414 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); 410 down_read(&nilfs->ns_writer_sem);
415 if (!writer) 411 writer = nilfs->ns_writer;
412 if (!writer) {
413 up_read(&nilfs->ns_writer_sem);
416 return -EROFS; 414 return -EROFS;
415 }
417 sb = writer->s_super; 416 sb = writer->s_super;
418 } 417 }
419 418
@@ -423,7 +422,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
423 nilfs_flush_segment(sb, inode->i_ino); 422 nilfs_flush_segment(sb, inode->i_ino);
424 423
425 if (writer) 424 if (writer)
426 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); 425 up_read(&nilfs->ns_writer_sem);
427 return err; 426 return err;
428} 427}
429 428
@@ -514,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
514} 513}
515 514
516struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, 515struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
517 ino_t ino, gfp_t gfp_mask) 516 ino_t ino)
518{ 517{
519 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); 518 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
519 NILFS_MDT_GFP);
520 520
521 if (!inode) 521 if (!inode)
522 return NULL; 522 return NULL;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index df683e0bca6a..431599733c9b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); 74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *); 75int nilfs_mdt_fetch_dirty(struct inode *);
76 76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, 77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, 78struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t); 79 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *); 80void nilfs_mdt_destroy(struct inode *);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d80cc71be749..6dc83591d118 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
552 printk(KERN_WARNING 552 printk(KERN_WARNING
553 "NILFS warning: error recovering data block " 553 "NILFS warning: error recovering data block "
554 "(err=%d, ino=%lu, block-offset=%llu)\n", 554 "(err=%d, ino=%lu, block-offset=%llu)\n",
555 err, rb->ino, (unsigned long long)rb->blkoff); 555 err, (unsigned long)rb->ino,
556 (unsigned long long)rb->blkoff);
556 if (!err2) 557 if (!err2)
557 err2 = err; 558 err2 = err;
558 next: 559 next:
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 9e3fe17bb96b..e6d9e37fa241 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316{ 316{
317 struct bio *bio; 317 struct bio *bio;
318 318
319 bio = bio_alloc(GFP_NOWAIT, nr_vecs); 319 bio = bio_alloc(GFP_NOIO, nr_vecs);
320 if (bio == NULL) { 320 if (bio == NULL) {
321 while (!bio && (nr_vecs >>= 1)) 321 while (!bio && (nr_vecs >>= 1))
322 bio = bio_alloc(GFP_NOWAIT, nr_vecs); 322 bio = bio_alloc(GFP_NOIO, nr_vecs);
323 } 323 }
324 if (likely(bio)) { 324 if (likely(bio)) {
325 bio->bi_bdev = sb->s_bdev; 325 bio->bi_bdev = sb->s_bdev;
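
The switch from GFP_NOWAIT to GFP_NOIO makes the allocation more reliable while still avoiding recursion into the I/O path; the surrounding fallback loop, which halves the vector count until an allocation succeeds, stays intact. That loop is self-contained enough to model directly (the size threshold in the fake allocator is arbitrary):

#include <stdio.h>
#include <stdlib.h>

static void *alloc_bio(int nr_vecs)
{
	return nr_vecs > 16 ? NULL : malloc(64);	/* fail above a fake limit */
}

int main(void)
{
	int nr_vecs = 128;
	void *bio = alloc_bio(nr_vecs);

	/* same shape as nilfs_alloc_seg_bio(): halve and retry */
	while (!bio && (nr_vecs >>= 1))
		bio = alloc_bio(nr_vecs);

	if (bio)
		printf("got bio with %d vecs\n", nr_vecs);	/* 16 */
	free(bio);
	return 0;
}
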
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 8b5e4778cf28..683df89dbae5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1859,12 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err)
1859 if (!page) 1859 if (!page)
1860 return; 1860 return;
1861 1861
1862 if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) 1862 if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
1863 /* 1863 /*
1864 * For b-tree node pages, this function may be called twice 1864 * For b-tree node pages, this function may be called twice
1865 * or more because they might be split in a segment. 1865 * or more because they might be split in a segment.
1866 */ 1866 */
1867 if (PageDirty(page)) {
1868 /*
1869 * For pages holding split b-tree node buffers, the
1870 * dirty flag on the buffers may be cleared individually.
1871 * In that case, the page is redirtied once for the
1872 * remaining buffers, and that redirtying must be
1873 * cancelled if all the buffers get cleaned later.
1874 */
1875 lock_page(page);
1876 if (nilfs_page_buffers_clean(page))
1877 __nilfs_clear_page_dirty(page);
1878 unlock_page(page);
1879 }
1867 return; 1880 return;
1881 }
1868 1882
1869 __nilfs_end_page_io(page, err); 1883 __nilfs_end_page_io(page, err);
1870} 1884}
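
The cancellation described in that comment reduces to: if the page-level dirty flag is still set but every buffer on the page has come clean, clear the page flag under the page lock. A toy user-space model with a two-buffer page (locking omitted since nothing here runs concurrently):

#include <stdio.h>

struct page { int dirty; int buffer_dirty[2]; };

static int buffers_clean(const struct page *p)
{
	return !p->buffer_dirty[0] && !p->buffer_dirty[1];
}

static void end_page_io(struct page *p)
{
	/* cancel a stale page-level dirty flag once all buffers are clean */
	if (p->dirty && buffers_clean(p))
		p->dirty = 0;
}

int main(void)
{
	struct page p = { .dirty = 1, .buffer_dirty = { 0, 0 } };

	end_page_io(&p);
	printf("page dirty: %d\n", p.dirty);	/* 0 */
	return 0;
}
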
@@ -2487,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2487 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2501 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2488 nilfs_discontinued(nilfs)) { 2502 nilfs_discontinued(nilfs)) {
2489 down_write(&nilfs->ns_sem); 2503 down_write(&nilfs->ns_sem);
2490 req->sb_err = nilfs_commit_super(sbi, 0); 2504 req->sb_err = nilfs_commit_super(sbi,
2505 nilfs_altsb_need_update(nilfs));
2491 up_write(&nilfs->ns_sem); 2506 up_write(&nilfs->ns_sem);
2492 } 2507 }
2493 } 2508 }
@@ -2675,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg)
2675 } else { 2690 } else {
2676 DEFINE_WAIT(wait); 2691 DEFINE_WAIT(wait);
2677 int should_sleep = 1; 2692 int should_sleep = 1;
2693 struct the_nilfs *nilfs;
2678 2694
2679 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2695 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2680 TASK_INTERRUPTIBLE); 2696 TASK_INTERRUPTIBLE);
@@ -2695,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg)
2695 finish_wait(&sci->sc_wait_daemon, &wait); 2711 finish_wait(&sci->sc_wait_daemon, &wait);
2696 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2712 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2697 time_after_eq(jiffies, sci->sc_timer->expires)); 2713 time_after_eq(jiffies, sci->sc_timer->expires));
2714 nilfs = sci->sc_sbi->s_nilfs;
2715 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
2716 set_nilfs_discontinued(nilfs);
2698 } 2717 }
2699 goto loop; 2718 goto loop;
2700 2719
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2c4d76c3366..0e99e5c0bd0f 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -28,7 +28,6 @@
28#include <linux/nilfs2_fs.h> 28#include <linux/nilfs2_fs.h>
29#include "mdt.h" 29#include "mdt.h"
30 30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32 31
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) 32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{ 33{
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8e2ec43b18f4..55f3d6b60732 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -50,6 +50,8 @@
50#include <linux/writeback.h> 50#include <linux/writeback.h>
51#include <linux/kobject.h> 51#include <linux/kobject.h>
52#include <linux/exportfs.h> 52#include <linux/exportfs.h>
53#include <linux/seq_file.h>
54#include <linux/mount.h>
53#include "nilfs.h" 55#include "nilfs.h"
54#include "mdt.h" 56#include "mdt.h"
55#include "alloc.h" 57#include "alloc.h"
@@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)"); 67 "(NILFS)");
66MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
67 69
68static void nilfs_write_super(struct super_block *sb);
69static int nilfs_remount(struct super_block *sb, int *flags, char *data); 70static int nilfs_remount(struct super_block *sb, int *flags, char *data);
70 71
71/** 72/**
@@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb)
311 312
312 lock_kernel(); 313 lock_kernel();
313 314
314 if (sb->s_dirt)
315 nilfs_write_super(sb);
316
317 nilfs_detach_segment_constructor(sbi); 315 nilfs_detach_segment_constructor(sbi);
318 316
319 if (!(sb->s_flags & MS_RDONLY)) { 317 if (!(sb->s_flags & MS_RDONLY)) {
@@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb)
336 unlock_kernel(); 334 unlock_kernel();
337} 335}
338 336
339/** 337static int nilfs_sync_fs(struct super_block *sb, int wait)
340 * nilfs_write_super - write super block(s) of NILFS
341 * @sb: super_block
342 *
343 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
344 * clears s_dirt. This function is called in the section protected by
345 * lock_super().
346 *
347 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
348 * of the struct the_nilfs. Lock order must be as follows:
349 *
350 * 1. lock_super()
351 * 2. down_write(&nilfs->ns_sem)
352 *
353 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
354 * of the super block (nilfs->ns_sbp[]).
355 *
356 * In most cases, VFS functions call lock_super() before calling these
357 * methods. So we must be careful not to bring on deadlocks when using
358 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
359 *
360 * Note that order of lock_kernel() and lock_super() depends on contexts
361 * of VFS. We should also note that lock_kernel() can be used in its
362 * protective section and only the outermost one has an effect.
363 */
364static void nilfs_write_super(struct super_block *sb)
365{ 338{
366 struct nilfs_sb_info *sbi = NILFS_SB(sb); 339 struct nilfs_sb_info *sbi = NILFS_SB(sb);
367 struct the_nilfs *nilfs = sbi->s_nilfs; 340 struct the_nilfs *nilfs = sbi->s_nilfs;
368
369 down_write(&nilfs->ns_sem);
370 if (!(sb->s_flags & MS_RDONLY)) {
371 struct nilfs_super_block **sbp = nilfs->ns_sbp;
372 u64 t = get_seconds();
373 int dupsb;
374
375 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
376 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
377 up_write(&nilfs->ns_sem);
378 return;
379 }
380 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
381 nilfs_commit_super(sbi, dupsb);
382 }
383 sb->s_dirt = 0;
384 up_write(&nilfs->ns_sem);
385}
386
387static int nilfs_sync_fs(struct super_block *sb, int wait)
388{
389 int err = 0; 341 int err = 0;
390 342
391 nilfs_write_super(sb);
392
393 /* This function is called when super block should be written back */ 343 /* This function is called when super block should be written back */
394 if (wait) 344 if (wait)
395 err = nilfs_construct_segment(sb); 345 err = nilfs_construct_segment(sb);
346
347 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt)
349 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem);
351
396 return err; 352 return err;
397} 353}
398 354
@@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
407 list_add(&sbi->s_list, &nilfs->ns_supers); 363 list_add(&sbi->s_list, &nilfs->ns_supers);
408 up_write(&nilfs->ns_super_sem); 364 up_write(&nilfs->ns_super_sem);
409 365
410 sbi->s_ifile = nilfs_mdt_new( 366 sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
411 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
412 if (!sbi->s_ifile) 367 if (!sbi->s_ifile)
413 return -ENOMEM; 368 return -ENOMEM;
414 369
@@ -416,8 +371,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
416 if (unlikely(err)) 371 if (unlikely(err))
417 goto failed; 372 goto failed;
418 373
374 down_read(&nilfs->ns_segctor_sem);
419 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 375 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
420 &bh_cp); 376 &bh_cp);
377 up_read(&nilfs->ns_segctor_sem);
421 if (unlikely(err)) { 378 if (unlikely(err)) {
422 if (err == -ENOENT || err == -EINVAL) { 379 if (err == -ENOENT || err == -EINVAL) {
423 printk(KERN_ERR 380 printk(KERN_ERR
@@ -527,6 +484,26 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
527 return 0; 484 return 0;
528} 485}
529 486
487static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
488{
489 struct super_block *sb = vfs->mnt_sb;
490 struct nilfs_sb_info *sbi = NILFS_SB(sb);
491
492 if (!nilfs_test_opt(sbi, BARRIER))
493 seq_printf(seq, ",barrier=off");
494 if (nilfs_test_opt(sbi, SNAPSHOT))
495 seq_printf(seq, ",cp=%llu",
496 (unsigned long long int)sbi->s_snapshot_cno);
497 if (nilfs_test_opt(sbi, ERRORS_RO))
498 seq_printf(seq, ",errors=remount-ro");
499 if (nilfs_test_opt(sbi, ERRORS_PANIC))
500 seq_printf(seq, ",errors=panic");
501 if (nilfs_test_opt(sbi, STRICT_ORDER))
502 seq_printf(seq, ",order=strict");
503
504 return 0;
505}
506
530static struct super_operations nilfs_sops = { 507static struct super_operations nilfs_sops = {
531 .alloc_inode = nilfs_alloc_inode, 508 .alloc_inode = nilfs_alloc_inode,
532 .destroy_inode = nilfs_destroy_inode, 509 .destroy_inode = nilfs_destroy_inode,
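
With .show_options wired up, non-default mount options become visible in /proc/mounts. As a purely illustrative example (device, mount point and checkpoint number are invented), a read-only snapshot mount might show up as:

/dev/sdb1 /mnt/snap nilfs2 ro,cp=17,errors=remount-ro 0 0
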
@@ -536,7 +513,7 @@ static struct super_operations nilfs_sops = {
536 /* .drop_inode = nilfs_drop_inode, */ 513 /* .drop_inode = nilfs_drop_inode, */
537 .delete_inode = nilfs_delete_inode, 514 .delete_inode = nilfs_delete_inode,
538 .put_super = nilfs_put_super, 515 .put_super = nilfs_put_super,
539 .write_super = nilfs_write_super, 516 /* .write_super = nilfs_write_super, */
540 .sync_fs = nilfs_sync_fs, 517 .sync_fs = nilfs_sync_fs,
541 /* .write_super_lockfs */ 518 /* .write_super_lockfs */
542 /* .unlockfs */ 519 /* .unlockfs */
@@ -544,7 +521,7 @@ static struct super_operations nilfs_sops = {
544 .remount_fs = nilfs_remount, 521 .remount_fs = nilfs_remount,
545 .clear_inode = nilfs_clear_inode, 522 .clear_inode = nilfs_clear_inode,
546 /* .umount_begin */ 523 /* .umount_begin */
547 /* .show_options */ 524 .show_options = nilfs_show_options
548}; 525};
549 526
550static struct inode * 527static struct inode *
@@ -814,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
814 791
815 if (sb->s_flags & MS_RDONLY) { 792 if (sb->s_flags & MS_RDONLY) {
816 if (nilfs_test_opt(sbi, SNAPSHOT)) { 793 if (nilfs_test_opt(sbi, SNAPSHOT)) {
794 down_read(&nilfs->ns_segctor_sem);
817 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, 795 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
818 sbi->s_snapshot_cno); 796 sbi->s_snapshot_cno);
819 if (err < 0) 797 up_read(&nilfs->ns_segctor_sem);
798 if (err < 0) {
799 if (err == -ENOENT)
800 err = -EINVAL;
820 goto failed_sbi; 801 goto failed_sbi;
802 }
821 if (!err) { 803 if (!err) {
822 printk(KERN_ERR 804 printk(KERN_ERR
823 "NILFS: The specified checkpoint is " 805 "NILFS: The specified checkpoint is "
@@ -1125,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1125 */ 1107 */
1126 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); 1108 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1127 1109
1128 if (!sd.cno)
1129 /* trying to get the latest checkpoint. */
1130 sd.cno = nilfs_last_cno(nilfs);
1131
1132 /* 1110 /*
1133 * Get super block instance holding the nilfs_sb_info struct. 1111 * Get super block instance holding the nilfs_sb_info struct.
1134 * A new instance is allocated if no existing mount is present or 1112 * A new instance is allocated if no existing mount is present or
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8b8889825716..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
68 68
69 nilfs->ns_bdev = bdev; 69 nilfs->ns_bdev = bdev;
70 atomic_set(&nilfs->ns_count, 1); 70 atomic_set(&nilfs->ns_count, 1);
71 atomic_set(&nilfs->ns_writer_refcount, -1);
72 atomic_set(&nilfs->ns_ndirtyblks, 0); 71 atomic_set(&nilfs->ns_ndirtyblks, 0);
73 init_rwsem(&nilfs->ns_sem); 72 init_rwsem(&nilfs->ns_sem);
74 init_rwsem(&nilfs->ns_super_sem); 73 init_rwsem(&nilfs->ns_super_sem);
75 mutex_init(&nilfs->ns_mount_mutex); 74 mutex_init(&nilfs->ns_mount_mutex);
76 mutex_init(&nilfs->ns_writer_mutex); 75 init_rwsem(&nilfs->ns_writer_sem);
77 INIT_LIST_HEAD(&nilfs->ns_list); 76 INIT_LIST_HEAD(&nilfs->ns_list);
78 INIT_LIST_HEAD(&nilfs->ns_supers); 77 INIT_LIST_HEAD(&nilfs->ns_supers);
79 spin_lock_init(&nilfs->ns_last_segment_lock); 78 spin_lock_init(&nilfs->ns_last_segment_lock);
@@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
188 inode_size = nilfs->ns_inode_size; 187 inode_size = nilfs->ns_inode_size;
189 188
190 err = -ENOMEM; 189 err = -ENOMEM;
191 nilfs->ns_dat = nilfs_mdt_new( 190 nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
192 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
193 if (unlikely(!nilfs->ns_dat)) 191 if (unlikely(!nilfs->ns_dat))
194 goto failed; 192 goto failed;
195 193
196 nilfs->ns_gc_dat = nilfs_mdt_new( 194 nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
197 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
198 if (unlikely(!nilfs->ns_gc_dat)) 195 if (unlikely(!nilfs->ns_gc_dat))
199 goto failed_dat; 196 goto failed_dat;
200 197
201 nilfs->ns_cpfile = nilfs_mdt_new( 198 nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
202 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
203 if (unlikely(!nilfs->ns_cpfile)) 199 if (unlikely(!nilfs->ns_cpfile))
204 goto failed_gc_dat; 200 goto failed_gc_dat;
205 201
206 nilfs->ns_sufile = nilfs_mdt_new( 202 nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
207 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
208 if (unlikely(!nilfs->ns_sufile)) 203 if (unlikely(!nilfs->ns_sufile))
209 goto failed_cpfile; 204 goto failed_cpfile;
210 205
@@ -596,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
596 591
597 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 592 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
598 593
599 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; 594 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
600 if (!bdi)
601 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
602 nilfs->ns_bdi = bdi ? : &default_backing_dev_info; 595 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
603 596
604 /* Finding last segment */ 597 /* Finding last segment */
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e8adbffc626f..20abd55881e0 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -37,6 +37,7 @@ enum {
37 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and 37 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
38 the latest checkpoint was loaded */ 38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */
40}; 41};
41 42
42/** 43/**
@@ -50,8 +51,7 @@ enum {
50 * @ns_sem: semaphore for shared states 51 * @ns_sem: semaphore for shared states
51 * @ns_super_sem: semaphore for global operations across super block instances 52 * @ns_super_sem: semaphore for global operations across super block instances
52 * @ns_mount_mutex: mutex protecting mount process of nilfs 53 * @ns_mount_mutex: mutex protecting mount process of nilfs
53 * @ns_writer_mutex: mutex protecting ns_writer attach/detach 54 * @ns_writer_sem: semaphore protecting ns_writer attach/detach
54 * @ns_writer_refcount: number of referrers on ns_writer
55 * @ns_current: back pointer to current mount 55 * @ns_current: back pointer to current mount
56 * @ns_sbh: buffer heads of on-disk super blocks 56 * @ns_sbh: buffer heads of on-disk super blocks
57 * @ns_sbp: pointers to super block data 57 * @ns_sbp: pointers to super block data
@@ -100,8 +100,7 @@ struct the_nilfs {
100 struct rw_semaphore ns_sem; 100 struct rw_semaphore ns_sem;
101 struct rw_semaphore ns_super_sem; 101 struct rw_semaphore ns_super_sem;
102 struct mutex ns_mount_mutex; 102 struct mutex ns_mount_mutex;
103 struct mutex ns_writer_mutex; 103 struct rw_semaphore ns_writer_sem;
104 atomic_t ns_writer_refcount;
105 104
106 /* 105 /*
107 * components protected by ns_super_sem 106 * components protected by ns_super_sem
@@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
197THE_NILFS_FNS(INIT, init) 196THE_NILFS_FNS(INIT, init)
198THE_NILFS_FNS(LOADED, loaded) 197THE_NILFS_FNS(LOADED, loaded)
199THE_NILFS_FNS(DISCONTINUED, discontinued) 198THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running)
200 200
201/* Minimum interval of periodical update of superblocks (in seconds) */ 201/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 202#define NILFS_SB_FREQ 10
203#define NILFS_ALTSB_FREQ 60 /* spare superblock */ 203#define NILFS_ALTSB_FREQ 60 /* spare superblock */
204 204
205static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
206{
207 u64 t = get_seconds();
208 return t < nilfs->ns_sbwtime[0] ||
209 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
210}
211
212static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
213{
214 u64 t = get_seconds();
215 struct nilfs_super_block **sbp = nilfs->ns_sbp;
216 return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
217}
218
205void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 219void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
206struct the_nilfs *find_or_create_nilfs(struct block_device *); 220struct the_nilfs *find_or_create_nilfs(struct block_device *);
207void put_nilfs(struct the_nilfs *); 221void put_nilfs(struct the_nilfs *);
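
The two inline helpers turn the timing policy that used to live in nilfs_write_super() into reusable predicates, and the arithmetic deserves a worked example. The sketch below reproduces nilfs_sb_need_update() in user space with NILFS_SB_FREQ = 10; note the now < wtime branch, which covers a clock that jumped backwards:

#include <stdio.h>

#define SB_FREQ 10	/* seconds, as in NILFS_SB_FREQ */

static int sb_need_update(unsigned long long now, unsigned long long wtime)
{
	return now < wtime || now > wtime + SB_FREQ;
}

int main(void)
{
	unsigned long long written = 1000;	/* last superblock write time */

	printf("%d\n", sb_need_update(1005, written));	/* 0: inside the window */
	printf("%d\n", sb_need_update(1011, written));	/* 1: stale, rewrite */
	printf("%d\n", sb_need_update(999, written));	/* 1: clock went backwards */
	return 0;
}
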
@@ -221,39 +235,26 @@ static inline void get_nilfs(struct the_nilfs *nilfs)
221 atomic_inc(&nilfs->ns_count); 235 atomic_inc(&nilfs->ns_count);
222} 236}
223 237
224static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
225{
226 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
227 mutex_lock(&nilfs->ns_writer_mutex);
228 return nilfs->ns_writer;
229}
230
231static inline void nilfs_put_writer(struct the_nilfs *nilfs)
232{
233 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
234 mutex_unlock(&nilfs->ns_writer_mutex);
235}
236
237static inline void 238static inline void
238nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 239nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
239{ 240{
240 mutex_lock(&nilfs->ns_writer_mutex); 241 down_write(&nilfs->ns_writer_sem);
241 nilfs->ns_writer = sbi; 242 nilfs->ns_writer = sbi;
242 mutex_unlock(&nilfs->ns_writer_mutex); 243 up_write(&nilfs->ns_writer_sem);
243} 244}
244 245
245static inline void 246static inline void
246nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 247nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
247{ 248{
248 mutex_lock(&nilfs->ns_writer_mutex); 249 down_write(&nilfs->ns_writer_sem);
249 if (sbi == nilfs->ns_writer) 250 if (sbi == nilfs->ns_writer)
250 nilfs->ns_writer = NULL; 251 nilfs->ns_writer = NULL;
251 mutex_unlock(&nilfs->ns_writer_mutex); 252 up_write(&nilfs->ns_writer_sem);
252} 253}
253 254
254static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) 255static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
255{ 256{
256 if (!atomic_dec_and_test(&sbi->s_count)) 257 if (atomic_dec_and_test(&sbi->s_count))
257 kfree(sbi); 258 kfree(sbi);
258} 259}
259 260
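
Two independent fixes meet in this hunk: the hand-rolled refcount-plus-mutex around ns_writer becomes a plain read-write semaphore, and nilfs_put_sbinfo() loses a negation so the sbinfo is freed when the count reaches zero rather than while it is still referenced. The locking half maps naturally onto a pthread rwlock in user space (-EROFS stands in for the kernel's error return):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t writer_sem = PTHREAD_RWLOCK_INITIALIZER;
static void *ns_writer;

static void attach_writer(void *sbi)
{
	pthread_rwlock_wrlock(&writer_sem);	/* exclusive, like down_write() */
	ns_writer = sbi;
	pthread_rwlock_unlock(&writer_sem);
}

static int flush_with_writer(void)
{
	int ret;

	pthread_rwlock_rdlock(&writer_sem);	/* shared: many flushers at once */
	ret = ns_writer ? 0 : -EROFS;
	pthread_rwlock_unlock(&writer_sem);
	return ret;
}

int main(void)
{
	int dummy;

	printf("%d\n", flush_with_writer());	/* -EROFS: no writer attached */
	attach_writer(&dummy);
	printf("%d\n", flush_with_writer());	/* 0 */
	return 0;
}
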
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 31dac7e3b0f1..dffbb0911d02 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,15 +1,5 @@
1config FSNOTIFY 1config FSNOTIFY
2 bool "Filesystem notification backend" 2 def_bool n
3 default y
4 ---help---
5 fsnotify is a backend for filesystem notification. fsnotify does
6 not provide any userspace interface but does provide the basis
7 needed for other notification schemes such as dnotify, inotify,
8 and fanotify.
9
10 Say Y here to enable fsnotify support.
11
12 If unsure, say Y.
13 3
14source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
15source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 904ff8d5405a..f9c1ca139d8f 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,6 +1,6 @@
1config DNOTIFY 1config DNOTIFY
2 bool "Dnotify support" 2 bool "Dnotify support"
3 depends on FSNOTIFY 3 select FSNOTIFY
4 default y 4 default y
5 help 5 help
6 Dnotify is a directory-based per-fd file change notification system 6 Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ec2f7bd76818..037e878e03fc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
159 if (!group->ops->should_send_event(group, to_tell, mask)) 159 if (!group->ops->should_send_event(group, to_tell, mask))
160 continue; 160 continue;
161 if (!event) { 161 if (!event) {
162 event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); 162 event = fsnotify_create_event(to_tell, mask, data,
163 data_is, file_name, cookie,
164 GFP_KERNEL);
163 /* shit, we OOM'd and now we can't tell, maybe 165 /* shit, we OOM'd and now we can't tell, maybe
164 * someday someone else will want to do something 166 * someday someone else will want to do something
165 * here */ 167 * here */
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 5356884289a1..3e56dbffe729 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,7 +15,7 @@ config INOTIFY
15 15
16config INOTIFY_USER 16config INOTIFY_USER
17 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
18 depends on FSNOTIFY 18 select FSNOTIFY
19 default y 19 default y
20 ---help--- 20 ---help---
21 Say Y here to enable inotify support for userspace, including the 21 Say Y here to enable inotify support for userspace, including the
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 47cd258fd24d..c9ee67b442e1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
62 event_priv->wd = wd; 62 event_priv->wd = wd;
63 63
64 ret = fsnotify_add_notify_event(group, event, fsn_event_priv); 64 ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
65 /* EEXIST is not an error */ 65 if (ret) {
66 if (ret == -EEXIST)
67 ret = 0;
68
69 /* did event_priv get attached? */
70 if (list_empty(&fsn_event_priv->event_list))
71 inotify_free_event_priv(fsn_event_priv); 66 inotify_free_event_priv(fsn_event_priv);
67 /* EEXIST says we tail matched, EOVERFLOW isn't something
68 * to report up the stack. */
69 if ((ret == -EEXIST) ||
70 (ret == -EOVERFLOW))
71 ret = 0;
72 }
72 73
73 /* 74 /*
74 * If we hold the entry until after the event is on the queue 75 * If we hold the entry until after the event is on the queue
@@ -104,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
104 return send; 105 return send;
105} 106}
106 107
108/*
109 * This is NEVER supposed to be called. Inotify marks should either have been
110 * removed from the idr when the watch was removed or in the
111 * fsnotify_destroy_mark_by_group() call when the inotify instance was being
112 * torn down. This is only called if the idr is about to be freed but there
113 * are still marks in it.
114 */
107static int idr_callback(int id, void *p, void *data) 115static int idr_callback(int id, void *p, void *data)
108{ 116{
109 BUG(); 117 struct fsnotify_mark_entry *entry;
118 struct inotify_inode_mark_entry *ientry;
119 static bool warned = false;
120
121 if (warned)
122 return 0;
123
124 warned = true;
125 entry = p;
126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
127
128 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in "
129 "idr. Probably leaking memory\n", id, p, data);
130
131 /*
132 * I'm taking the liberty of assuming that the mark in question is a
133 * valid address and I'm dereferencing it. This might help to figure
134 * out why we got here and the panic is no worse than the original
135 * BUG() that was here.
136 */
137 if (entry)
138 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
139 entry->group, entry->inode, ientry->wd);
110 return 0; 140 return 0;
111} 141}
112 142
113static void inotify_free_group_priv(struct fsnotify_group *group) 143static void inotify_free_group_priv(struct fsnotify_group *group)
114{ 144{
115 /* ideally the idr is empty and we won't hit the BUG in the callback */ 145 /* ideally the idr is empty and we won't hit the BUG in the callback */
116 idr_for_each(&group->inotify_data.idr, idr_callback, NULL); 146 idr_for_each(&group->inotify_data.idr, idr_callback, group);
117 idr_remove_all(&group->inotify_data.idr); 147 idr_remove_all(&group->inotify_data.idr);
118 idr_destroy(&group->inotify_data.idr); 148 idr_destroy(&group->inotify_data.idr);
119} 149}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ff27a2965844..dcd2040d330c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -47,9 +47,6 @@
47 47
48static struct vfsmount *inotify_mnt __read_mostly; 48static struct vfsmount *inotify_mnt __read_mostly;
49 49
50/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */
51static struct inotify_event nul_inotify_event;
52
53/* these are configurable via /proc/sys/fs/inotify/ */ 50/* these are configurable via /proc/sys/fs/inotify/ */
54static int inotify_max_user_instances __read_mostly; 51static int inotify_max_user_instances __read_mostly;
55static int inotify_max_queued_events __read_mostly; 52static int inotify_max_queued_events __read_mostly;
@@ -57,7 +54,6 @@ int inotify_max_user_watches __read_mostly;
57 54
58static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 55static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
59struct kmem_cache *event_priv_cachep __read_mostly; 56struct kmem_cache *event_priv_cachep __read_mostly;
60static struct fsnotify_event *inotify_ignored_event;
61 57
62/* 58/*
63 * When inotify registers a new group it increments this and uses that 59 * When inotify registers a new group it increments this and uses that
@@ -158,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
158 154
159 event = fsnotify_peek_notify_event(group); 155 event = fsnotify_peek_notify_event(group);
160 156
161 event_size += roundup(event->name_len, event_size); 157 if (event->name_len)
158 event_size += roundup(event->name_len + 1, event_size);
162 159
163 if (event_size > count) 160 if (event_size > count)
164 return ERR_PTR(-EINVAL); 161 return ERR_PTR(-EINVAL);
@@ -184,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
184 struct fsnotify_event_private_data *fsn_priv; 181 struct fsnotify_event_private_data *fsn_priv;
185 struct inotify_event_private_data *priv; 182 struct inotify_event_private_data *priv;
186 size_t event_size = sizeof(struct inotify_event); 183 size_t event_size = sizeof(struct inotify_event);
187 size_t name_len; 184 size_t name_len = 0;
188 185
189 /* we get the inotify watch descriptor from the event private data */ 186 /* we get the inotify watch descriptor from the event private data */
190 spin_lock(&event->lock); 187 spin_lock(&event->lock);
@@ -200,8 +197,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
200 inotify_free_event_priv(fsn_priv); 197 inotify_free_event_priv(fsn_priv);
201 } 198 }
202 199
203 /* round up event->name_len so it is a multiple of event_size */ 200 /*
204 name_len = roundup(event->name_len, event_size); 201 * round up event->name_len so it is a multiple of event_size
202 * plus an extra byte for the terminating '\0'.
203 */
204 if (event->name_len)
205 name_len = roundup(event->name_len + 1, event_size);
205 inotify_event.len = name_len; 206 inotify_event.len = name_len;
206 207
207 inotify_event.mask = inotify_mask_to_arg(event->mask); 208 inotify_event.mask = inotify_mask_to_arg(event->mask);
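
The old code rounded event->name_len itself, so a name whose length was already a multiple of sizeof(struct inotify_event) got a slot with no room for the terminating '\0'; the fix counts the NUL byte and skips padding entirely for nameless events. Worked numbers, assuming a 16-byte inotify_event as on common ABIs (the struct size is the only assumption here):

#include <stdio.h>

#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	unsigned event_size = 16;	/* stand-in for sizeof(struct inotify_event) */
	unsigned name_len;

	name_len = 0;	/* no name: new code adds no padding at all */
	printf("%u\n", name_len ? roundup(name_len + 1, event_size) : 0);	/* 0 */

	name_len = 15;	/* 15-byte name: 15 + 1 rounds to one 16-byte slot */
	printf("%u\n", roundup(name_len + 1, event_size));			/* 16 */

	name_len = 16;	/* old roundup(16, 16) == 16 left no room for the
			 * '\0'; counting it forces a second slot */
	printf("%u\n", roundup(name_len + 1, event_size));			/* 32 */
	return 0;
}
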
@@ -225,8 +226,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
225 return -EFAULT; 226 return -EFAULT;
226 buf += event->name_len; 227 buf += event->name_len;
227 228
228 /* fill userspace with 0's from nul_inotify_event */ 229 /* fill userspace with 0's */
229 if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) 230 if (clear_user(buf, len_to_zero))
230 return -EFAULT; 231 return -EFAULT;
231 buf += len_to_zero; 232 buf += len_to_zero;
232 event_size += name_len; 233 event_size += name_len;
@@ -327,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
327 list_for_each_entry(holder, &group->notification_list, event_list) { 328 list_for_each_entry(holder, &group->notification_list, event_list) {
328 event = holder->event; 329 event = holder->event;
329 send_len += sizeof(struct inotify_event); 330 send_len += sizeof(struct inotify_event);
330 send_len += roundup(event->name_len, 331 if (event->name_len)
331 sizeof(struct inotify_event)); 332 send_len += roundup(event->name_len + 1,
333 sizeof(struct inotify_event));
332 } 334 }
333 mutex_unlock(&group->notification_mutex); 335 mutex_unlock(&group->notification_mutex);
334 ret = put_user(send_len, (int __user *) p); 336 ret = put_user(send_len, (int __user *) p);
@@ -366,20 +368,71 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
366} 368}
367 369
368/* 370/*
369 * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the 371 * Remove the mark from the idr (if present) and drop the reference
370 * internal reference held on the mark because it is in the idr. 372 * on the mark because it was in the idr.
373 */
374static void inotify_remove_from_idr(struct fsnotify_group *group,
375 struct inotify_inode_mark_entry *ientry)
376{
377 struct idr *idr;
378 struct fsnotify_mark_entry *entry;
379 struct inotify_inode_mark_entry *found_ientry;
380 int wd;
381
382 spin_lock(&group->inotify_data.idr_lock);
383 idr = &group->inotify_data.idr;
384 wd = ientry->wd;
385
386 if (wd == -1)
387 goto out;
388
389 entry = idr_find(&group->inotify_data.idr, wd);
390 if (unlikely(!entry))
391 goto out;
392
393 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
394 if (unlikely(found_ientry != ientry)) {
395 /* We found an entry in the idr with the right wd, but it's
396 * not the entry we were told to remove. eparis seriously
397 * fucked up somewhere. */
398 WARN_ON(1);
399 ientry->wd = -1;
400 goto out;
401 }
402
403 /* One ref for being in the idr, one ref held by the caller */
404 BUG_ON(atomic_read(&entry->refcnt) < 2);
405
406 idr_remove(idr, wd);
407 ientry->wd = -1;
408
409 /* removed from the idr, drop that ref */
410 fsnotify_put_mark(entry);
411out:
412 spin_unlock(&group->inotify_data.idr_lock);
413}
414
415/*
416 * Send IN_IGNORED for this wd, remove this wd from the idr.
371 */ 417 */
372void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 418void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
373 struct fsnotify_group *group) 419 struct fsnotify_group *group)
374{ 420{
375 struct inotify_inode_mark_entry *ientry; 421 struct inotify_inode_mark_entry *ientry;
422 struct fsnotify_event *ignored_event;
376 struct inotify_event_private_data *event_priv; 423 struct inotify_event_private_data *event_priv;
377 struct fsnotify_event_private_data *fsn_event_priv; 424 struct fsnotify_event_private_data *fsn_event_priv;
378 struct idr *idr; 425 int ret;
426
427 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
428 FSNOTIFY_EVENT_NONE, NULL, 0,
429 GFP_NOFS);
430 if (!ignored_event)
431 return;
379 432
380 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 433 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
381 434
382 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 435 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
383 if (unlikely(!event_priv)) 436 if (unlikely(!event_priv))
384 goto skip_send_ignore; 437 goto skip_send_ignore;
385 438
@@ -388,22 +441,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	fsn_event_priv->group = group;
 	event_priv->wd = ientry->wd;

-	fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);
-
-	/* did the private data get added? */
-	if (list_empty(&fsn_event_priv->event_list))
+	ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
+	if (ret)
 		inotify_free_event_priv(fsn_event_priv);

 skip_send_ignore:

+	/* matches the reference taken when the event was created */
+	fsnotify_put_event(ignored_event);
+
 	/* remove this entry from the idr */
-	spin_lock(&group->inotify_data.idr_lock);
-	idr = &group->inotify_data.idr;
-	idr_remove(idr, ientry->wd);
-	spin_unlock(&group->inotify_data.idr_lock);
+	inotify_remove_from_idr(group, ientry);

-	/* removed from idr, drop that reference */
-	fsnotify_put_mark(entry);
+	atomic_dec(&group->inotify_data.user->inotify_watches);
 }

 /* ding dong the mark is dead */
@@ -414,67 +464,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry)
 	kmem_cache_free(inotify_inode_mark_cachep, ientry);
 }

-static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+static int inotify_update_existing_watch(struct fsnotify_group *group,
+					 struct inode *inode,
+					 u32 arg)
 {
-	struct fsnotify_mark_entry *entry = NULL;
+	struct fsnotify_mark_entry *entry;
 	struct inotify_inode_mark_entry *ientry;
-	int ret = 0;
-	int add = (arg & IN_MASK_ADD);
-	__u32 mask;
 	__u32 old_mask, new_mask;
+	__u32 mask;
+	int add = (arg & IN_MASK_ADD);
+	int ret;

 	/* don't allow invalid bits: we don't want flags set */
 	mask = inotify_arg_to_mask(arg);
 	if (unlikely(!mask))
 		return -EINVAL;

-	ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
-	if (unlikely(!ientry))
-		return -ENOMEM;
-	/* we set the mask at the end after attaching it */
-	fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
-	ientry->wd = 0;
-
-find_entry:
 	spin_lock(&inode->i_lock);
 	entry = fsnotify_find_mark_entry(group, inode);
 	spin_unlock(&inode->i_lock);
-	if (entry) {
-		kmem_cache_free(inotify_inode_mark_cachep, ientry);
-		ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
-	} else {
-		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
-			ret = -ENOSPC;
-			goto out_err;
-		}
-
-		ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
-		if (ret == -EEXIST)
-			goto find_entry;
-		else if (ret)
-			goto out_err;
+	if (!entry)
+		return -ENOENT;

-		entry = &ientry->fsn_entry;
-retry:
-		ret = -ENOMEM;
-		if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
-			goto out_err;
-
-		spin_lock(&group->inotify_data.idr_lock);
-		/* if entry is added to the idr we keep the reference obtained
-		 * through fsnotify_mark_add.  remember to drop this reference
-		 * when entry is removed from idr */
-		ret = idr_get_new_above(&group->inotify_data.idr, entry,
-					++group->inotify_data.last_wd,
-					&ientry->wd);
-		spin_unlock(&group->inotify_data.idr_lock);
-		if (ret) {
-			if (ret == -EAGAIN)
-				goto retry;
-			goto out_err;
-		}
-		atomic_inc(&group->inotify_data.user->inotify_watches);
-	}
+	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);

 	spin_lock(&entry->lock);

@@ -506,14 +518,108 @@ retry:
 		fsnotify_recalc_group_mask(group);
 	}

-	return ientry->wd;
+	/* return the wd */
+	ret = ientry->wd;

-out_err:
-	/* see this isn't supposed to happen,  just kill the watch */
-	if (entry) {
-		fsnotify_destroy_mark_by_entry(entry);
-		fsnotify_put_mark(entry);
+	/* match the get from fsnotify_find_mark_entry() */
+	fsnotify_put_mark(entry);
+
+	return ret;
+}
+
+static int inotify_new_watch(struct fsnotify_group *group,
+			     struct inode *inode,
+			     u32 arg)
+{
+	struct inotify_inode_mark_entry *tmp_ientry;
+	__u32 mask;
+	int ret;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask = inotify_arg_to_mask(arg);
+	if (unlikely(!mask))
+		return -EINVAL;
+
+	tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+	if (unlikely(!tmp_ientry))
+		return -ENOMEM;
+
+	fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
+	tmp_ientry->fsn_entry.mask = mask;
+	tmp_ientry->wd = -1;
+
+	ret = -ENOSPC;
+	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
+		goto out_err;
+retry:
+	ret = -ENOMEM;
+	if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
+		goto out_err;
+
+	spin_lock(&group->inotify_data.idr_lock);
+	ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
+				group->inotify_data.last_wd,
+				&tmp_ientry->wd);
+	spin_unlock(&group->inotify_data.idr_lock);
+	if (ret) {
+		/* idr was out of memory allocate and try again */
+		if (ret == -EAGAIN)
+			goto retry;
+		goto out_err;
+	}
+
+	/* we put the mark on the idr, take a reference */
+	fsnotify_get_mark(&tmp_ientry->fsn_entry);
+
+	/* we are on the idr, now get on the inode */
+	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+	if (ret) {
+		/* we failed to get on the inode, get off the idr */
+		inotify_remove_from_idr(group, tmp_ientry);
+		goto out_err;
 	}
+
+	/* update the idr hint, who cares about races, it's just a hint */
+	group->inotify_data.last_wd = tmp_ientry->wd;
+
+	/* increment the number of watches the user has */
+	atomic_inc(&group->inotify_data.user->inotify_watches);
+
+	/* return the watch descriptor for this new entry */
+	ret = tmp_ientry->wd;
+
+	/* match the ref from fsnotify_init_markentry() */
+	fsnotify_put_mark(&tmp_ientry->fsn_entry);
+
+	/* if this mark added a new event update the group mask */
+	if (mask & ~group->mask)
+		fsnotify_recalc_group_mask(group);
+
+out_err:
+	if (ret < 0)
+		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
+
+	return ret;
+}
+
+static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+{
+	int ret = 0;
+
+retry:
+	/* try to update and existing watch with the new arg */
+	ret = inotify_update_existing_watch(group, inode, arg);
+	/* no mark present, try to add a new one */
+	if (ret == -ENOENT)
+		ret = inotify_new_watch(group, inode, arg);
+	/*
+	 * inotify_new_watch could race with another thread which did an
+	 * inotify_new_watch between the update_existing and the add watch
+	 * here, go back and try to update an existing mark again.
+	 */
+	if (ret == -EEXIST)
+		goto retry;
+
 	return ret;
 }

@@ -532,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign

 	spin_lock_init(&group->inotify_data.idr_lock);
 	idr_init(&group->inotify_data.idr);
-	group->inotify_data.last_wd = 0;
+	group->inotify_data.last_wd = 1;
 	group->inotify_data.user = user;
 	group->inotify_data.fa = NULL;

@@ -721,9 +827,6 @@ static int __init inotify_user_setup(void)

 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
 	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
-	inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
-	if (!inotify_ignored_event)
-		panic("unable to allocate the inotify ignored event\n");

 	inotify_max_queued_events = 16384;
 	inotify_max_user_instances = 128;
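
Taken together, the inotify_user.c changes above shape what a reader sees on the fd: each event carries a len field that already includes the NUL padding, and a watch torn down through the new idr path still delivers IN_IGNORED. A minimal consumer loop, as a userspace sketch (the /tmp watch path is an arbitrary example):

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/inotify.h>

    int main(void)
    {
        char buf[4096];
        int fd = inotify_init();

        if (fd < 0)
            return 1;
        inotify_add_watch(fd, "/tmp", IN_ALL_EVENTS);
        for (;;) {
            ssize_t n = read(fd, buf, sizeof(buf));
            char *p = buf;

            if (n <= 0)
                break;
            while (p < buf + n) {
                struct inotify_event *ev = (struct inotify_event *)p;

                if (ev->mask & IN_IGNORED)
                    printf("wd %d removed\n", ev->wd);
                else
                    printf("wd %d mask %#x name '%s'\n", ev->wd,
                           ev->mask, ev->len ? ev->name : "");
                /* ev->len already includes the NUL padding */
                p += sizeof(*ev) + ev->len;
            }
        }
        close(fd);
        return 0;
    }
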
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 959b73e756fd..3816d5750dd5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -136,18 +136,28 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 {
 	if ((old->mask == new->mask) &&
 	    (old->to_tell == new->to_tell) &&
-	    (old->data_type == new->data_type)) {
+	    (old->data_type == new->data_type) &&
+	    (old->name_len == new->name_len)) {
 		switch (old->data_type) {
 		case (FSNOTIFY_EVENT_INODE):
-			if (old->inode == new->inode)
+			/* remember, after old was put on the wait_q we aren't
+			 * allowed to look at the inode any more, only thing
+			 * left to check was if the file_name is the same */
+			if (old->name_len &&
+			    !strcmp(old->file_name, new->file_name))
 				return true;
 			break;
 		case (FSNOTIFY_EVENT_PATH):
 			if ((old->path.mnt == new->path.mnt) &&
 			    (old->path.dentry == new->path.dentry))
 				return true;
+			break;
 		case (FSNOTIFY_EVENT_NONE):
-			return true;
+			if (old->mask & FS_Q_OVERFLOW)
+				return true;
+			else if (old->mask & FS_IN_IGNORED)
+				return false;
+			return false;
 		};
 	}
 	return false;
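
Because event_compare() now also matches on name_len and file_name, two identical back-to-back events can still be coalesced into one queue entry. A timing-dependent userspace sketch, not part of the patch (merging is best-effort, so the byte count read may vary):

    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <sys/inotify.h>

    int main(void)
    {
        char buf[4096];
        int ifd = inotify_init();
        int fd = open("/tmp/demo", O_CREAT | O_WRONLY, 0644);
        ssize_t n;

        if (ifd < 0 || fd < 0)
            return 1;
        inotify_add_watch(ifd, "/tmp/demo", IN_MODIFY);
        write(fd, "a", 1);
        write(fd, "a", 1);      /* identical event, likely merged with the first */
        n = read(ifd, buf, sizeof(buf));
        printf("read %zd bytes of events (a single merged IN_MODIFY expected)\n", n);
        return 0;
    }
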
@@ -165,9 +175,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
 	struct list_head *list = &group->notification_list;
 	struct fsnotify_event_holder *last_holder;
 	struct fsnotify_event *last_event;
-
-	/* easy to tell if priv was attached to the event */
-	INIT_LIST_HEAD(&priv->event_list);
+	int ret = 0;

 	/*
 	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -188,6 +196,7 @@ alloc_holder:

 	if (group->q_len >= group->max_events) {
 		event = &q_overflow_event;
+		ret = -EOVERFLOW;
 		/* sorry, no private data on the overflow event */
 		priv = NULL;
 	}
@@ -229,7 +238,7 @@ alloc_holder:
 	mutex_unlock(&group->notification_mutex);

 	wake_up(&group->notification_waitq);
-	return 0;
+	return ret;
 }

 /*
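
The new -EOVERFLOW return surfaces to userspace as a single IN_Q_OVERFLOW event once the queue is full; a consumer has to treat it as "events were lost" and resynchronize. A minimal handler sketch, not part of the patch:

    #include <stdio.h>
    #include <sys/inotify.h>

    /* Sketch of how a consumer reacts to the overflow marker in the stream. */
    static int handle_event(const struct inotify_event *ev)
    {
        if (ev->mask & IN_Q_OVERFLOW) {
            fprintf(stderr, "inotify queue overflowed, events were dropped\n");
            return -1;      /* caller should rescan watched paths */
        }
        /* ... handle ordinary events here ... */
        return 0;
    }

    int main(void)
    {
        /* exercise the handler with a fabricated overflow event */
        struct inotify_event ev = { .wd = -1, .mask = IN_Q_OVERFLOW };

        return handle_event(&ev) ? 1 : 0;
    }
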
@@ -339,18 +348,19 @@ static void initialize_event(struct fsnotify_event *event)
 * @name the filename, if available
 */
struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
-					     int data_type, const char *name, u32 cookie)
+					     int data_type, const char *name, u32 cookie,
+					     gfp_t gfp)
 {
 	struct fsnotify_event *event;

-	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+	event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
 	if (!event)
 		return NULL;

 	initialize_event(event);

 	if (name) {
-		event->file_name = kstrdup(name, GFP_KERNEL);
+		event->file_name = kstrdup(name, gfp);
 		if (!event->file_name) {
 			kmem_cache_free(fsnotify_event_cachep, event);
 			return NULL;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3140a4429af1..4350d4993b18 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2076,14 +2076,6 @@ err_out:
 	*ppos = pos;
 	if (cached_page)
 		page_cache_release(cached_page);
-	/* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
-	if (likely(!status)) {
-		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
-			if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
-				status = generic_osync_inode(vi, mapping,
-						OSYNC_METADATA|OSYNC_DATA);
-		}
-	}
 	pagevec_lru_add_file(&lru_pvec);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 		   written ? "written" : "status", (unsigned long)written,
@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	mutex_lock(&inode->i_mutex);
 	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err = sync_page_range(inode, mapping, pos, ret);
+	if (ret > 0) {
+		int err = generic_write_sync(file, pos, ret);
 		if (err < 0)
 			ret = err;
 	}
@@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
 	if (ret == -EIOCBQUEUED)
 		ret = wait_on_sync_kiocb(&kiocb);
 	mutex_unlock(&inode->i_mutex);
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err = sync_page_range(inode, mapping, *ppos - ret, ret);
+	if (ret > 0) {
+		int err = generic_write_sync(file, *ppos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 23bf68453d7d..1caa0ef0b2bb 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -384,13 +384,12 @@ unm_err_out:
 * it is dirty in the inode meta data rather than the data page cache of the
 * inode, and thus there are no data pages that need writing out.  Therefore, a
 * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
- * ensure ->write_inode is called from generic_osync_inode() and this needs to
- * happen or the file data would not necessarily hit the device synchronously,
- * even though the vfs inode has the O_SYNC flag set.  Also, I_DIRTY_DATASYNC
- * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
- * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
- * would suggest.
+ * other hand, is not sufficient, because ->write_inode needs to be called even
+ * in case of fdatasync. This needs to happen or the file data would not
+ * necessarily hit the device synchronously, even though the vfs inode has the
+ * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
+ * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
+ * which is not what I_DIRTY_SYNC on its own would suggest.
 */
void __mark_mft_record_dirty(ntfs_inode *ni)
{
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9edcde4974aa..ab513ddaeff2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
 	 * immediately to their right.
 	 */
 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
-	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+		BUG_ON(right_child_el->l_tree_depth);
 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
 	}
@@ -2476,15 +2477,37 @@ out_ret_path:
 	return ret;
 }

-static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
-				      struct ocfs2_path *path)
+static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+				     int subtree_index, struct ocfs2_path *path)
 {
-	int i, idx;
+	int i, idx, ret;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_block *eb;
 	u32 range;

+	/*
+	 * In normal tree rotation process, we will never touch the
+	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
+	 * doesn't reserve the credits for them either.
+	 *
+	 * But we do have a special case here which will update the rightmost
+	 * records for all the bh in the path.
+	 * So we have to allocate extra credits and access them.
+	 */
+	ret = ocfs2_extend_trans(handle,
+				 handle->h_buffer_credits + subtree_index);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(inode, handle, path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/* Path should always be rightmost. */
 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
@@ -2505,6 +2528,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,

 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
 	}
+out:
+	return ret;
 }

 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
@@ -2717,7 +2742,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 	if (del_right_subtree) {
 		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
 				     subtree_index, dealloc);
-		ocfs2_update_edge_lengths(inode, handle, left_path);
+		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+						left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}

 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -3034,7 +3064,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,

 	ocfs2_unlink_subtree(inode, handle, left_path, path,
 			     subtree_index, dealloc);
-	ocfs2_update_edge_lengths(inode, handle, left_path);
+	ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+					left_path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}

 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 	ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -6816,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	}
 	status = 0;
 bail:
-
+	brelse(last_eb_bh);
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b2c52b3a1484..8a1e61545f41 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 		mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
 		dump_stack();
+		goto bail;
 	}

 	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
@@ -894,18 +895,17 @@ struct ocfs2_write_cluster_desc {
 	 */
 	unsigned c_new;
 	unsigned c_unwritten;
+	unsigned c_needs_zero;
 };

-static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
-{
-	return d->c_new || d->c_unwritten;
-}
-
 struct ocfs2_write_ctxt {
 	/* Logical cluster position / len of write */
 	u32 w_cpos;
 	u32 w_clen;

+	/* First cluster allocated in a nonsparse extend */
+	u32 w_first_new_cpos;
+
 	struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];

 	/*
@@ -983,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 		return -ENOMEM;

 	wc->w_cpos = pos >> osb->s_clustersize_bits;
+	wc->w_first_new_cpos = UINT_MAX;
 	cend = (pos + len - 1) >> osb->s_clustersize_bits;
 	wc->w_clen = cend - wc->w_cpos + 1;
 	get_bh(di_bh);
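
The cluster-range arithmetic used here (the write covers w_cpos through w_cpos + w_clen - 1) can be checked standalone. A sketch, not part of the patch; the 32 KB cluster size (s_clustersize_bits == 15) is an assumed example value:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned int bits = 15;         /* assumed cluster size: 32768 bytes */
        uint64_t pos = 40000, len = 50000;
        uint32_t cpos = pos >> bits;            /* first cluster touched */
        uint32_t cend = (pos + len - 1) >> bits; /* last cluster touched */
        uint32_t clen = cend - cpos + 1;

        printf("write [%llu, %llu) covers clusters %u..%u (w_clen=%u)\n",
               (unsigned long long)pos, (unsigned long long)(pos + len),
               cpos, cend, clen);
        return 0;
    }
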
@@ -1217,20 +1218,18 @@ out:
 */
static int ocfs2_write_cluster(struct address_space *mapping,
			       u32 phys, unsigned int unwritten,
+			       unsigned int should_zero,
			       struct ocfs2_alloc_context *data_ac,
			       struct ocfs2_alloc_context *meta_ac,
			       struct ocfs2_write_ctxt *wc, u32 cpos,
			       loff_t user_pos, unsigned user_len)
{
-	int ret, i, new, should_zero = 0;
+	int ret, i, new;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
 	struct ocfs2_extent_tree et;

 	new = phys == 0 ? 1 : 0;
-	if (new || unwritten)
-		should_zero = 1;
-
 	if (new) {
 		u32 tmp_pos;

@@ -1301,7 +1300,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		if (tmpret) {
 			mlog_errno(tmpret);
 			if (ret == 0)
-				tmpret = ret;
+				ret = tmpret;
 		}
 	}

@@ -1341,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 			local_len = osb->s_clustersize - cluster_off;

 		ret = ocfs2_write_cluster(mapping, desc->c_phys,
-					  desc->c_unwritten, data_ac, meta_ac,
+					  desc->c_unwritten,
+					  desc->c_needs_zero,
+					  data_ac, meta_ac,
 					  wc, desc->c_cpos, pos, local_len);
 		if (ret) {
 			mlog_errno(ret);
@@ -1391,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 	 * newly allocated cluster.
 	 */
 	desc = &wc->w_desc[0];
-	if (ocfs2_should_zero_cluster(desc))
+	if (desc->c_needs_zero)
 		ocfs2_figure_cluster_boundaries(osb,
 						desc->c_cpos,
 						&wc->w_target_from,
 						NULL);

 	desc = &wc->w_desc[wc->w_clen - 1];
-	if (ocfs2_should_zero_cluster(desc))
+	if (desc->c_needs_zero)
 		ocfs2_figure_cluster_boundaries(osb,
 						desc->c_cpos,
 						NULL,
@@ -1466,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 				phys++;
 		}

+		/*
+		 * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+		 * file that got extended.  w_first_new_cpos tells us
+		 * where the newly allocated clusters are so we can
+		 * zero them.
+		 */
+		if (desc->c_cpos >= wc->w_first_new_cpos) {
+			BUG_ON(phys == 0);
+			desc->c_needs_zero = 1;
+		}
+
 		desc->c_phys = phys;
 		if (phys == 0) {
 			desc->c_new = 1;
+			desc->c_needs_zero = 1;
 			*clusters_to_alloc = *clusters_to_alloc + 1;
 		}
-		if (ext_flags & OCFS2_EXT_UNWRITTEN)
+
+		if (ext_flags & OCFS2_EXT_UNWRITTEN) {
 			desc->c_unwritten = 1;
+			desc->c_needs_zero = 1;
+		}

 		num_clusters--;
 	}
@@ -1632,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
 	if (newsize <= i_size_read(inode))
 		return 0;

-	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+	ret = ocfs2_extend_no_holes(inode, newsize, pos);
 	if (ret)
 		mlog_errno(ret);

+	wc->w_first_new_cpos =
+		ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
 	return ret;
 }

@@ -1644,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page)
 {
-	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
 	unsigned int clusters_to_alloc, extents_to_split;
 	struct ocfs2_write_ctxt *wc;
 	struct inode *inode = mapping->host;
@@ -1722,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,

 	}

-	ocfs2_set_target_boundaries(osb, wc, pos, len,
-				    clusters_to_alloc + extents_to_split);
+	/*
+	 * We have to zero sparse allocated clusters, unwritten extent clusters,
+	 * and non-sparse clusters we just extended.  For non-sparse writes,
+	 * we know zeros will only be needed in the first and/or last cluster.
+	 */
+	if (clusters_to_alloc || extents_to_split ||
+	    (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+			    wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+		cluster_of_pages = 1;
+	else
+		cluster_of_pages = 0;
+
+	ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);

 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -1756,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * extent.
 	 */
 	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-					 clusters_to_alloc + extents_to_split,
-					 mmap_page);
+					 cluster_of_pages, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_quota;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b574431a031d..b4957c7d9fe2 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 		goto bail;
 	}

+	/*
+	 * If the last lookup failed to create dentry lock, let us
+	 * redo it.
+	 */
+	if (!dentry->d_fsdata) {
+		mlog(0, "Inode %llu doesn't have dentry lock, "
+		     "returning false\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		goto bail;
+	}
+
 	ret = 1;

 bail:
@@ -310,22 +321,19 @@ out_attach:
 	return ret;
 }

-static DEFINE_SPINLOCK(dentry_list_lock);
+DEFINE_SPINLOCK(dentry_list_lock);

 /* We limit the number of dentry locks to drop in one go. We have
 * this limit so that we don't starve other users of ocfs2_wq. */
 #define DL_INODE_DROP_COUNT 64

 /* Drop inode references from dentry locks */
-void ocfs2_drop_dl_inodes(struct work_struct *work)
+static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
 {
-	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
-					       dentry_lock_work);
 	struct ocfs2_dentry_lock *dl;
-	int drop_count = DL_INODE_DROP_COUNT;

 	spin_lock(&dentry_list_lock);
-	while (osb->dentry_lock_list && drop_count--) {
+	while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
 		dl = osb->dentry_lock_list;
 		osb->dentry_lock_list = dl->dl_next;
 		spin_unlock(&dentry_list_lock);
@@ -333,11 +341,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work)
 		kfree(dl);
 		spin_lock(&dentry_list_lock);
 	}
-	if (osb->dentry_lock_list)
+	spin_unlock(&dentry_list_lock);
+}
+
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       dentry_lock_work);
+
+	__ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
+	/*
+	 * Don't queue dropping if umount is in progress. We flush the
+	 * list in ocfs2_dismount_volume
+	 */
+	spin_lock(&dentry_list_lock);
+	if (osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	spin_unlock(&dentry_list_lock);
 }

+/* Flush the whole work queue */
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
+{
+	__ocfs2_drop_dl_inodes(osb, -1);
+}
+
 /*
 * ocfs2_dentry_iput() and friends.
 *
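
The drop_count convention introduced above — a negative count means "drain everything", otherwise process at most drop_count entries — is easy to exercise in isolation. A userspace sketch, not part of the patch, with a plain linked list standing in for osb->dentry_lock_list:

    #include <stdio.h>
    #include <stdlib.h>

    struct node { struct node *next; };

    /* drop_count < 0 drains the whole list, otherwise at most drop_count */
    static struct node *drain(struct node *head, int drop_count)
    {
        while (head && (drop_count < 0 || drop_count--)) {
            struct node *n = head;

            head = head->next;
            free(n);
        }
        return head;
    }

    int main(void)
    {
        struct node *head = NULL;

        for (int i = 0; i < 10; i++) {
            struct node *n = malloc(sizeof(*n));

            n->next = head;
            head = n;
        }
        head = drain(head, 4);      /* batched pass: 6 entries remain */
        head = drain(head, -1);     /* flush pass: empties the list */
        printf("drained, head=%p\n", (void *)head);
        return 0;
    }
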
@@ -368,7 +397,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 	/* We leave dropping of inode reference to ocfs2_wq as that can
 	 * possibly lead to inode deletion which gets tricky */
 	spin_lock(&dentry_list_lock);
-	if (!osb->dentry_lock_list)
+	if (!osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	dl->dl_next = osb->dentry_lock_list;
 	osb->dentry_lock_list = dl;
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index faa12e75f98d..f5dd1789acf1 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -49,10 +49,13 @@ struct ocfs2_dentry_lock {
int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
			     u64 parent_blkno);

+extern spinlock_t dentry_list_lock;
+
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
			   struct ocfs2_dentry_lock *dl);

void ocfs2_drop_dl_inodes(struct work_struct *work);
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);

struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
				      int skip_unhashed);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index d07ddbe4b283..81eff8e58322 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 		     lock->ast_pending, lock->ml.type);
 		BUG();
 	}
-	BUG_ON(!list_empty(&lock->ast_list));
 	if (lock->ast_pending)
 		mlog(0, "lock has an ast getting flushed right now\n");

diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
}

static struct backing_dev_info dlmfs_backing_dev_info = {
+	.name		= "ocfs2-dlmfs",
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index bcb9260c3735..43e6e3280569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,

 	mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
 	     dlm->name, res->lockname.len, res->lockname.name,
-	     orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
+	     orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
 	     send_to);

 	/* send it */
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index fcf879ed6930..756f5b0998e0 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 	 * that still has AST's pending... */
 	in_use = !list_empty(&lock->ast_list);
 	spin_unlock(&dlm->ast_lock);
-	if (in_use) {
+	if (in_use && !(flags & LKM_CANCEL)) {
 		mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
 		     "while waiting for an ast!", res->lockname.len,
 		     res->lockname.name);
@@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,

 	spin_lock(&res->spinlock);
 	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
-		if (master_node) {
+		if (master_node && !(flags & LKM_CANCEL)) {
 			mlog(ML_ERROR, "lockres in progress!\n");
 			spin_unlock(&res->spinlock);
 			return DLM_FORWARD;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 62442e413a00..221c5e98957b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1851,6 +1851,7 @@ relock:
 	if (ret)
 		goto out_dio;

+	count = ocount;
 	ret = generic_write_checks(file, ppos, &count,
 				   S_ISBLK(inode->i_mode));
 	if (ret)
@@ -1870,8 +1871,7 @@ relock:
 			goto out_dio;
 		}
 	} else {
-		written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
-							*ppos);
+		written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
 	}

 out_dio:
@@ -1879,18 +1879,21 @@ out_dio:
 	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));

 	if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
-		/*
-		 * The generic write paths have handled getting data
-		 * to disk, but since we don't make use of the dirty
-		 * inode list, a manual journal commit is necessary
-		 * here.
-		 */
-		if (old_size != i_size_read(inode) ||
-		    old_clusters != OCFS2_I(inode)->ip_clusters) {
+		ret = filemap_fdatawrite_range(file->f_mapping, pos,
+					       pos + count - 1);
+		if (ret < 0)
+			written = ret;
+
+		if (!ret && (old_size != i_size_read(inode) ||
+			     old_clusters != OCFS2_I(inode)->ip_clusters)) {
 			ret = jbd2_journal_force_commit(osb->journal->j_journal);
 			if (ret < 0)
 				written = ret;
 		}
+
+		if (!ret)
+			ret = filemap_fdatawait_range(file->f_mapping, pos,
+						      pos + count - 1);
 	}

 	/*
@@ -1918,8 +1921,10 @@ out_sems:

 	mutex_unlock(&inode->i_mutex);

+	if (written)
+		ret = written;
 	mlog_exit(ret);
-	return written ? written : ret;
+	return ret;
 }

 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
@@ -1988,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,

 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;

-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = ocfs2_rw_lock(inode, 1);
-			if (err < 0) {
-				mlog_errno(err);
-			} else {
-				err = generic_osync_inode(inode, mapping,
-							  OSYNC_METADATA|OSYNC_DATA);
-				ocfs2_rw_unlock(inode, 1);
-			}
-			mutex_unlock(&inode->i_mutex);
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;

-			if (err)
-				ret = err;
-		}
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f033760ecbea..c48b93ac6b65 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
 	os->os_osb = osb;
 	os->os_count = 0;
 	os->os_seqno = 0;
-	os->os_scantime = CURRENT_TIME;
 	mutex_init(&os->os_lock);
 	INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
+}

+void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os;
+
+	os = &osb->osb_orphan_scan;
+	os->os_scantime = CURRENT_TIME;
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
 		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
 	else {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 5432c7f79cc6..2c3222aec622 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,

/* Exported only for the journal struct init code in super.c. Do not call. */
void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);

@@ -329,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle,
/* extended attribute block update */
#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1

+/* Update of a single quota block */
+#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
+
/* global quotafile inode update, data block */
-#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+				   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)

+#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
/*
 * The two writes below can accidentally see global info dirty due
 * to set_info() quotactl so make them prepared for the writes.
 */
/* quota data block, global info */
/* Write to local quota file */
-#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+			      OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)

/* global quota data block, local quota data block, global quota inode,
 * global quota info */
-#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+			     2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)

static inline int ocfs2_quota_trans_credits(struct super_block *sb)
{
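
The revised macros derive every quota credit count from OCFS2_QUOTA_BLOCK_UPDATE_CREDITS. As a quick arithmetic check — with OCFS2_INODE_UPDATE_CREDITS assumed to be 3 purely for illustration — QINFO_WRITE = 3 + 1 = 4, QWRITE = 4 + 1 = 5, QSYNC = 4 + 2 = 6. A standalone sketch:

    #include <stdio.h>

    #define OCFS2_INODE_UPDATE_CREDITS		3	/* assumed value */
    #define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS	1
    #define OCFS2_QINFO_WRITE_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + \
					 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
    #define OCFS2_QWRITE_CREDITS	(OCFS2_QINFO_WRITE_CREDITS + \
					 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
    #define OCFS2_QSYNC_CREDITS		(OCFS2_QINFO_WRITE_CREDITS + \
					 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)

    int main(void)
    {
        printf("qinfo=%d qwrite=%d qsync=%d\n",
               OCFS2_QINFO_WRITE_CREDITS, OCFS2_QWRITE_CREDITS,
               OCFS2_QSYNC_CREDITS);
        return 0;
    }
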
@@ -355,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb)
 	return credits;
}

-/* Number of credits needed for removing quota structure from file */
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
-/* Number of credits needed for initialization of new quota structure */
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
-
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c9345ebb8493..39e1d5a39505 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -224,10 +224,12 @@ enum ocfs2_mount_options
	OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
};

#define OCFS2_OSB_SOFT_RO			0x0001
#define OCFS2_OSB_HARD_RO			0x0002
#define OCFS2_OSB_ERROR_FS			0x0004
-#define OCFS2_DEFAULT_ATIME_QUANTUM		60
+#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED	0x0008
+
+#define OCFS2_DEFAULT_ATIME_QUANTUM		60

struct ocfs2_journal;
struct ocfs2_slot_info;
@@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
	spin_unlock(&osb->osb_lock);
}

+
+static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
+						unsigned long flag)
+{
+	unsigned long ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & flag;
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
				     int hard)
{
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index fcdba091af3d..c212cf5a2bdf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -108,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
	[OCFS2_LOCK_TYPE_OPEN] = "Open",
	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
	[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+	[OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
	[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
};

diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 7365e2e08706..3fb96fcd4c81 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo {
	unsigned int dqi_chunks;	/* Number of chunks in local quota file */
	unsigned int dqi_blocks;	/* Number of blocks allocated for local quota file */
	unsigned int dqi_syncms;	/* How often should we sync with other nodes */
-	unsigned int dqi_syncjiff;	/* Precomputed dqi_syncms in jiffies */
	struct list_head dqi_chunk;	/* List of chunks */
	struct inode *dqi_gqinode;	/* Global quota file inode */
	struct ocfs2_lock_res dqi_gqlock;	/* Lock protecting quota information structure */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index edfa60cd155c..44f2a5e1d042 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -23,6 +23,7 @@
#include "sysfile.h"
#include "dlmglue.h"
#include "uptodate.h"
+#include "super.h"
#include "quota.h"

static struct workqueue_struct *ocfs2_quota_wq = NULL;
@@ -69,6 +70,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+	d->dqb_pad1 = d->dqb_pad2 = 0;
}

static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
@@ -113,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
 	int rc = 0;
 	struct buffer_head *tmp = *bh;

+	if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
+		ocfs2_error(inode->i_sb,
+			    "Quota file %llu is probably corrupted! Requested "
+			    "to read block %Lu but file has size only %Lu\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)v_block,
+			    (unsigned long long)i_size_read(inode));
+		return -EIO;
+	}
 	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
 				    ocfs2_validate_quota_block);
 	if (rc)
@@ -211,14 +222,13 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,

 	mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
 	if (gqinode->i_size < off + len) {
-		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		err = ocfs2_extend_no_holes(gqinode, off + len, off);
-		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		if (err < 0)
-			goto out;
+		loff_t rounded_end =
+			ocfs2_align_bytes_to_blocks(sb, off + len);
+
+		/* Space is already allocated in ocfs2_global_read_dquot() */
 		err = ocfs2_simple_size_update(gqinode,
 					       oinfo->dqi_gqi_bh,
-					       off + len);
+					       rounded_end);
 		if (err < 0)
 			goto out;
 		new = 1;
@@ -234,7 +244,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 	}
 	if (err) {
 		mlog_errno(err);
-		return err;
+		goto out;
 	}
 	lock_buffer(bh);
 	if (new)
@@ -342,7 +352,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
-	oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
 	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
 	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
 	oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -352,7 +361,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
 	INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
 	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-			   oinfo->dqi_syncjiff);
+			   msecs_to_jiffies(oinfo->dqi_syncms));

out_err:
 	mlog_exit(status);
@@ -402,13 +411,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
 	return err;
}

+static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+	/*
+	 * We may need to allocate tree blocks and a leaf block but not the
+	 * root block
+	 */
+	return oinfo->dqi_gi.dqi_qtree_depth;
+}
+
+static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
+{
+	/* We modify all the allocated blocks, tree root, and info block */
+	return (ocfs2_global_qinit_alloc(sb, type) + 2) *
+	       OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
+}
+
/* Read in information from global quota file and acquire a reference to it.
 * dquot_acquire() has already started the transaction and locked quota file */
int ocfs2_global_read_dquot(struct dquot *dquot)
{
 	int err, err2, ex = 0;
-	struct ocfs2_mem_dqinfo *info =
-			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct inode *gqinode = info->dqi_gqinode;
+	int need_alloc = ocfs2_global_qinit_alloc(sb, type);
+	handle_t *handle = NULL;

 	err = ocfs2_qinfo_lock(info, 0);
 	if (err < 0)
@@ -419,14 +451,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
 	OCFS2_DQUOT(dquot)->dq_use_count++;
 	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
 	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	ocfs2_qinfo_unlock(info, 0);
+
 	if (!dquot->dq_off) {	/* No real quota entry? */
-		/* Upgrade to exclusive lock for allocation */
-		ocfs2_qinfo_unlock(info, 0);
-		err = ocfs2_qinfo_lock(info, 1);
-		if (err < 0)
-			goto out_qlock;
 		ex = 1;
+		/*
+		 * Add blocks to quota file before we start a transaction since
+		 * locking allocators ranks above a transaction start
+		 */
+		WARN_ON(journal_current_handle());
+		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		err = ocfs2_extend_no_holes(gqinode,
+			gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
+			gqinode->i_size);
+		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		if (err < 0)
+			goto out;
 	}
+
+	handle = ocfs2_start_trans(osb,
+				   ocfs2_calc_global_qinit_credits(sb, type));
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out;
+	}
+	err = ocfs2_qinfo_lock(info, ex);
+	if (err < 0)
+		goto out_trans;
 	err = qtree_write_dquot(&info->dqi_gi, dquot);
 	if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
 		err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
@@ -438,6 +489,9 @@ out_qlock:
 		ocfs2_qinfo_unlock(info, 1);
 	else
 		ocfs2_qinfo_unlock(info, 0);
+out_trans:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
 out:
 	if (err < 0)
 		mlog_errno(err);
@@ -607,7 +661,7 @@ static void qsync_work_fn(struct work_struct *work)
 
 	dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
 	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-			   oinfo->dqi_syncjiff);
+			   msecs_to_jiffies(oinfo->dqi_syncms));
 }
 
 /*
@@ -635,20 +689,18 @@ out:
 	return status;
 }
 
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
 {
-	struct ocfs2_mem_dqinfo *oinfo;
-	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
-				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
-
-	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
-		return 0;
-
-	oinfo = sb_dqinfo(sb, type)->dqi_priv;
-	/* We modify tree, leaf block, global info, local chunk header,
-	 * global and local inode */
-	return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
-	       2 * OCFS2_INODE_UPDATE_CREDITS;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	/*
+	 * We modify tree, leaf block, global info, local chunk header,
+	 * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
+	 * accounts for inode update
+	 */
+	return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
+	       OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
+	       OCFS2_QINFO_WRITE_CREDITS +
+	       OCFS2_INODE_UPDATE_CREDITS;
 }
 
 static int ocfs2_release_dquot(struct dquot *dquot)
@@ -680,33 +732,10 @@ out:
 	return status;
 }
 
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
-{
-	struct ocfs2_mem_dqinfo *oinfo;
-	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
-				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
-	struct ocfs2_dinode *lfe, *gfe;
-
-	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
-		return 0;
-
-	oinfo = sb_dqinfo(sb, type)->dqi_priv;
-	gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
-	lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
-	/* We can extend local file + global file. In local file we
-	 * can modify info, chunk header block and dquot block. In
-	 * global file we can modify info, tree and leaf block */
-	return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
-	       ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
-	       3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
-}
-
 static int ocfs2_acquire_dquot(struct dquot *dquot)
 {
-	handle_t *handle;
 	struct ocfs2_mem_dqinfo *oinfo =
 			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
-	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
 
 	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
@@ -715,16 +744,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
 	status = ocfs2_lock_global_qf(oinfo, 1);
 	if (status < 0)
 		goto out;
-	handle = ocfs2_start_trans(osb,
-		ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out_ilock;
-	}
 	status = dquot_acquire(dquot);
-	ocfs2_commit_trans(osb, handle);
-out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
 	mlog_exit(status);
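
The hunks above move quota-file allocation ahead of the journal transaction and size that transaction from the quota tree depth alone. A minimal standalone sketch of the credit arithmetic from ocfs2_calc_global_qinit_credits(); the credit constant here is an assumed placeholder, not ocfs2's real value:

#include <stdio.h>

#define QUOTA_BLOCK_UPDATE_CREDITS 1	/* assumption for illustration */

static int global_qinit_alloc(int qtree_depth)
{
	/* may allocate tree blocks and a leaf block, never the root */
	return qtree_depth;
}

static int calc_global_qinit_credits(int qtree_depth)
{
	/* every allocated block, plus tree root and info block */
	return (global_qinit_alloc(qtree_depth) + 2) *
	       QUOTA_BLOCK_UPDATE_CREDITS;
}

int main(void)
{
	printf("depth 3 -> %d credits\n", calc_global_qinit_credits(3));
	return 0;
}

For a tree of depth 3 this sizes the transaction at (3 + 2) blocks' worth of credits, covering every possibly-allocated block plus the tree root and the info block.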
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 5a460fa82553..bdb09cb6e1fe 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -20,6 +20,7 @@
 #include "sysfile.h"
 #include "dlmglue.h"
 #include "quota.h"
+#include "uptodate.h"
 
 /* Number of local quota structures per block */
 static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -100,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
 	handle_t *handle;
 	int status;
 
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+				   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -610,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
 		goto out_bh;
 	/* Mark quota file as clean if we are recovering quota file of
 	 * some other node. */
-	handle = ocfs2_start_trans(osb, 1);
+	handle = ocfs2_start_trans(osb,
+				   OCFS2_LOCAL_QINFO_WRITE_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -940,7 +943,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	struct ocfs2_local_disk_chunk *dchunk;
 	int status;
 	handle_t *handle;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh = NULL, *dbh = NULL;
 	u64 p_blkno;
 
 	/* We are protected by dqio_sem so no locking needed */
@@ -964,32 +967,35 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		mlog_errno(status);
 		goto out;
 	}
+	/* Local quota info and two new blocks we initialize */
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+			2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
 
+	/* Initialize chunk header */
 	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
 					     &p_blkno, NULL, NULL);
 	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	if (status < 0) {
 		mlog_errno(status);
-		goto out;
+		goto out_trans;
 	}
 	bh = sb_getblk(sb, p_blkno);
 	if (!bh) {
 		status = -ENOMEM;
 		mlog_errno(status);
-		goto out;
+		goto out_trans;
 	}
 	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
-
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out;
-	}
-
+	ocfs2_set_new_buffer_uptodate(lqinode, bh);
 	status = ocfs2_journal_access_dq(handle, lqinode, bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_trans;
@@ -999,7 +1005,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	memset(dchunk->dqc_bitmap, 0,
 	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
 	       OCFS2_QBLK_RESERVED_SPACE);
-	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 	status = ocfs2_journal_dirty(handle, bh);
 	if (status < 0) {
@@ -1007,6 +1012,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		goto out_trans;
 	}
 
+	/* Initialize new block with structures */
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	dbh = sb_getblk(sb, p_blkno);
+	if (!dbh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_trans;
+	}
+	ocfs2_set_new_buffer_uptodate(lqinode, dbh);
+	status = ocfs2_journal_access_dq(handle, lqinode, dbh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(dbh);
+	memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
+	unlock_buffer(dbh);
+	status = ocfs2_journal_dirty(handle, dbh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	/* Update local quotafile info */
 	oinfo->dqi_blocks += 2;
 	oinfo->dqi_chunks++;
 	status = ocfs2_local_write_info(sb, type);
@@ -1031,6 +1068,7 @@ out_trans:
 		ocfs2_commit_trans(OCFS2_SB(sb), handle);
 out:
 	brelse(bh);
+	brelse(dbh);
 	kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
 	return ERR_PTR(status);
 }
@@ -1048,6 +1086,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 	struct ocfs2_local_disk_chunk *dchunk;
 	int epb = ol_quota_entries_per_block(sb);
 	unsigned int chunk_blocks;
+	struct buffer_head *bh;
+	u64 p_blkno;
 	int status;
 	handle_t *handle;
 
@@ -1075,12 +1115,49 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		mlog_errno(status);
 		goto out;
 	}
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+
+	/* Get buffer from the just added block */
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	bh = sb_getblk(sb, p_blkno);
+	if (!bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(lqinode, bh);
+
+	/* Local quota info, chunk header and the new block we initialize */
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+			2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
 		goto out;
 	}
+	/* Zero created block */
+	status = ocfs2_journal_access_dq(handle, lqinode, bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(bh);
+	memset(bh->b_data, 0, sb->s_blocksize);
+	unlock_buffer(bh);
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	/* Update chunk header */
 	status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
@@ -1097,6 +1174,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		mlog_errno(status);
 		goto out_trans;
 	}
+	/* Update file header */
 	oinfo->dqi_blocks++;
 	status = ocfs2_local_write_info(sb, type);
 	if (status < 0) {
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 3f661376a2de..e49c41050264 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -17,6 +17,7 @@
  * General Public License for more details.
  */
 
+#include <linux/kernel.h>
 #include <linux/crc32.h>
 #include <linux/module.h>
 
@@ -153,7 +154,7 @@ static int status_map[] = {
 
 static int dlm_status_to_errno(enum dlm_status status)
 {
-	BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
+	BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
 
 	return status_map[status];
 }
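
The BUG_ON rewrite above fixes two problems with the old bound: the sizeof-based check tolerated an index one past the end of status_map, and a negative status was never rejected. A standalone sketch of the corrected check; ARRAY_SIZE is re-derived here because userspace has no <linux/kernel.h>, and the table contents are made up:

#include <assert.h>
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const int status_map[] = { 0, -1, -2, -3 };

static int status_to_errno(int status)
{
	/* old check: status > ARRAY_SIZE, i.e. status == ARRAY_SIZE
	 * slipped through; this rejects both overflow and negatives */
	assert(status >= 0 && status < (int)ARRAY_SIZE(status_map));
	return status_map[status];
}

int main(void)
{
	printf("%d\n", status_to_errno(2));	/* prints -2 */
	return 0;
}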
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7efb349fb9bd..a3f8871d21fd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
 	}
 	di = (struct ocfs2_dinode *) (*bh)->b_data;
 	memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+	spin_lock_init(&stats->b_lock);
 	status = ocfs2_verify_volume(di, *bh, blksize, stats);
 	if (status >= 0)
 		goto bail;
@@ -1182,7 +1183,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	wake_up(&osb->osb_mount_event);
 
 	/* Start this when the mount is almost sure of being successful */
-	ocfs2_orphan_scan_init(osb);
+	ocfs2_orphan_scan_start(osb);
 
 	mlog_exit(status);
 	return status;
@@ -1213,14 +1214,31 @@ static int ocfs2_get_sb(struct file_system_type *fs_type,
 			   mnt);
 }
 
+static void ocfs2_kill_sb(struct super_block *sb)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	/* Failed mount? */
+	if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
+		goto out;
+
+	/* Prevent further queueing of inode drop events */
+	spin_lock(&dentry_list_lock);
+	ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
+	spin_unlock(&dentry_list_lock);
+	/* Wait for work to finish and/or remove it */
+	cancel_work_sync(&osb->dentry_lock_work);
+out:
+	kill_block_super(sb);
+}
+
 static struct file_system_type ocfs2_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "ocfs2",
 	.get_sb         = ocfs2_get_sb, /* is this called when we mount
 					* the fs? */
-	.kill_sb        = kill_block_super, /* set to the generic one
-					     * right now, but do we
-					     * need to change that? */
+	.kill_sb        = ocfs2_kill_sb,
+
 	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
 	.next           = NULL
 };
@@ -1819,6 +1837,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	debugfs_remove(osb->osb_ctxt);
 
+	/*
+	 * Flush inode dropping work queue so that deletes are
+	 * performed while the filesystem is still working
+	 */
+	ocfs2_drop_all_dl_inodes(osb);
+
 	/* Orphan scan should be stopped as early as possible */
 	ocfs2_orphan_scan_stop(osb);
 
@@ -1981,6 +2005,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
+	ocfs2_orphan_scan_init(osb);
+
 	status = ocfs2_recovery_init(osb);
 	if (status) {
 		mlog(ML_ERROR, "Unable to initialize recovery state\n");
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ba320e250747..d1a27cda984f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	struct ocfs2_xattr_block *xb;
 	struct ocfs2_xattr_value_root *xv;
 	size_t size;
-	int ret = -ENODATA, name_offset, name_len, block_off, i;
+	int ret = -ENODATA, name_offset, name_len, i;
+	int uninitialized_var(block_off);
 
 	xs->bucket = ocfs2_xattr_bucket_new(inode);
 	if (!xs->bucket) {
diff --git a/fs/open.c b/fs/open.c
index dd98e8076024..31191bf513e4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -199,7 +199,7 @@ out:
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	struct file *filp)
 {
-	int err;
+	int ret;
 	struct iattr newattrs;
 
 	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
@@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	}
 
 	/* Remove suid/sgid on truncate too */
-	newattrs.ia_valid |= should_remove_suid(dentry);
+	ret = should_remove_suid(dentry);
+	if (ret)
+		newattrs.ia_valid |= ret | ATTR_FORCE;
 
 	mutex_lock(&dentry->d_inode->i_mutex);
-	err = notify_change(dentry, &newattrs);
+	ret = notify_change(dentry, &newattrs);
 	mutex_unlock(&dentry->d_inode->i_mutex);
-	return err;
+	return ret;
 }
 
 static long do_sys_truncate(const char __user *pathname, loff_t length)
@@ -957,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
 	int error;
 	struct file *f;
 
+	validate_creds(cred);
+
 	/*
 	 * We must always pass in a valid mount pointer. Historically
 	 * callers got away with not passing it, but we must enforce this at
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb29e13..fbeaddf595d3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, merges[WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[WRITE]),
 		jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
-		p->in_flight,
+		part_in_flight(p),
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
 }
 
+ssize_t part_inflight_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+}
+
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -302,7 +312,7 @@ static struct attribute_group part_attr_group = {
 	.attrs = part_attrs,
 };
 
-static struct attribute_group *part_attr_groups[] = {
+static const struct attribute_group *part_attr_groups[] = {
 	&part_attr_group,
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	&blk_trace_attr_group,
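
The new per-direction in_flight[] counters are exported through a separate `inflight` attribute rather than widening `stat`. A small userspace reader for the new file; the partition path is only an example and assumes a sda1 partition exists on the test machine:

#include <stdio.h>

int main(void)
{
	unsigned int r, w;
	FILE *f = fopen("/sys/block/sda/sda1/inflight", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* the attribute prints two fields: reads then writes in flight */
	if (fscanf(f, "%u %u", &r, &w) == 2)
		printf("in flight: %u reads, %u writes\n", r, w);
	fclose(f);
	return 0;
}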
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3ce5ae9e3d2d..6f742f6658a9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,23 +234,20 @@ static int check_mem_permission(struct task_struct *task)
 
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-	struct mm_struct *mm = get_task_mm(task);
-	if (!mm)
+	struct mm_struct *mm;
+
+	if (mutex_lock_killable(&task->cred_guard_mutex))
 		return NULL;
-	down_read(&mm->mmap_sem);
-	task_lock(task);
-	if (task->mm != mm)
-		goto out;
-	if (task->mm != current->mm &&
-			__ptrace_may_access(task, PTRACE_MODE_READ) < 0)
-		goto out;
-	task_unlock(task);
+
+	mm = get_task_mm(task);
+	if (mm && mm != current->mm &&
+			!ptrace_may_access(task, PTRACE_MODE_READ)) {
+		mmput(mm);
+		mm = NULL;
+	}
+	mutex_unlock(&task->cred_guard_mutex);
+
 	return mm;
-out:
-	task_unlock(task);
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	return NULL;
 }
 
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
@@ -1006,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 
 	if (!task)
 		return -ESRCH;
-	task_lock(task);
-	if (task->mm)
-		oom_adjust = task->mm->oom_adj;
-	else
-		oom_adjust = OOM_DISABLE;
-	task_unlock(task);
+	oom_adjust = task->oomkilladj;
 	put_task_struct(task);
 
 	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1040,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	task = get_proc_task(file->f_path.dentry->d_inode);
 	if (!task)
 		return -ESRCH;
-	task_lock(task);
-	if (!task->mm) {
-		task_unlock(task);
-		put_task_struct(task);
-		return -EINVAL;
-	}
-	if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
-		task_unlock(task);
+	if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
 		put_task_struct(task);
 		return -EACCES;
 	}
-	task->mm->oom_adj = oom_adjust;
-	task_unlock(task);
+	task->oomkilladj = oom_adjust;
 	put_task_struct(task);
 	if (end - buffer == 0)
 		return -EIO;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7cc32e0..9bd8be1d235c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	mm = mm_for_maps(priv->task);
 	if (!mm)
 		return NULL;
+	down_read(&mm->mmap_sem);
 
 	tail_vma = get_gate_vma(priv->task);
 	priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2e7650..8f5c05d3dbd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 		priv->task = NULL;
 		return NULL;
 	}
+	down_read(&mm->mmap_sem);
 
 	/* start from the Nth VMA */
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
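
Taken together, the proc hunks change the mm_for_maps() contract: it now returns only a referenced mm, with cred_guard_mutex already released, and each m_start() caller takes mmap_sem itself. A userspace analogue of that split, using a pthread rwlock as a stand-in for mmap_sem and simplified types (all names here mirror the diff but are illustrative):

#include <pthread.h>
#include <stdio.h>

struct mm {				/* stand-in for mm_struct */
	pthread_rwlock_t mmap_sem;
	int nr_vmas;
};

static struct mm the_mm = { PTHREAD_RWLOCK_INITIALIZER, 3 };

/* analogue of mm_for_maps(): returns the object, never locked */
static struct mm *mm_for_maps(void)
{
	return &the_mm;
}

int main(void)
{
	struct mm *mm = mm_for_maps();

	if (!mm)
		return 1;
	pthread_rwlock_rdlock(&mm->mmap_sem);	/* caller locks, as in m_start() */
	printf("%d vmas\n", mm->nr_vmas);
	pthread_rwlock_unlock(&mm->mmap_sem);
	return 0;
}

Keeping the lock acquisition in the caller avoids returning a locked object across a function boundary, which was the source of the unbalanced unlock paths removed above.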
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 70f36c043d62..38f7bd559f35 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2043,7 +2043,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		invalidate_bdev(sb->s_bdev);
 	}
 	mutex_lock(&dqopt->dqonoff_mutex);
-	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 	if (sb_has_quota_loaded(sb, type)) {
 		error = -EBUSY;
 		goto out_lock;
@@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		 * possible) Also nobody should write to the file - we use
 		 * special IO operations which ignore the immutable bit. */
 		down_write(&dqopt->dqptr_sem);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
 					     S_NOQUOTA);
 		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+		mutex_unlock(&inode->i_mutex);
 		up_write(&dqopt->dqptr_sem);
 		sb->dq_op->drop(inode);
 	}
@@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		goto out_file_init;
 	}
 	mutex_unlock(&dqopt->dqio_mutex);
-	mutex_unlock(&inode->i_mutex);
 	spin_lock(&dq_state_lock);
 	dqopt->flags |= dquot_state_flag(flags, type);
 	spin_unlock(&dq_state_lock);
@@ -2096,13 +2096,14 @@ out_file_init:
 out_lock:
 	if (oldflags != -1) {
 		down_write(&dqopt->dqptr_sem);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 		/* Set the flags back (in the case of accidental quotaon()
 		 * on a wrong file we don't want to mess up the flags) */
 		inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
 		inode->i_flags |= oldflags;
+		mutex_unlock(&inode->i_mutex);
 		up_write(&dqopt->dqptr_sem);
 	}
-	mutex_unlock(&inode->i_mutex);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 out_fmt:
 	put_quota_format(fmt);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ebb2c417912c..11f0c06316de 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -20,6 +20,7 @@
 #include <linux/ramfs.h>
 #include <linux/pagevec.h>
 #include <linux/mman.h>
+#include <linux/sched.h>
 
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a7f0110fca4c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
+	.name		= "ramfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK |
 			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/select.c b/fs/select.c
index d870237e42c7..8084834e123e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq)
 {
 	init_poll_funcptr(&pwq->pt, __pollwait);
 	pwq->polling_task = current;
+	pwq->triggered = 0;
 	pwq->error = 0;
 	pwq->table = NULL;
 	pwq->inline_index = 0;
diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..7394e9e17534 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 		len = left;
 
 	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
-	if (ret > 0)
+	if (ret > 0) {
 		*ppos += ret;
+		file_accessed(in);
+	}
 
 	return ret;
 }
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 		ret = file_remove_suid(out);
-		if (!ret)
+		if (!ret) {
+			file_update_time(out);
 			ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+		}
 		mutex_unlock(&inode->i_mutex);
 	} while (ret > 0);
 	splice_from_pipe_end(pipe, &sd);
@@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;
 
-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-			mutex_unlock(&inode->i_mutex);
-
-			if (err)
-				ret = err;
-		}
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..b03fea8fbfb6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		s = NULL;
 		goto out;
 	}
-	INIT_LIST_HEAD(&s->s_dirty);
-	INIT_LIST_HEAD(&s->s_io);
-	INIT_LIST_HEAD(&s->s_more_io);
 	INIT_LIST_HEAD(&s->s_files);
 	INIT_LIST_HEAD(&s->s_instances);
 	INIT_HLIST_HEAD(&s->s_anon);
@@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
@@ -710,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
+
+	/*
+	 * We set the bdi here to the queue backing, file systems can
+	 * overwrite this in ->fill_super()
+	 */
+	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }
 
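
With per-bdi writeback, every superblock that writes data needs an s_bdi: set_bdev_super() installs the block queue's backing_dev_info as a default, and a filesystem may replace it in its own ->fill_super(), as the ubifs hunk further down does. A compact sketch of that default-plus-override pattern; the types here are simplified stand-ins, not the kernel's:

#include <stdio.h>

struct backing_dev_info { const char *name; };
struct super_block { struct backing_dev_info *s_bdi; };

static struct backing_dev_info queue_bdi = { "block-queue" };
static struct backing_dev_info fs_bdi = { "ubifs" };

static void set_bdev_super(struct super_block *s)
{
	s->s_bdi = &queue_bdi;		/* generic default */
}

static void fill_super(struct super_block *s)
{
	s->s_bdi = &fs_bdi;		/* filesystem override */
}

int main(void)
{
	struct super_block sb;

	set_bdev_super(&sb);
	fill_super(&sb);
	printf("bdi: %s\n", sb.s_bdi->name);
	return 0;
}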
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..c08467a5d7cb 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -19,20 +19,29 @@
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
 /*
- * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
- * just dirties buffers with inodes so we have to submit IO for these buffers
- * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
- * case write_inode() functions do sync_dirty_buffer() and thus effectively
- * write one block at a time.
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
+ * submit IO for these buffers via __sync_blockdev(). This also speeds up the
+ * wait == 1 case since in that case write_inode() functions do
+ * sync_dirty_buffer() and thus effectively write one block at a time.
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
+	/*
+	 * This should be safe, as we require bdi backing to actually
+	 * write out data in the first place
+	 */
+	if (!sb->s_bdi)
+		return 0;
+
 	/* Avoid doing twice syncing and cache pruning for quota sync */
-	if (!wait)
+	if (!wait) {
 		writeout_quota_sb(sb, -1);
-	else
+		writeback_inodes_sb(sb);
+	} else {
 		sync_quota_sb(sb, -1);
-	sync_inodes_sb(sb, wait);
+		sync_inodes_sb(sb);
+	}
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
@@ -99,7 +108,7 @@ restart:
 		spin_unlock(&sb_lock);
 
 		down_read(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
 			__sync_filesystem(sb, wait);
 		up_read(&sb->s_umount);
 
@@ -118,7 +127,7 @@ restart:
  */
 SYSCALL_DEFINE0(sync)
 {
-	wakeup_pdflush(0);
+	wakeup_flusher_threads(0);
 	sync_filesystems(0);
 	sync_filesystems(1);
 	if (unlikely(laptop_mode))
@@ -176,19 +185,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 }
 
 /**
- * vfs_fsync - perform a fsync or fdatasync on a file
+ * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:	file to sync
  * @dentry:	dentry of @file
- * @data:	only perform a fdatasync operation
+ * @start:	offset in bytes of the beginning of data range to sync
+ * @end:	offset in bytes of the end of data range (inclusive)
+ * @datasync:	perform only datasync
  *
- * Write back data and metadata for @file to disk. If @datasync is
- * set only metadata needed to access modified file data is written.
+ * Write back data in range @start..@end and metadata for @file to disk. If
+ * @datasync is set only metadata needed to access modified file data is
+ * written.
  *
  * In case this function is called from nfsd @file may be %NULL and
  * only @dentry is set. This can only happen when the filesystem
  * implements the export_operations API.
  */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
+		    loff_t end, int datasync)
 {
 	const struct file_operations *fop;
 	struct address_space *mapping;
@@ -212,7 +225,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = filemap_fdatawrite(mapping);
+	ret = filemap_write_and_wait_range(mapping, start, end);
 
 	/*
 	 * We need to protect against concurrent writers, which could cause
@@ -223,12 +236,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
-	err = filemap_fdatawait(mapping);
-	if (!ret)
-		ret = err;
+
 out:
 	return ret;
 }
+EXPORT_SYMBOL(vfs_fsync_range);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:	file to sync
+ * @dentry:	dentry of @file
+ * @datasync:	only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk. If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set. This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
+}
 EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
@@ -254,6 +284,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 	return do_fsync(fd, 1);
 }
 
+/**
+ * generic_write_sync - perform syncing after a write if file / inode is sync
+ * @file:	file to which the write happened
+ * @pos:	offset where the write started
+ * @count:	length of the write
+ *
+ * This is just a simple wrapper about our general syncing function.
+ */
+int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+{
+	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+		return 0;
+	return vfs_fsync_range(file, file->f_path.dentry, pos,
+			       pos + count - 1, 1);
+}
+EXPORT_SYMBOL(generic_write_sync);
+
 /*
  * sys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
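
vfs_fsync_range() plus the generic_write_sync() wrapper let writers sync exactly the bytes they wrote when O_SYNC or S_SYNC applies, which is what the splice hunk earlier now relies on. A rough userspace equivalent built on sync_file_range(2); note that syscall flushes data only, not the metadata vfs_fsync_range(..., datasync) also covers, and the file name is just an example:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* after a write at pos, flush just [pos, pos + ret) as
 * generic_write_sync() would for an O_SYNC file */
static int write_and_sync(int fd, const void *buf, size_t len, off_t pos)
{
	ssize_t ret = pwrite(fd, buf, len, pos);

	if (ret <= 0)
		return (int)ret;
	return sync_file_range(fd, pos, ret,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}

int main(void)
{
	int fd = open("demo.txt", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	write_and_sync(fd, "hello\n", 6, 0);
	close(fd);
	return 0;
}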
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index d88d0fac9fa5..0050fc40e8c9 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 const struct inode_operations sysfs_dir_inode_operations = {
 	.lookup		= sysfs_lookup,
 	.setattr	= sysfs_setattr,
+	.setxattr	= sysfs_setxattr,
 };
 
 static void remove_dir(struct sysfs_dirent *sd)
@@ -939,8 +940,10 @@ again:
 	/* Remove from old parent's list and insert into new parent's list. */
 	sysfs_unlink_sibling(sd);
 	sysfs_get(new_parent_sd);
+	drop_nlink(old_parent->d_inode);
 	sysfs_put(sd->s_parent);
 	sd->s_parent = new_parent_sd;
+	inc_nlink(new_parent->d_inode);
 	sysfs_link_sibling(sd);
 
  out_unlock:
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..e28cecf179f5 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,8 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
 #include "sysfs.h"
 
 extern struct super_block * sysfs_sb;
@@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = {
 };
 
 static struct backing_dev_info sysfs_backing_dev_info = {
+	.name		= "sysfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
 static const struct inode_operations sysfs_inode_operations ={
 	.setattr	= sysfs_setattr,
+	.setxattr	= sysfs_setxattr,
 };
 
 int __init sysfs_inode_init(void)
@@ -42,18 +46,37 @@ int __init sysfs_inode_init(void)
 	return bdi_init(&sysfs_backing_dev_info);
 }
 
+struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
+{
+	struct sysfs_inode_attrs *attrs;
+	struct iattr *iattrs;
+
+	attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
+	if (!attrs)
+		return NULL;
+	iattrs = &attrs->ia_iattr;
+
+	/* assign default attributes */
+	iattrs->ia_mode = sd->s_mode;
+	iattrs->ia_uid = 0;
+	iattrs->ia_gid = 0;
+	iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+	return attrs;
+}
 int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 {
 	struct inode * inode = dentry->d_inode;
 	struct sysfs_dirent * sd = dentry->d_fsdata;
-	struct iattr * sd_iattr;
+	struct sysfs_inode_attrs *sd_attrs;
+	struct iattr *iattrs;
 	unsigned int ia_valid = iattr->ia_valid;
 	int error;
 
 	if (!sd)
 		return -EINVAL;
 
-	sd_iattr = sd->s_iattr;
+	sd_attrs = sd->s_iattr;
 
 	error = inode_change_ok(inode, iattr);
 	if (error)
@@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 	if (error)
 		return error;
 
-	if (!sd_iattr) {
+	if (!sd_attrs) {
 		/* setting attributes for the first time, allocate now */
-		sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
-		if (!sd_iattr)
+		sd_attrs = sysfs_init_inode_attrs(sd);
+		if (!sd_attrs)
 			return -ENOMEM;
-		/* assign default attributes */
-		sd_iattr->ia_mode = sd->s_mode;
-		sd_iattr->ia_uid = 0;
-		sd_iattr->ia_gid = 0;
-		sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
-		sd->s_iattr = sd_iattr;
+		sd->s_iattr = sd_attrs;
+	} else {
+		/* attributes were changed at least once in past */
+		iattrs = &sd_attrs->ia_iattr;
+
+		if (ia_valid & ATTR_UID)
+			iattrs->ia_uid = iattr->ia_uid;
+		if (ia_valid & ATTR_GID)
+			iattrs->ia_gid = iattr->ia_gid;
+		if (ia_valid & ATTR_ATIME)
+			iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
+					inode->i_sb->s_time_gran);
+		if (ia_valid & ATTR_MTIME)
+			iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
+					inode->i_sb->s_time_gran);
+		if (ia_valid & ATTR_CTIME)
+			iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
+					inode->i_sb->s_time_gran);
+		if (ia_valid & ATTR_MODE) {
+			umode_t mode = iattr->ia_mode;
+
+			if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+				mode &= ~S_ISGID;
+			iattrs->ia_mode = sd->s_mode = mode;
+		}
 	}
+	return error;
+}
 
-	/* attributes were changed atleast once in past */
-
-	if (ia_valid & ATTR_UID)
-		sd_iattr->ia_uid = iattr->ia_uid;
-	if (ia_valid & ATTR_GID)
-		sd_iattr->ia_gid = iattr->ia_gid;
-	if (ia_valid & ATTR_ATIME)
-		sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime,
-				inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_MTIME)
-		sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime,
-				inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_CTIME)
-		sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime,
-				inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_MODE) {
-		umode_t mode = iattr->ia_mode;
-
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-			mode &= ~S_ISGID;
-		sd_iattr->ia_mode = sd->s_mode = mode;
-	}
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
+{
+	struct sysfs_dirent *sd = dentry->d_fsdata;
+	struct sysfs_inode_attrs *iattrs;
+	void *secdata;
+	int error;
+	u32 secdata_len = 0;
+
+	if (!sd)
+		return -EINVAL;
+	if (!sd->s_iattr)
+		sd->s_iattr = sysfs_init_inode_attrs(sd);
+	if (!sd->s_iattr)
+		return -ENOMEM;
+
+	iattrs = sd->s_iattr;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+		error = security_inode_setsecurity(dentry->d_inode, suffix,
+						   value, size, flags);
+		if (error)
+			goto out;
+		error = security_inode_getsecctx(dentry->d_inode,
+						 &secdata, &secdata_len);
+		if (error)
+			goto out;
+		if (iattrs->ia_secdata)
+			security_release_secctx(iattrs->ia_secdata,
+						iattrs->ia_secdata_len);
+		iattrs->ia_secdata = secdata;
+		iattrs->ia_secdata_len = secdata_len;
 
+	} else
+		return -EINVAL;
+out:
 	return error;
 }
 
@@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
 static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
 	struct bin_attribute *bin_attr;
+	struct sysfs_inode_attrs *iattrs;
 
 	inode->i_private = sysfs_get(sd);
 	inode->i_mapping->a_ops = &sysfs_aops;
@@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 	inode->i_ino = sd->s_ino;
 	lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
 
-	if (sd->s_iattr) {
+	iattrs = sd->s_iattr;
+	if (iattrs) {
 		/* sysfs_dirent has non-default attributes
 		 * get them for the new inode from persistent copy
 		 * in sysfs_dirent
 		 */
-		set_inode_attr(inode, sd->s_iattr);
+		set_inode_attr(inode, &iattrs->ia_iattr);
+		if (iattrs->ia_secdata)
+			security_inode_notifysecctx(inode,
+						    iattrs->ia_secdata,
+						    iattrs->ia_secdata_len);
 	} else
 		set_default_inode_attr(inode, sd->s_mode);
 
-
 	/* initialize inode according to type */
 	switch (sysfs_type(sd)) {
 	case SYSFS_DIR:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1d897ad808e0..c5081ad77026 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -16,6 +16,7 @@
 #include <linux/kobject.h>
 #include <linux/namei.h>
 #include <linux/mutex.h>
+#include <linux/security.h>
 
 #include "sysfs.h"
 
@@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
 }
 
 const struct inode_operations sysfs_symlink_inode_operations = {
+	.setxattr	= sysfs_setxattr,
 	.readlink	= generic_readlink,
 	.follow_link	= sysfs_follow_link,
 	.put_link	= sysfs_put_link,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3fa0d98481e2..af4c4e7482ac 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,8 @@
  * This file is released under the GPLv2.
  */
 
+#include <linux/fs.h>
+
 struct sysfs_open_dirent;
 
 /* type-specific structures for sysfs_dirent->s_* union members */
@@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr {
 	struct hlist_head	buffers;
 };
 
+struct sysfs_inode_attrs {
+	struct iattr	ia_iattr;
+	void		*ia_secdata;
+	u32		ia_secdata_len;
+};
+
 /*
  * sysfs_dirent - the building block of sysfs hierarchy. Each and
  * every sysfs node is represented by single sysfs_dirent.
@@ -56,7 +64,7 @@ struct sysfs_dirent {
 	unsigned int		s_flags;
 	ino_t			s_ino;
 	umode_t			s_mode;
-	struct iattr		*s_iattr;
+	struct sysfs_inode_attrs *s_iattr;
 };
 
 #define SD_DEACTIVATED_BIAS		INT_MIN
@@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
 struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
 void sysfs_delete_inode(struct inode *inode);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		   size_t size, int flags);
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
 int sysfs_inode_init(void);
 
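
With sysfs_setxattr() wired into the dir, file and symlink inode_operations above, security labels can now be written to sysfs nodes from userspace. A minimal demonstration using setxattr(2); the path and SELinux label are examples only, and the call succeeds only under an LSM that implements inode_setsecurity:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *label = "system_u:object_r:sysfs_t:s0";	/* example */

	if (setxattr("/sys/kernel/uevent_helper", "security.selinux",
		     label, strlen(label) + 1, 0) != 0)
		perror("setxattr");
	return 0;
}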
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index c1f3f99b2939..076ca50e9933 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,41 +54,15 @@
  * @nr_to_write: how many dirty pages to write-back
  *
  * This function shrinks UBIFS liability by means of writing back some amount
- * of dirty inodes and their pages. Returns the amount of pages which were
- * written back. The returned value does not include dirty inodes which were
- * synchronized.
+ * of dirty inodes and their pages.
  *
  * Note, this function synchronizes even VFS inodes which are locked
  * (@i_mutex) by the caller of the budgeting function, because write-back does
  * not touch @i_mutex.
  */
-static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
-	int nr_written;
-	struct writeback_control wbc = {
-		.sync_mode   = WB_SYNC_NONE,
-		.range_end   = LLONG_MAX,
-		.nr_to_write = nr_to_write,
-	};
-
-	generic_sync_sb_inodes(c->vfs_sb, &wbc);
-	nr_written = nr_to_write - wbc.nr_to_write;
-
-	if (!nr_written) {
-		/*
-		 * Re-try again but wait on pages/inodes which are being
-		 * written-back concurrently (e.g., by pdflush).
-		 */
-		memset(&wbc, 0, sizeof(struct writeback_control));
-		wbc.sync_mode = WB_SYNC_ALL;
-		wbc.range_end = LLONG_MAX;
-		wbc.nr_to_write = nr_to_write;
-		generic_sync_sb_inodes(c->vfs_sb, &wbc);
-		nr_written = nr_to_write - wbc.nr_to_write;
-	}
-
-	dbg_budg("%d pages were written back", nr_written);
-	return nr_written;
+	writeback_inodes_sb(c->vfs_sb);
 }

 /**
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7e2b3d4d487a..333e181ee987 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1952,6 +1952,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	 *
 	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
 	 */
+	c->bdi.name = "ubifs",
 	c->bdi.capabilities = BDI_CAP_MAP_COPY;
 	c->bdi.unplug_io_fn = default_unplug_io_fn;
 	err = bdi_init(&c->bdi);
@@ -1966,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto out_bdi;

+	sb->s_bdi = &c->bdi;
 	sb->s_fs_info = c;
 	sb->s_magic = UBIFS_SUPER_MAGIC;
 	sb->s_blocksize = UBIFS_BLOCK_SIZE;
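Both UBIFS hunks are fallout from the per-bdi writeback work mentioned in the merge description: once a superblock points at a backing_dev_info via sb->s_bdi, the generic flusher threads can service the filesystem, and shrink_liability() can simply kick them instead of driving writeback_control by hand. A rough sketch of the registration sequence for a filesystem with a private BDI ("myfs" names and the bdi_register() id are placeholders; error unwinding abbreviated):

	#include <linux/backing-dev.h>
	#include <linux/fs.h>

	/* Hedged sketch, not UBIFS code: set up a private BDI and make it
	 * the superblock's writeback target. */
	static int myfs_setup_bdi(struct super_block *sb,
				  struct backing_dev_info *bdi)
	{
		int err;

		bdi->name = "myfs";
		bdi->capabilities = BDI_CAP_MAP_COPY;
		err = bdi_init(bdi);
		if (err)
			return err;
		err = bdi_register(bdi, NULL, "myfs-%d", 0);	/* assumed id */
		if (err) {
			bdi_destroy(bdi);
			return err;
		}
		sb->s_bdi = bdi;	/* route writeback_inodes_sb() here */
		return 0;
	}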
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 1d2c570704c8..2ffdb6733af1 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -18,59 +18,6 @@
 #include <linux/string.h>
 #include <linux/buffer_head.h>

-#if 0
-static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
-				uint8_t ad_size, struct kernel_lb_addr fe_loc,
-				int *pos, int *offset, struct buffer_head **bh,
-				int *error)
-{
-	int loffset = *offset;
-	int block;
-	uint8_t *ad;
-	int remainder;
-
-	*error = 0;
-
-	ad = (uint8_t *)(*bh)->b_data + *offset;
-	*offset += ad_size;
-
-	if (!ad) {
-		brelse(*bh);
-		*error = 1;
-		return NULL;
-	}
-
-	if (*offset == dir->i_sb->s_blocksize) {
-		brelse(*bh);
-		block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
-		if (!block)
-			return NULL;
-		*bh = udf_tread(dir->i_sb, block);
-		if (!*bh)
-			return NULL;
-	} else if (*offset > dir->i_sb->s_blocksize) {
-		ad = tmpad;
-
-		remainder = dir->i_sb->s_blocksize - loffset;
-		memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder);
-
-		brelse(*bh);
-		block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
-		if (!block)
-			return NULL;
-		(*bh) = udf_tread(dir->i_sb, block);
-		if (!*bh)
-			return NULL;
-
-		memcpy((uint8_t *)ad + remainder, (*bh)->b_data,
-			ad_size - remainder);
-		*offset = ad_size - remainder;
-	}
-
-	return ad;
-}
-#endif
-
 struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 					 struct udf_fileident_bh *fibh,
 					 struct fileIdentDesc *cfi,
@@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
 	return fi;
 }

-#if 0
-static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
-{
-	struct extent_ad *ext;
-	struct fileEntry *fe;
-	uint8_t *ptr;
-
-	if ((!buffer) || (!offset)) {
-		printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n");
-		return NULL;
-	}
-
-	fe = (struct fileEntry *)buffer;
-
-	if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) {
-		udf_debug("0x%x != TAG_IDENT_FE\n",
-			  le16_to_cpu(fe->descTag.tagIdent));
-		return NULL;
-	}
-
-	ptr = (uint8_t *)(fe->extendedAttr) +
-		le32_to_cpu(fe->lengthExtendedAttr);
-
-	if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
-		ptr += *offset;
-
-	ext = (struct extent_ad *)ptr;
-
-	*offset = *offset + sizeof(struct extent_ad);
-	return ext;
-}
-#endif
-
 struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
 				     int inc)
 {
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7464305382b5..b80cbd78833c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 static int udf_release_file(struct inode *inode, struct file *filp)
 {
 	if (filp->f_mode & FMODE_WRITE) {
+		mutex_lock(&inode->i_mutex);
 		lock_kernel();
 		udf_discard_prealloc(inode);
 		unlock_kernel();
+		mutex_unlock(&inode->i_mutex);
 	}
 	return 0;
 }
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e7533f785636..6d24c2c63f93 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -90,19 +90,16 @@ no_delete:
 }

 /*
- * If we are going to release inode from memory, we discard preallocation and
- * truncate last inode extent to proper length. We could use drop_inode() but
- * it's called under inode_lock and thus we cannot mark inode dirty there. We
- * use clear_inode() but we have to make sure to write inode as it's not written
- * automatically.
+ * If we are going to release inode from memory, we truncate last inode extent
+ * to proper length. We could use drop_inode() but it's called under inode_lock
+ * and thus we cannot mark inode dirty there. We use clear_inode() but we have
+ * to make sure to write inode as it's not written automatically.
 */
 void udf_clear_inode(struct inode *inode)
 {
 	struct udf_inode_info *iinfo;
 	if (!(inode->i_sb->s_flags & MS_RDONLY)) {
 		lock_kernel();
-		/* Discard preallocation for directories, symlinks, etc. */
-		udf_discard_prealloc(inode);
 		udf_truncate_tail_extent(inode);
 		unlock_kernel();
 		write_inode_now(inode, 0);
@@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 	udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);

 #ifdef UDF_PREALLOCATE
-	/* preallocate blocks */
-	udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
+	/* We preallocate blocks only for regular files. It also makes sense
+	 * for directories but there's a problem when to drop the
+	 * preallocation. We might use some delayed work for that but I feel
+	 * it's overengineering for a filesystem like UDF. */
+	if (S_ISREG(inode->i_mode))
+		udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
 #endif

 	/* merge any continuous blocks in laarr */
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 1b88fd5df05d..43e24a3b8e10 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb)
 	ms_info.addr_format = CDROM_LBA;
 	i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info);

-#define WE_OBEY_THE_WRITTEN_STANDARDS 1
-
 	if (i == 0) {
 		udf_debug("XA disk: %s, vol_desc_start=%d\n",
 			  (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
-#if WE_OBEY_THE_WRITTEN_STANDARDS
 		if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
-#endif
 			vol_desc_start = ms_info.addr.lba;
 	} else {
 		udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6a29fa34c478..21dad8c608f9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 		pc->componentType = 1;
 		pc->lengthComponentIdent = 0;
 		pc->componentFileVersionNum = 0;
-		pc += sizeof(struct pathComponent);
 		elen += sizeof(struct pathComponent);
 	}

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6832135159b6..9d1b8c2e6c45 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1087,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 	struct udf_inode_info *vati;
 	uint32_t pos;
 	struct virtualAllocationTable20 *vat20;
+	sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;

 	/* VAT file entry is in the last recorded block */
 	ino.partitionReferenceNum = type1_index;
 	ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
 	sbi->s_vat_inode = udf_iget(sb, &ino);
+	if (!sbi->s_vat_inode &&
+	    sbi->s_last_block != blocks - 1) {
+		printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
+		       " last recorded block (%lu), retrying with the last "
+		       "block of the device (%lu).\n",
+		       (unsigned long)sbi->s_last_block,
+		       (unsigned long)blocks - 1);
+		ino.partitionReferenceNum = type1_index;
+		ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
+		sbi->s_vat_inode = udf_iget(sb, &ino);
+	}
 	if (!sbi->s_vat_inode)
 		return 1;

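The retry is needed because sbi->s_last_block, as reported by the drive, may not be the true last block of the medium; on such media the VAT file entry can instead sit at the device's physical end, which the new "blocks" computation derives from the block device size. Schematically, with try_vat_at() standing in for the udf_iget() setup above (a hypothetical helper, not a UDF function):

	/* Hedged sketch of the fallback: probe the recorded location first,
	 * then retry at the device's physically last block. */
	static struct inode *load_vat_with_fallback(struct super_block *sb,
						    sector_t recorded,
						    sector_t last)
	{
		struct inode *vat = try_vat_at(sb, recorded);	/* assumed helper */

		if (!vat && recorded != last)
			vat = try_vat_at(sb, last);
		return vat;
	}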
diff --git a/fs/xattr.c b/fs/xattr.c
index 1c3d0af59ddf..6d4f6d3449fb 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 	return inode_permission(inode, mask);
 }

-int
-vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		size_t size, int flags)
+/**
+ *  __vfs_setxattr_noperm - perform setxattr operation without performing
+ *  permission checks.
+ *
+ *  @dentry - object to perform setxattr on
+ *  @name - xattr name to set
+ *  @value - value to set @name to
+ *  @size - size of @value
+ *  @flags - flags to pass into filesystem operations
+ *
+ *  returns the result of the internal setxattr or setsecurity operations.
+ *
+ *  This function requires the caller to lock the inode's i_mutex before it
+ *  is executed. It also assumes that the caller will make the appropriate
+ *  permission checks.
+ */
+int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
-	int error;
-
-	error = xattr_permission(inode, name, MAY_WRITE);
-	if (error)
-		return error;
+	int error = -EOPNOTSUPP;

-	mutex_lock(&inode->i_mutex);
-	error = security_inode_setxattr(dentry, name, value, size, flags);
-	if (error)
-		goto out;
-	error = -EOPNOTSUPP;
 	if (inode->i_op->setxattr) {
 		error = inode->i_op->setxattr(dentry, name, value, size, flags);
 		if (!error) {
@@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		if (!error)
 			fsnotify_xattr(dentry);
 	}
+
+	return error;
+}
+
+
+int
+vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = xattr_permission(inode, name, MAY_WRITE);
+	if (error)
+		return error;
+
+	mutex_lock(&inode->i_mutex);
+	error = security_inode_setxattr(dentry, name, value, size, flags);
+	if (error)
+		goto out;
+
+	error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
+
 out:
 	mutex_unlock(&inode->i_mutex);
 	return error;
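Splitting vfs_setxattr() this way lets code that has already done its own permission checking, typically a security module relabeling an inode, reuse the filesystem plumbing without recursing into security_inode_setxattr(). A hedged sketch of such a caller (the function itself is illustrative; only __vfs_setxattr_noperm() and its locking contract come from the patch):

	/* Set a security label on an inode we already validated ourselves;
	 * __vfs_setxattr_noperm() must be called with i_mutex held. */
	static int relabel_locked(struct dentry *dentry, const char *name,
				  const void *ctx, size_t len)
	{
		struct inode *inode = dentry->d_inode;
		int err;

		mutex_lock(&inode->i_mutex);
		err = __vfs_setxattr_noperm(dentry, name, ctx, len, 0);
		mutex_unlock(&inode->i_mutex);
		return err;
	}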
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7ec89fc05b2b..d5e5559e31db 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
 	if (ip->i_d.di_size < isize) {
 		ip->i_d.di_size = isize;
 		ip->i_update_core = 1;
-		ip->i_update_size = 1;
 		xfs_mark_inode_dirty_sync(ip);
 	}

@@ -1268,6 +1267,14 @@ xfs_vm_writepage(
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << inode->i_blkbits, 0);

+
+	/*
+	 * VM calculation for nr_to_write seems off. Bump it way
+	 * up, this gets simple streaming writes zippy again.
+	 * To be reviewed again after Jens' writeback changes.
+	 */
+	wbc->nr_to_write *= 4;
+
 	/*
 	 * Convert delayed allocate, unwritten or unmapped space
 	 * to real space and flush out to disk.
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 0c93c7ef3d18..965df1227d64 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -770,7 +770,7 @@ xfs_buf_associate_memory(
 	bp->b_pages = NULL;
 	bp->b_addr = mem;

-	rval = _xfs_buf_get_pages(bp, page_count, 0);
+	rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
 	if (rval)
 		return rval;

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..988d8f87bc0f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -172,12 +172,21 @@ xfs_file_release(
  */
 STATIC int
 xfs_file_fsync(
-	struct file	*filp,
+	struct file	*file,
 	struct dentry	*dentry,
 	int		datasync)
 {
-	xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
-	return -xfs_fsync(XFS_I(dentry->d_inode));
+	struct inode	*inode = dentry->d_inode;
+	struct xfs_inode *ip = XFS_I(inode);
+	int		error;
+
+	/* capture size updates in I/O completion before writing the inode. */
+	error = filemap_fdatawait(inode->i_mapping);
+	if (error)
+		return error;
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	return -xfs_fsync(ip);
 }

 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0882d166239a..eafcc7c18706 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_GETVERSION_32:
 		cmd = _NATIVE_IOC(cmd, long);
 		return xfs_file_ioctl(filp, cmd, p);
-	case XFS_IOC_SWAPEXT: {
+	case XFS_IOC_SWAPEXT_32: {
 		struct xfs_swapext	  sxp;
 		struct compat_xfs_swapext __user *sxu = arg;

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 58973bb46038..da0159d99f82 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
@@ -485,14 +484,6 @@ xfs_vn_put_link(
 }

 STATIC int
-xfs_vn_permission(
-	struct inode	*inode,
-	int		mask)
-{
-	return generic_permission(inode, mask, xfs_check_acl);
-}
-
-STATIC int
 xfs_vn_getattr(
 	struct vfsmount		*mnt,
 	struct dentry		*dentry,
@@ -680,8 +671,8 @@ xfs_vn_fiemap(
 	else
 		bm.bmv_length = BTOBB(length);

-	/* our formatter will tell xfs_getbmap when to stop. */
-	bm.bmv_count = MAXEXTNUM;
+	/* We add one because in getbmap world count includes the header */
+	bm.bmv_count = fieinfo->fi_extents_max + 1;
 	bm.bmv_iflags = BMV_IF_PREALLOC;
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
 		bm.bmv_iflags |= BMV_IF_ATTRFORK;
@@ -696,7 +687,7 @@ xfs_vn_fiemap(
 }

 static const struct inode_operations xfs_inode_operations = {
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.truncate		= xfs_vn_truncate,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
@@ -724,7 +715,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
 	.rename			= xfs_vn_rename,
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
@@ -749,7 +740,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
 	.rename			= xfs_vn_rename,
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
@@ -762,7 +753,7 @@ static const struct inode_operations xfs_symlink_inode_operations = {
 	.readlink		= generic_readlink,
 	.follow_link		= xfs_vn_follow_link,
 	.put_link		= xfs_vn_put_link,
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,18 +812,21 @@ write_retry:

 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+		loff_t end = pos + ret - 1;
 		int error2;

 		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
-		error2 = sync_page_range(inode, mapping, pos, ret);
+
+		error2 = filemap_write_and_wait_range(mapping, pos, end);
 		if (!error)
 			error = error2;
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
 		xfs_ilock(xip, iolock);
-		error2 = xfs_write_sync_logforce(mp, xip);
+
+		error2 = xfs_fsync(xip);
 		if (!error)
 			error = error2;
 	}
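sync_page_range() both wrote and waited with its own i_mutex handling, which no longer fits the per-bdi writeback model; the replacement pattern is to flush and wait on exactly the written byte range, then run the filesystem's ordinary fsync path for the log/metadata side. A sketch of that tail in isolation (the helper name is assumed, not an XFS function; the three-argument vfs_fsync() shown is the 2.6.3x-era signature):

	/* Hedged sketch: O_SYNC completion for a write of `ret` bytes at
	 * `pos`, using range writeback plus a normal fsync. */
	static ssize_t o_sync_write_tail(struct file *file, loff_t pos,
					 ssize_t ret)
	{
		struct address_space *mapping = file->f_mapping;
		int err;

		if (ret <= 0)
			return ret;
		err = filemap_write_and_wait_range(mapping, pos, pos + ret - 1);
		if (err)
			return err;
		err = vfs_fsync(file, file->f_path.dentry, 0);
		return err ? err : ret;
	}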
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@

 DEFINE_PER_CPU(struct xfsstats, xfsstats);

-STATIC int
-xfs_read_xfsstats(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
+static int xfs_stat_proc_show(struct seq_file *m, void *v)
 {
-	int		c, i, j, len, val;
+	int		c, i, j, val;
 	__uint64_t	xs_xstrat_bytes = 0;
 	__uint64_t	xs_write_bytes = 0;
 	__uint64_t	xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
 	};

 	/* Loop over all stats groups */
-	for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
-		len += sprintf(buffer + len, "%s", xstats[i].desc);
+	for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+		seq_printf(m, "%s", xstats[i].desc);
 		/* inner loop does each group */
 		while (j < xstats[i].endpoint) {
 			val = 0;
 			/* sum over all cpus */
 			for_each_possible_cpu(c)
 				val += *(((__u32*)&per_cpu(xfsstats, c) + j));
-			len += sprintf(buffer + len, " %u", val);
+			seq_printf(m, " %u", val);
 			j++;
 		}
-		buffer[len++] = '\n';
+		seq_putc(m, '\n');
 	}
 	/* extra precision counters */
 	for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
 		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
 	}

-	len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
+	seq_printf(m, "xpc %Lu %Lu %Lu\n",
 			xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
-	len += sprintf(buffer + len, "debug %u\n",
+	seq_printf(m, "debug %u\n",
 #if defined(DEBUG)
 		1);
 #else
 		0);
 #endif
+	return 0;
+}

-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
+static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xfs_stat_proc_show, NULL);
 }

+static const struct file_operations xfs_stat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xfs_stat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 int
 xfs_init_procfs(void)
 {
 	if (!proc_mkdir("fs/xfs", NULL))
 		goto out;

-	if (!create_proc_read_entry("fs/xfs/stat", 0, NULL,
-				    xfs_read_xfsstats, NULL))
+	if (!proc_create("fs/xfs/stat", 0, NULL,
+			 &xfs_stat_proc_fops))
 		goto out_remove_entry;
 	return 0;

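This is the standard conversion away from the old create_proc_read_entry() read_proc interface, whose offset/count/*eof bookkeeping every user had to duplicate (the removed tail above is exactly that boilerplate); single_open() hands all of it to seq_file. The same shape reappears in xfs_qm_stats.c below. A minimal self-contained sketch of the pattern, with placeholder names and output:

	#include <linux/module.h>
	#include <linux/proc_fs.h>
	#include <linux/seq_file.h>

	static int demo_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "hello %d\n", 42);	/* seq_file buffers for us */
		return 0;
	}

	static int demo_open(struct inode *inode, struct file *file)
	{
		return single_open(file, demo_show, NULL);
	}

	static const struct file_operations demo_fops = {
		.owner		= THIS_MODULE,
		.open		= demo_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};

	static int __init demo_init(void)
	{
		return proc_create("demo_stats", 0, NULL, &demo_fops) ? 0 : -ENOMEM;
	}

seq_read() and seq_lseek() handle partial reads and seeking; single_release() frees the seq_file state, so the show routine only ever has to produce the whole record.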
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..5d7c60ac77b4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -579,15 +579,19 @@ xfs_showargs(
 	else if (mp->m_qflags & XFS_UQUOTA_ACCT)
 		seq_puts(m, "," MNTOPT_UQUOTANOENF);

-	if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
-		seq_puts(m, "," MNTOPT_PRJQUOTA);
-	else if (mp->m_qflags & XFS_PQUOTA_ACCT)
-		seq_puts(m, "," MNTOPT_PQUOTANOENF);
-
-	if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD))
-		seq_puts(m, "," MNTOPT_GRPQUOTA);
-	else if (mp->m_qflags & XFS_GQUOTA_ACCT)
-		seq_puts(m, "," MNTOPT_GQUOTANOENF);
+	/* Either project or group quotas can be active, not both */
+
+	if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_OQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_PRJQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_PQUOTANOENF);
+	} else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_OQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_GRPQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_GQUOTANOENF);
+	}

 	if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
 		seq_puts(m, "," MNTOPT_NOQUOTA);
@@ -687,7 +691,7 @@ xfs_barrier_test(
 	return error;
 }

-void
+STATIC void
 xfs_mountfs_check_barriers(xfs_mount_t *mp)
 {
 	int		error;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b619d6b8ca43..320be6aea492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
 	return 0;
 }

+void
+__xfs_inode_set_reclaim_tag(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
+{
+	radix_tree_tag_set(&pag->pag_ici_root,
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			   XFS_ICI_RECLAIM_TAG);
+}
+
 /*
  * We set the inode flag atomically with the radix tree tag.
  * Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(

 	read_lock(&pag->pag_ici_lock);
 	spin_lock(&ip->i_flags_lock);
-	radix_tree_tag_set(&pag->pag_ici_root,
-			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	__xfs_inode_set_reclaim_tag(pag, ip);
 	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 	spin_unlock(&ip->i_flags_lock);
 	read_unlock(&pag->pag_ici_lock);
@@ -740,21 +749,6 @@ __xfs_inode_clear_reclaim_tag(
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
 }

-void
-xfs_inode_clear_reclaim_tag(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
-
-	read_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	__xfs_inode_clear_reclaim_tag(mp, pag, ip);
-	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
-}
-
 STATIC int
 xfs_reclaim_inode_now(
 	struct xfs_inode	*ip,
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 2a10301c99c7..27920eb7a820 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -48,7 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);

 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 				struct xfs_inode *ip);

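The new __xfs_inode_set_reclaim_tag() follows the usual kernel double-underscore convention: the bare-named function acquires pag_ici_lock and i_flags_lock, while the __-prefixed variant assumes the caller already holds them, so the radix-tree tagging can be reused from contexts that take those locks themselves. A generic sketch of the convention, with all names hypothetical:

	#include <linux/spinlock.h>

	struct bucket {
		spinlock_t	lock;
		unsigned long	tags;
	};

	/* __-prefixed variant: caller must hold b->lock. */
	static void __bucket_set_tag(struct bucket *b, unsigned long tag)
	{
		b->tags |= tag;		/* protected by b->lock */
	}

	/* Plain-named wrapper: takes the lock around the __ helper. */
	static void bucket_set_tag(struct bucket *b, unsigned long tag)
	{
		spin_lock(&b->lock);
		__bucket_set_tag(b, tag);
		spin_unlock(&b->lock);
	}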
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@

 struct xqmstats xqmstats;

-STATIC int
-xfs_qm_read_xfsquota(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
+static int xqm_proc_show(struct seq_file *m, void *v)
 {
-	int		len;
-
 	/* maximum; incore; ratio free to inuse; freelist */
-	len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
+	seq_printf(m, "%d\t%d\t%d\t%u\n",
 			ndquot,
 			xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
 			xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
 			xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
-
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
+	return 0;
 }

-STATIC int
-xfs_qm_read_stats(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
+static int xqm_proc_open(struct inode *inode, struct file *file)
 {
-	int		len;
+	return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xqm_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};

+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
 	/* quota performance statistics */
-	len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
+	seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
 			xqmstats.xs_qm_dqreclaims,
 			xqmstats.xs_qm_dqreclaim_misses,
 			xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
 			xqmstats.xs_qm_dqwants,
 			xqmstats.xs_qm_dqshake_reclaims,
 			xqmstats.xs_qm_dqinact_reclaims);
+	return 0;
+}

-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xqmstat_proc_show, NULL);
 }

+static const struct file_operations xqmstat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xqmstat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 void
 xfs_qm_init_procfs(void)
 {
-	create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
-	create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
+	proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
+	proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
 }

 void
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
 	xfs_agino_t	pagi_count;	/* number of allocated inodes */
 	int		pagb_count;	/* pagb slots in use */
 	xfs_perag_busy_t *pagb_list;	/* unstable blocks */
+
+	/*
+	 * Inode allocation search lookup optimisation.
+	 * If the pagino matches, the search for new inodes
+	 * doesn't need to search the near ones again straight away
+	 */
+	xfs_agino_t	pagl_pagino;
+	xfs_agino_t	pagl_leftrec;
+	xfs_agino_t	pagl_rightrec;
 #ifdef __KERNEL__
 	spinlock_t	pagb_lock;	/* lock for pagb_list */

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index db15feb906ff..4ece1906bd41 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 		blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 		error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-				     blkcnt, XFS_BUF_LOCK, &bp);
+				     blkcnt,
+				     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+				     &bp);
 		if (error)
 			return(error);

@@ -2141,8 +2143,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);

-		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
-				       blkcnt, XFS_BUF_LOCK);
+		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
+				       XFS_BUF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 7928b9983c1d..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
  * entry (null if none).  Else, *lastxp will be set to the index
  * of the found entry; *gotp will contain the entry.
  */
-xfs_bmbt_rec_host_t *			/* pointer to found extent entry */
+STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
 xfs_bmap_search_multi_extents(
 	xfs_ifork_t	*ifp,		/* inode fork pointer */
 	xfs_fileoff_t	bno,		/* block number searched for */
@@ -6009,7 +6009,7 @@ xfs_getbmap(
 	 */
 	error = ENOMEM;
 	subnex = 16;
-	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
 	if (!map)
 		goto out_unlock_ilock;

diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
 	int			whichfork,
 	int			*count);

-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry.  If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none).  Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-xfs_bmbt_rec_host_t *
-xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
-			xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
-
 #endif /* __KERNEL__ */

 #endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
 				ext_flag);
 }

-/* Endian flipping versions of the bmbt extraction functions */
-void
-xfs_bmbt_disk_get_all(
-	xfs_bmbt_rec_t	*r,
-	xfs_bmbt_irec_t *s)
-{
-	__xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
-				get_unaligned_be64(&r->l1), s);
-}
-
 /*
  * Extract the blockcount field from an on disk bmap extent record.
  */
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
 	*l1 = 0;
 }

+/* Endian flipping versions of the bmbt extraction functions */
+STATIC void
+xfs_bmbt_disk_get_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s)
+{
+	__xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+				get_unaligned_be64(&r->l1), s);
+}
+
 STATIC void
 xfs_bmbt_trace_record(
 	struct xfs_btree_cur	*cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
 extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
 extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);

-extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
 extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);

diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e9df99574829..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -120,8 +120,8 @@ xfs_btree_check_sblock(
 			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 		if (bp)
 			xfs_buftrace("SBTREE ERROR", bp);
-		XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
-				 cur->bc_mp);
+		XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
+			XFS_ERRLEVEL_LOW, cur->bc_mp, block);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 	return 0;
@@ -646,46 +646,6 @@ xfs_btree_read_bufl(
 }

 /*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int					/* error */
-xfs_btree_read_bufs(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* allocation group block number */
-	uint		lock,		/* lock flags for read_buf */
-	xfs_buf_t	**bpp,		/* buffer for agno/agbno */
-	int		refval)		/* ref count value for buffer */
-{
-	xfs_buf_t	*bp;		/* return value */
-	xfs_daddr_t	d;		/* real disk block address */
-	int		error;
-
-	ASSERT(agno != NULLAGNUMBER);
-	ASSERT(agbno != NULLAGBLOCK);
-	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-					mp->m_bsize, lock, &bp))) {
-		return error;
-	}
-	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-	if (bp != NULL) {
-		switch (refval) {
-		case XFS_ALLOC_BTREE_REF:
-			XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
-			break;
-		case XFS_INO_BTREE_REF:
-			XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
-			break;
-		}
-	}
-	*bpp = bp;
-	return 0;
-}
-
-/*
  * Read-ahead the block, don't wait for it, don't return a buffer.
  * Long-form addressing.
  */
@@ -2951,7 +2911,7 @@ error0:
  * inode we have to copy the single block it was pointing to into the
  * inode.
  */
-int
+STATIC int
xfs_btree_kill_iroot(
 	struct xfs_btree_cur	*cur)
 {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -379,20 +379,6 @@ xfs_btree_read_bufl(
 	int			refval);/* ref count value for buffer */

 /*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int					/* error */
-xfs_btree_read_bufs(
-	struct xfs_mount	*mp,	/* file system mount point */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	xfs_agblock_t		agbno,	/* allocation group block number */
-	uint			lock,	/* lock flags for read_buf */
-	struct xfs_buf		**bpp,	/* buffer for agno/agbno */
-	int			refval);/* ref count value for buffer */
-
-/*
  * Read-ahead the block, don't wait for it, don't return a buffer.
  * Long-form addressing.
  */
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
 int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
 int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
 int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
-int xfs_btree_kill_iroot(struct xfs_btree_cur *);
 int xfs_btree_insert(struct xfs_btree_cur *, int *);
 int xfs_btree_delete(struct xfs_btree_cur *, int *);
 int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9ff6e57a5075..2847bbc1c534 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone;		/* dabuf zone */
 xfs_da_state_t *
 xfs_da_state_alloc(void)
 {
-	return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
 }

 /*
@@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
 	int		off;

 	if (nbuf == 1)
-		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS);
 	else
-		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS);
 	dabuf->dirty = 0;
 #ifdef XFS_DABUF_DEBUG
 	dabuf->ra = ra;
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index c657bec6d951..bb1d58eb3982 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -256,7 +256,7 @@ xfs_dir_cilookup_result(
 			!(args->op_flags & XFS_DA_OP_CILOOKUP))
 		return EEXIST;

-	args->value = kmem_alloc(len, KM_MAYFAIL);
+	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
 	if (!args->value)
 		return ENOMEM;

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cbd451bb4848..2d0b3e1da9e6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,17 +167,25 @@ xfs_growfs_data_private(
 	new = nb - mp->m_sb.sb_dblocks;
 	oagcount = mp->m_sb.sb_agcount;
 	if (nagcount > oagcount) {
+		void *new_perag, *old_perag;
+
 		xfs_filestream_flush(mp);
+
+		new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
+					KM_MAYFAIL);
+		if (!new_perag)
+			return XFS_ERROR(ENOMEM);
+
 		down_write(&mp->m_peraglock);
-		mp->m_perag = kmem_realloc(mp->m_perag,
-			sizeof(xfs_perag_t) * nagcount,
-			sizeof(xfs_perag_t) * oagcount,
-			KM_SLEEP);
-		memset(&mp->m_perag[oagcount], 0,
-			(nagcount - oagcount) * sizeof(xfs_perag_t));
+		memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
+		old_perag = mp->m_perag;
+		mp->m_perag = new_perag;
+
 		mp->m_flags |= XFS_MOUNT_32BITINODES;
 		nagimax = xfs_initialize_perag(mp, nagcount);
 		up_write(&mp->m_peraglock);
+
+		kmem_free(old_perag);
 	}
 	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
 	tp->t_flags |= XFS_TRANS_RESERVE;
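Growing the per-AG array in place with kmem_realloc() under m_peraglock is fragile: the sleeping allocation happens while the write lock is held. The hunk above switches to allocating the larger array up front, copying and publishing it under the lock, and freeing the old array only after the swap. A generic sketch of that resize-by-swap pattern, with all names hypothetical:

	#include <linux/slab.h>
	#include <linux/rwsem.h>
	#include <linux/string.h>

	struct entry { unsigned long data; };

	struct table {
		struct rw_semaphore	lock;
		struct entry		*entries;
		size_t			count;
	};

	/* Allocate outside the lock, publish inside it, free afterwards. */
	static int grow_table(struct table *t, size_t newcount)
	{
		struct entry *fresh, *stale;

		fresh = kzalloc(newcount * sizeof(*fresh), GFP_KERNEL);
		if (!fresh)
			return -ENOMEM;

		down_write(&t->lock);
		memcpy(fresh, t->entries, t->count * sizeof(*fresh));
		stale = t->entries;
		t->entries = fresh;	/* readers now see the new array */
		t->count = newcount;
		up_write(&t->lock);

		kfree(stale);		/* old array freed after the swap */
		return 0;
	}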
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..ab64f3efb43b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment(
 }

 /*
- * Lookup the record equal to ino in the btree given by cur.
- */
-STATIC int				/* error */
-xfs_inobt_lookup_eq(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free,	/* free inode mask */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = fcnt;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
  */
 int					/* error */
-xfs_inobt_lookup_ge(
+xfs_inobt_lookup(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free,	/* free inode mask */
+	xfs_lookup_t		dir,	/* <=, >=, == */
 	int			*stat)	/* success/failure */
 {
 	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = fcnt;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+	cur->bc_rec.i.ir_freecount = 0;
+	cur->bc_rec.i.ir_free = 0;
+	return xfs_btree_lookup(cur, dir, stat);
 }

 /*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int					/* error */
-xfs_inobt_lookup_le(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free,	/* free inode mask */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = fcnt;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
-}
-
-/*
- * Update the record referred to by cur to the value given
- * by [ino, fcnt, free].
+ * Update the record referred to by cur to the value given.
  * This either works (return 0) or gets an EFSCORRUPTED error.
  */
 STATIC int				/* error */
 xfs_inobt_update(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free)	/* free inode mask */
+	xfs_inobt_rec_incore_t	*irec)	/* btree record */
 {
 	union xfs_btree_rec	rec;

-	rec.inobt.ir_startino = cpu_to_be32(ino);
-	rec.inobt.ir_freecount = cpu_to_be32(fcnt);
-	rec.inobt.ir_free = cpu_to_be64(free);
+	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+	rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
 	return xfs_btree_update(cur, &rec);
 }

@@ -135,9 +95,7 @@ xfs_inobt_update(
 int					/* error */
 xfs_inobt_get_rec(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		*ino,	/* output: starting inode of chunk */
-	__int32_t		*fcnt,	/* output: number of free inodes */
-	xfs_inofree_t		*free,	/* output: free inode mask */
+	xfs_inobt_rec_incore_t	*irec,	/* btree record */
 	int			*stat)	/* output: success/failure */
 {
 	union xfs_btree_rec	*rec;
@@ -145,14 +103,136 @@ xfs_inobt_get_rec(

 	error = xfs_btree_get_rec(cur, &rec, stat);
 	if (!error && *stat == 1) {
-		*ino = be32_to_cpu(rec->inobt.ir_startino);
-		*fcnt = be32_to_cpu(rec->inobt.ir_freecount);
-		*free = be64_to_cpu(rec->inobt.ir_free);
+		irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+		irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+		irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
 	}
 	return error;
 }

 /*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+	struct xfs_btree_cur	*cur,
+	struct xfs_agi		*agi)
+{
+	if (cur->bc_nlevels == 1) {
+		xfs_inobt_rec_incore_t rec;
+		int		freecount = 0;
+		int		error;
+		int		i;
+
+		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+		if (error)
+			return error;
+
+		do {
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
+				return error;
+
+			if (i) {
+				freecount += rec.ir_freecount;
+				error = xfs_btree_increment(cur, 0, &i);
+				if (error)
+					return error;
+			}
+		} while (i == 1);
+
+		if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+			ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+	}
+	return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi)	0
+#endif
+
+/*
+ * Initialise a new set of inodes.
+ */
+STATIC void
+xfs_ialloc_inode_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		length,
+	unsigned int		gen)
+{
+	struct xfs_buf		*fbuf;
+	struct xfs_dinode	*free;
+	int			blks_per_cluster, nbufs, ninodes;
+	int			version;
+	int			i, j;
+	xfs_daddr_t		d;
+
+	/*
+	 * Loop over the new block(s), filling in the inodes.
+	 * For small block sizes, manipulate the inodes in buffers
+	 * which are multiples of the blocks size.
+	 */
+	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
+		blks_per_cluster = 1;
+		nbufs = length;
+		ninodes = mp->m_sb.sb_inopblock;
+	} else {
+		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
+				   mp->m_sb.sb_blocksize;
+		nbufs = length / blks_per_cluster;
+		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
+	}
+
+	/*
+	 * Figure out what version number to use in the inodes we create.
+	 * If the superblock version has caught up to the one that supports
+	 * the new inode format, then use the new inode version.  Otherwise
+	 * use the old version so that old kernels will continue to be
+	 * able to use the file system.
+	 */
+	if (xfs_sb_version_hasnlink(&mp->m_sb))
+		version = 2;
+	else
+		version = 1;
+
+	for (j = 0; j < nbufs; j++) {
+		/*
+		 * Get the block.
+		 */
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					 mp->m_bsize * blks_per_cluster,
+					 XFS_BUF_LOCK);
+		ASSERT(fbuf);
+		ASSERT(!XFS_BUF_GETERROR(fbuf));
+
+		/*
+		 * Initialize all inodes in this buffer and then log them.
+		 *
+		 * XXX: It would be much better if we had just one transaction
+		 *	to log a whole cluster of inodes instead of all the
+		 *	individual transactions causing a lot of log traffic.
+		 */
+		xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+		for (i = 0; i < ninodes; i++) {
+			int	ioffset = i << mp->m_sb.sb_inodelog;
+			uint	isize = sizeof(struct xfs_dinode);
+
+			free = xfs_make_iptr(mp, fbuf, i);
+			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+			free->di_version = version;
+			free->di_gen = cpu_to_be32(gen);
+			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+			xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+		}
+		xfs_trans_inode_alloc_buf(tp, fbuf);
+	}
+}
+
+/*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
  */
@@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc(
 {
 	xfs_agi_t	*agi;		/* allocation group header */
 	xfs_alloc_arg_t	args;		/* allocation argument structure */
-	int		blks_per_cluster;  /* fs blocks per inode cluster */
 	xfs_btree_cur_t	*cur;		/* inode btree cursor */
-	xfs_daddr_t	d;		/* disk addr of buffer */
 	xfs_agnumber_t	agno;
 	int		error;
-	xfs_buf_t	*fbuf;		/* new free inodes' buffer */
-	xfs_dinode_t	*free;		/* new free inode structure */
-	int		i;		/* inode counter */
-	int		j;		/* block counter */
-	int		nbufs;		/* num bufs of new inodes */
+	int		i;
 	xfs_agino_t	newino;		/* new first inode's number */
 	xfs_agino_t	newlen;		/* new number of inodes */
-	int		ninodes;	/* num inodes per buf */
 	xfs_agino_t	thisino;	/* current inode number, for loop */
-	int		version;	/* inode version number to use */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
-	unsigned int	gen;

 	args.tp = tp;
 	args.mp = tp->t_mountp;
@@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc(
 	 */
 	agi = XFS_BUF_TO_AGI(agbp);
 	newino = be32_to_cpu(agi->agi_newino);
+	agno = be32_to_cpu(agi->agi_seqno);
 	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
 			XFS_IALLOC_BLOCKS(args.mp);
 	if (likely(newino != NULLAGINO &&
 		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
-		args.fsbno = XFS_AGB_TO_FSB(args.mp,
-				be32_to_cpu(agi->agi_seqno), args.agbno);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		args.type = XFS_ALLOCTYPE_THIS_BNO;
 		args.mod = args.total = args.wasdel = args.isfl =
 			args.userdata = args.minalignslop = 0;
@@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc(
 		 * For now, just allocate blocks up front.
 		 */
 		args.agbno = be32_to_cpu(agi->agi_root);
-		args.fsbno = XFS_AGB_TO_FSB(args.mp,
-				be32_to_cpu(agi->agi_seqno), args.agbno);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		/*
 		 * Allocate a fixed-size extent of inodes.
 		 */
@@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc(
 	if (isaligned && args.fsbno == NULLFSBLOCK) {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 		args.agbno = be32_to_cpu(agi->agi_root);
-		args.fsbno = XFS_AGB_TO_FSB(args.mp,
-			be32_to_cpu(agi->agi_seqno), args.agbno);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		args.alignment = xfs_ialloc_cluster_alignment(&args);
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
@@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc(
 		return 0;
 	}
 	ASSERT(args.len == args.minlen);
-	/*
-	 * Convert the results.
-	 */
-	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
-	/*
-	 * Loop over the new block(s), filling in the inodes.
-	 * For small block sizes, manipulate the inodes in buffers
-	 * which are multiples of the blocks size.
-	 */
-	if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
-		blks_per_cluster = 1;
-		nbufs = (int)args.len;
-		ninodes = args.mp->m_sb.sb_inopblock;
-	} else {
-		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
-				   args.mp->m_sb.sb_blocksize;
-		nbufs = (int)args.len / blks_per_cluster;
-		ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
-	}
-	/*
-	 * Figure out what version number to use in the inodes we create.
-	 * If the superblock version has caught up to the one that supports
-	 * the new inode format, then use the new inode version. Otherwise
-	 * use the old version so that old kernels will continue to be
321 * able to use the file system.
322 */
323 if (xfs_sb_version_hasnlink(&args.mp->m_sb))
324 version = 2;
325 else
326 version = 1;
327 366
328 /* 367 /*
368 * Stamp and write the inode buffers.
369 *
329 * Seed the new inode cluster with a random generation number. This 370 * Seed the new inode cluster with a random generation number. This
330 * prevents short-term reuse of generation numbers if a chunk is 371 * prevents short-term reuse of generation numbers if a chunk is
331 * freed and then immediately reallocated. We use random numbers 372 * freed and then immediately reallocated. We use random numbers
332 * rather than a linear progression to prevent the next generation 373 * rather than a linear progression to prevent the next generation
333 * number from being easily guessable. 374 * number from being easily guessable.
334 */ 375 */
335 gen = random32(); 376 xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
336 for (j = 0; j < nbufs; j++) { 377 random32());
337 /*
338 * Get the block.
339 */
340 d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
341 args.agbno + (j * blks_per_cluster));
342 fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
343 args.mp->m_bsize * blks_per_cluster,
344 XFS_BUF_LOCK);
345 ASSERT(fbuf);
346 ASSERT(!XFS_BUF_GETERROR(fbuf));
347 378
348 /* 379 /*
349 * Initialize all inodes in this buffer and then log them. 380 * Convert the results.
350 * 381 */
351 * XXX: It would be much better if we had just one transaction to 382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic.
354 */
355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
356 for (i = 0; i < ninodes; i++) {
357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode);
359
360 free = xfs_make_iptr(args.mp, fbuf, i);
361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
362 free->di_version = version;
363 free->di_gen = cpu_to_be32(gen);
364 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
365 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
366 }
367 xfs_trans_inode_alloc_buf(tp, fbuf);
368 }
369 be32_add_cpu(&agi->agi_count, newlen); 383 be32_add_cpu(&agi->agi_count, newlen);
370 be32_add_cpu(&agi->agi_freecount, newlen); 384 be32_add_cpu(&agi->agi_freecount, newlen);
371 agno = be32_to_cpu(agi->agi_seqno);
372 down_read(&args.mp->m_peraglock); 385 down_read(&args.mp->m_peraglock);
373 args.mp->m_perag[agno].pagi_freecount += newlen; 386 args.mp->m_perag[agno].pagi_freecount += newlen;
374 up_read(&args.mp->m_peraglock); 387 up_read(&args.mp->m_peraglock);
375 agi->agi_newino = cpu_to_be32(newino); 388 agi->agi_newino = cpu_to_be32(newino);
389
376 /* 390 /*
377 * Insert records describing the new inode chunk into the btree. 391 * Insert records describing the new inode chunk into the btree.
378 */ 392 */
@@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc(
380 for (thisino = newino; 394 for (thisino = newino;
381 thisino < newino + newlen; 395 thisino < newino + newlen;
382 thisino += XFS_INODES_PER_CHUNK) { 396 thisino += XFS_INODES_PER_CHUNK) {
383 if ((error = xfs_inobt_lookup_eq(cur, thisino, 397 cur->bc_rec.i.ir_startino = thisino;
384 XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { 398 cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
399 cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
400 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
401 if (error) {
385 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 402 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
386 return error; 403 return error;
387 } 404 }
388 ASSERT(i == 0); 405 ASSERT(i == 0);
389 if ((error = xfs_btree_insert(cur, &i))) { 406 error = xfs_btree_insert(cur, &i);
407 if (error) {
390 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 408 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
391 return error; 409 return error;
392 } 410 }
@@ -539,6 +557,62 @@ nextag:
539} 557}
540 558
541/* 559/*
560 * Try to retrieve the next record to the left/right from the current one.
561 */
562STATIC int
563xfs_ialloc_next_rec(
564 struct xfs_btree_cur *cur,
565 xfs_inobt_rec_incore_t *rec,
566 int *done,
567 int left)
568{
569 int error;
570 int i;
571
572 if (left)
573 error = xfs_btree_decrement(cur, 0, &i);
574 else
575 error = xfs_btree_increment(cur, 0, &i);
576
577 if (error)
578 return error;
579 *done = !i;
580 if (i) {
581 error = xfs_inobt_get_rec(cur, rec, &i);
582 if (error)
583 return error;
584 XFS_WANT_CORRUPTED_RETURN(i == 1);
585 }
586
587 return 0;
588}
589
590STATIC int
591xfs_ialloc_get_rec(
592 struct xfs_btree_cur *cur,
593 xfs_agino_t agino,
594 xfs_inobt_rec_incore_t *rec,
595 int *done,
596 int left)
597{
598 int error;
599 int i;
600
601 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
602 if (error)
603 return error;
604 *done = !i;
605 if (i) {
606 error = xfs_inobt_get_rec(cur, rec, &i);
607 if (error)
608 return error;
609 XFS_WANT_CORRUPTED_RETURN(i == 1);
610 }
611
612 return 0;
613}
614
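A condensed sketch of how these two helpers cooperate in xfs_dialloc() below: both cursors are positioned near the parent's chunk, then stepped outward until a record with free inodes turns up. The cur/tcur cursors, rec/trec records and done flags are the caller's, exactly as in the following hunk:

	/* step the duplicated cursor left, the original right */
	error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
	if (!error)
		error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);

	while (!error && (!doneleft || !doneright)) {
		/* pick the closer chunk if it has free inodes,
		 * otherwise step the chosen side outward again */
	}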
615/*
542 * Visible inode allocation functions. 616 * Visible inode allocation functions.
543 */ 617 */
544 618
@@ -592,8 +666,8 @@ xfs_dialloc(
592 int j; /* result code */ 666 int j; /* result code */
593 xfs_mount_t *mp; /* file system mount structure */ 667 xfs_mount_t *mp; /* file system mount structure */
594 int offset; /* index of inode in chunk */ 668 int offset; /* index of inode in chunk */
595 xfs_agino_t pagino; /* parent's a.g. relative inode # */ 669 xfs_agino_t pagino; /* parent's AG relative inode # */
596 xfs_agnumber_t pagno; /* parent's allocation group number */ 670 xfs_agnumber_t pagno; /* parent's AG number */
597 xfs_inobt_rec_incore_t rec; /* inode allocation record */ 671 xfs_inobt_rec_incore_t rec; /* inode allocation record */
598 xfs_agnumber_t tagno; /* testing allocation group number */ 672 xfs_agnumber_t tagno; /* testing allocation group number */
599 xfs_btree_cur_t *tcur; /* temp cursor */ 673 xfs_btree_cur_t *tcur; /* temp cursor */
@@ -716,6 +790,8 @@ nextag:
716 */ 790 */
717 agno = tagno; 791 agno = tagno;
718 *IO_agbp = NULL; 792 *IO_agbp = NULL;
793
794 restart_pagno:
719 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
720 /* 796 /*
721 * If pagino is 0 (this is the root inode allocation) use newino. 797 * If pagino is 0 (this is the root inode allocation) use newino.
@@ -723,220 +799,199 @@ nextag:
723 */ 799 */
724 if (!pagino) 800 if (!pagino)
725 pagino = be32_to_cpu(agi->agi_newino); 801 pagino = be32_to_cpu(agi->agi_newino);
726#ifdef DEBUG
727 if (cur->bc_nlevels == 1) {
728 int freecount = 0;
729 802
730 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 803 error = xfs_check_agi_freecount(cur, agi);
731 goto error0; 804 if (error)
732 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 805 goto error0;
733 do {
734 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
735 &rec.ir_freecount, &rec.ir_free, &i)))
736 goto error0;
737 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
738 freecount += rec.ir_freecount;
739 if ((error = xfs_btree_increment(cur, 0, &i)))
740 goto error0;
741 } while (i == 1);
742 806
743 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
744 XFS_FORCED_SHUTDOWN(mp));
745 }
746#endif
747 /* 807 /*
748 * If in the same a.g. as the parent, try to get near the parent. 808 * If in the same AG as the parent, try to get near the parent.
749 */ 809 */
750 if (pagno == agno) { 810 if (pagno == agno) {
751 if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) 811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */
814 int searchdistance = 10;
815
816 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
817 if (error)
818 goto error0;
819 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
820
821 error = xfs_inobt_get_rec(cur, &rec, &j);
822 if (error)
752 goto error0; 823 goto error0;
753 if (i != 0 && 824 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
754 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 825
755 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 826 if (rec.ir_freecount > 0) {
756 j == 1 &&
757 rec.ir_freecount > 0) {
758 /* 827 /*
759 * Found a free inode in the same chunk 828 * Found a free inode in the same chunk
760 * as parent, done. 829 * as the parent, done.
761 */ 830 */
831 goto alloc_inode;
762 } 832 }
833
834
835 /*
836 * In the same AG as parent, but parent's chunk is full.
837 */
838
839 /* duplicate the cursor, search left & right simultaneously */
840 error = xfs_btree_dup_cursor(cur, &tcur);
841 if (error)
842 goto error0;
843
763 /* 844 /*
764 * In the same a.g. as parent, but parent's chunk is full. 845 * Skip to last blocks looked up if same parent inode.
765 */ 846 */
766 else { 847 if (pagino != NULLAGINO &&
767 int doneleft; /* done, to the left */ 848 pag->pagl_pagino == pagino &&
768 int doneright; /* done, to the right */ 849 pag->pagl_leftrec != NULLAGINO &&
850 pag->pagl_rightrec != NULLAGINO) {
851 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
852 &trec, &doneleft, 1);
853 if (error)
854 goto error1;
769 855
856 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
857 &rec, &doneright, 0);
770 if (error) 858 if (error)
771 goto error0;
772 ASSERT(i == 1);
773 ASSERT(j == 1);
774 /*
775 * Duplicate the cursor, search left & right
776 * simultaneously.
777 */
778 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
779 goto error0;
780 /*
781 * Search left with tcur, back up 1 record.
782 */
783 if ((error = xfs_btree_decrement(tcur, 0, &i)))
784 goto error1; 859 goto error1;
785 doneleft = !i; 860 } else {
786 if (!doneleft) { 861 /* search left with tcur, back up 1 record */
787 if ((error = xfs_inobt_get_rec(tcur, 862 error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
788 &trec.ir_startino, 863 if (error)
789 &trec.ir_freecount,
790 &trec.ir_free, &i)))
791 goto error1;
792 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
793 }
794 /*
795 * Search right with cur, go forward 1 record.
796 */
797 if ((error = xfs_btree_increment(cur, 0, &i)))
798 goto error1; 864 goto error1;
799 doneright = !i;
800 if (!doneright) {
801 if ((error = xfs_inobt_get_rec(cur,
802 &rec.ir_startino,
803 &rec.ir_freecount,
804 &rec.ir_free, &i)))
805 goto error1;
806 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
807 }
808 /*
809 * Loop until we find the closest inode chunk
810 * with a free one.
811 */
812 while (!doneleft || !doneright) {
813 int useleft; /* using left inode
814 chunk this time */
815 865
866 /* search right with cur, go forward 1 record. */
867 error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
868 if (error)
869 goto error1;
870 }
871
872 /*
873 * Loop until we find an inode chunk with a free inode.
874 */
875 while (!doneleft || !doneright) {
876 int useleft; /* using left inode chunk this time */
877
878 if (!--searchdistance) {
816 /* 879 /*
817 * Figure out which block is closer, 880 * Not in range - save last search
818 * if both are valid. 881 * location and allocate a new inode
819 */
820 if (!doneleft && !doneright)
821 useleft =
822 pagino -
823 (trec.ir_startino +
824 XFS_INODES_PER_CHUNK - 1) <
825 rec.ir_startino - pagino;
826 else
827 useleft = !doneleft;
828 /*
829 * If checking the left, does it have
830 * free inodes?
831 */
832 if (useleft && trec.ir_freecount) {
833 /*
834 * Yes, set it up as the chunk to use.
835 */
836 rec = trec;
837 xfs_btree_del_cursor(cur,
838 XFS_BTREE_NOERROR);
839 cur = tcur;
840 break;
841 }
842 /*
843 * If checking the right, does it have
844 * free inodes?
845 */
846 if (!useleft && rec.ir_freecount) {
847 /*
848 * Yes, it's already set up.
849 */
850 xfs_btree_del_cursor(tcur,
851 XFS_BTREE_NOERROR);
852 break;
853 }
854 /*
855 * If used the left, get another one
856 * further left.
857 */
858 if (useleft) {
859 if ((error = xfs_btree_decrement(tcur, 0,
860 &i)))
861 goto error1;
862 doneleft = !i;
863 if (!doneleft) {
864 if ((error = xfs_inobt_get_rec(
865 tcur,
866 &trec.ir_startino,
867 &trec.ir_freecount,
868 &trec.ir_free, &i)))
869 goto error1;
870 XFS_WANT_CORRUPTED_GOTO(i == 1,
871 error1);
872 }
873 }
874 /*
875 * If used the right, get another one
876 * further right.
877 */ 882 */
878 else { 883 pag->pagl_leftrec = trec.ir_startino;
879 if ((error = xfs_btree_increment(cur, 0, 884 pag->pagl_rightrec = rec.ir_startino;
880 &i))) 885 pag->pagl_pagino = pagino;
881 goto error1; 886 goto newino;
882 doneright = !i; 887 }
883 if (!doneright) { 888
884 if ((error = xfs_inobt_get_rec( 889 /* figure out the closer block if both are valid. */
885 cur, 890 if (!doneleft && !doneright) {
886 &rec.ir_startino, 891 useleft = pagino -
887 &rec.ir_freecount, 892 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
888 &rec.ir_free, &i))) 893 rec.ir_startino - pagino;
889 goto error1; 894 } else {
890 XFS_WANT_CORRUPTED_GOTO(i == 1, 895 useleft = !doneleft;
891 error1);
892 }
893 }
894 } 896 }
895 ASSERT(!doneleft || !doneright); 897
898 /* free inodes to the left? */
899 if (useleft && trec.ir_freecount) {
900 rec = trec;
901 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
902 cur = tcur;
903
904 pag->pagl_leftrec = trec.ir_startino;
905 pag->pagl_rightrec = rec.ir_startino;
906 pag->pagl_pagino = pagino;
907 goto alloc_inode;
908 }
909
910 /* free inodes to the right? */
911 if (!useleft && rec.ir_freecount) {
912 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
913
914 pag->pagl_leftrec = trec.ir_startino;
915 pag->pagl_rightrec = rec.ir_startino;
916 pag->pagl_pagino = pagino;
917 goto alloc_inode;
918 }
919
920 /* get next record to check */
921 if (useleft) {
922 error = xfs_ialloc_next_rec(tcur, &trec,
923 &doneleft, 1);
924 } else {
925 error = xfs_ialloc_next_rec(cur, &rec,
926 &doneright, 0);
927 }
928 if (error)
929 goto error1;
896 } 930 }
931
932 /*
933 * We've reached the end of the btree. Because
934 * we only search a small chunk of the btree on
935 * each pass, there are obviously free inodes
936 * closer to the parent inode than where we are
937 * now. Restart the search.
938 */
939 pag->pagl_pagino = NULLAGINO;
940 pag->pagl_leftrec = NULLAGINO;
941 pag->pagl_rightrec = NULLAGINO;
942 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
943 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
944 goto restart_pagno;
897 } 945 }
946
898 /* 947 /*
899 * In a different a.g. from the parent. 948 * In a different AG from the parent.
900 * See if the most recently allocated block has any free. 949 * See if the most recently allocated block has any free.
901 */ 950 */
902 else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { 951newino:
903 if ((error = xfs_inobt_lookup_eq(cur, 952 if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
904 be32_to_cpu(agi->agi_newino), 0, 0, &i))) 953 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
954 XFS_LOOKUP_EQ, &i);
955 if (error)
905 goto error0; 956 goto error0;
906 if (i == 1 && 957
907 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 958 if (i == 1) {
908 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 959 error = xfs_inobt_get_rec(cur, &rec, &j);
909 j == 1 &&
910 rec.ir_freecount > 0) {
911 /*
912 * The last chunk allocated in the group still has
913 * a free inode.
914 */
915 }
916 /*
917 * None left in the last group, search the whole a.g.
918 */
919 else {
920 if (error) 960 if (error)
921 goto error0; 961 goto error0;
922 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 962
923 goto error0; 963 if (j == 1 && rec.ir_freecount > 0) {
924 ASSERT(i == 1); 964 /*
925 for (;;) { 965 * The last chunk allocated in the group
926 if ((error = xfs_inobt_get_rec(cur, 966 * still has a free inode.
927 &rec.ir_startino, 967 */
928 &rec.ir_freecount, &rec.ir_free, 968 goto alloc_inode;
929 &i)))
930 goto error0;
931 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
932 if (rec.ir_freecount > 0)
933 break;
934 if ((error = xfs_btree_increment(cur, 0, &i)))
935 goto error0;
936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
937 } 969 }
938 } 970 }
939 } 971 }
972
973 /*
974 * None left in the last group, search the whole AG
975 */
976 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
977 if (error)
978 goto error0;
979 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
980
981 for (;;) {
982 error = xfs_inobt_get_rec(cur, &rec, &i);
983 if (error)
984 goto error0;
985 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
986 if (rec.ir_freecount > 0)
987 break;
988 error = xfs_btree_increment(cur, 0, &i);
989 if (error)
990 goto error0;
991 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
992 }
993
994alloc_inode:
940 offset = xfs_ialloc_find_free(&rec.ir_free); 995 offset = xfs_ialloc_find_free(&rec.ir_free);
941 ASSERT(offset >= 0); 996 ASSERT(offset >= 0);
942 ASSERT(offset < XFS_INODES_PER_CHUNK); 997 ASSERT(offset < XFS_INODES_PER_CHUNK);
@@ -945,33 +1000,19 @@ nextag:
945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); 1000 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
946 rec.ir_free &= ~XFS_INOBT_MASK(offset); 1001 rec.ir_free &= ~XFS_INOBT_MASK(offset);
947 rec.ir_freecount--; 1002 rec.ir_freecount--;
948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, 1003 error = xfs_inobt_update(cur, &rec);
949 rec.ir_free))) 1004 if (error)
950 goto error0; 1005 goto error0;
951 be32_add_cpu(&agi->agi_freecount, -1); 1006 be32_add_cpu(&agi->agi_freecount, -1);
952 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1007 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
953 down_read(&mp->m_peraglock); 1008 down_read(&mp->m_peraglock);
954 mp->m_perag[tagno].pagi_freecount--; 1009 mp->m_perag[tagno].pagi_freecount--;
955 up_read(&mp->m_peraglock); 1010 up_read(&mp->m_peraglock);
956#ifdef DEBUG
957 if (cur->bc_nlevels == 1) {
958 int freecount = 0;
959 1011
960 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1012 error = xfs_check_agi_freecount(cur, agi);
961 goto error0; 1013 if (error)
962 do { 1014 goto error0;
963 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1015
964 &rec.ir_freecount, &rec.ir_free, &i)))
965 goto error0;
966 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
967 freecount += rec.ir_freecount;
968 if ((error = xfs_btree_increment(cur, 0, &i)))
969 goto error0;
970 } while (i == 1);
971 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
972 XFS_FORCED_SHUTDOWN(mp));
973 }
974#endif
975 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1016 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
976 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1017 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
977 *inop = ino; 1018 *inop = ino;
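The repeated #ifdef DEBUG freecount walks removed in this and the surrounding hunks collapse into xfs_check_agi_freecount(). Its body lies outside this excerpt; a plausible reconstruction, assuming it simply re-implements the removed walk with the new record API:

STATIC int
xfs_check_agi_freecount(
	struct xfs_btree_cur	*cur,
	struct xfs_agi		*agi)
{
#ifdef DEBUG
	xfs_inobt_rec_incore_t	rec;
	int			freecount = 0;
	int			error;
	int			i;

	/* walk every inobt record and sum the per-chunk free counts */
	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		return error;
	while (i) {
		error = xfs_inobt_get_rec(cur, &rec, &i);
		if (error)
			return error;
		if (i) {
			freecount += rec.ir_freecount;
			error = xfs_btree_increment(cur, 0, &i);
			if (error)
				return error;
		}
	}

	/* the AGI header must agree unless the fs is shutting down */
	ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
	       XFS_FORCED_SHUTDOWN(cur->bc_mp));
#endif
	return 0;
}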
@@ -1062,38 +1103,23 @@ xfs_difree(
1062 * Initialize the cursor. 1103 * Initialize the cursor.
1063 */ 1104 */
1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1105 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1065#ifdef DEBUG
1066 if (cur->bc_nlevels == 1) {
1067 int freecount = 0;
1068 1106
1069 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1107 error = xfs_check_agi_freecount(cur, agi);
1070 goto error0; 1108 if (error)
1071 do { 1109 goto error0;
1072 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1110
1073 &rec.ir_freecount, &rec.ir_free, &i)))
1074 goto error0;
1075 if (i) {
1076 freecount += rec.ir_freecount;
1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1078 goto error0;
1079 }
1080 } while (i == 1);
1081 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1082 XFS_FORCED_SHUTDOWN(mp));
1083 }
1084#endif
1085 /* 1111 /*
1086 * Look for the entry describing this inode. 1112 * Look for the entry describing this inode.
1087 */ 1113 */
1088 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1114 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1089 cmn_err(CE_WARN, 1115 cmn_err(CE_WARN,
1090 "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", 1116 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.",
1091 error, mp->m_fsname); 1117 error, mp->m_fsname);
1092 goto error0; 1118 goto error0;
1093 } 1119 }
1094 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1120 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1095 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, 1121 error = xfs_inobt_get_rec(cur, &rec, &i);
1096 &rec.ir_free, &i))) { 1122 if (error) {
1097 cmn_err(CE_WARN, 1123 cmn_err(CE_WARN,
1098 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1124 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
1099 error, mp->m_fsname); 1125 error, mp->m_fsname);
@@ -1148,12 +1174,14 @@ xfs_difree(
1148 } else { 1174 } else {
1149 *delete = 0; 1175 *delete = 0;
1150 1176
1151 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { 1177 error = xfs_inobt_update(cur, &rec);
1178 if (error) {
1152 cmn_err(CE_WARN, 1179 cmn_err(CE_WARN,
1153 "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", 1180 "xfs_difree: xfs_inobt_update returned an error %d on %s.",
1154 error, mp->m_fsname); 1181 error, mp->m_fsname);
1155 goto error0; 1182 goto error0;
1156 } 1183 }
1184
1157 /* 1185 /*
1158 * Change the inode free counts and log the ag/sb changes. 1186 * Change the inode free counts and log the ag/sb changes.
1159 */ 1187 */
@@ -1165,28 +1193,10 @@ xfs_difree(
1165 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1193 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1166 } 1194 }
1167 1195
1168#ifdef DEBUG 1196 error = xfs_check_agi_freecount(cur, agi);
1169 if (cur->bc_nlevels == 1) { 1197 if (error)
1170 int freecount = 0; 1198 goto error0;
1171 1199
1172 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
1173 goto error0;
1174 do {
1175 if ((error = xfs_inobt_get_rec(cur,
1176 &rec.ir_startino,
1177 &rec.ir_freecount,
1178 &rec.ir_free, &i)))
1179 goto error0;
1180 if (i) {
1181 freecount += rec.ir_freecount;
1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1183 goto error0;
1184 }
1185 } while (i == 1);
1186 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1187 XFS_FORCED_SHUTDOWN(mp));
1188 }
1189#endif
1190 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1200 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1191 return 0; 1201 return 0;
1192 1202
@@ -1297,9 +1307,7 @@ xfs_imap(
1297 chunk_agbno = agbno - offset_agbno; 1307 chunk_agbno = agbno - offset_agbno;
1298 } else { 1308 } else {
1299 xfs_btree_cur_t *cur; /* inode btree cursor */ 1309 xfs_btree_cur_t *cur; /* inode btree cursor */
1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */ 1310 xfs_inobt_rec_incore_t chunk_rec;
1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1303 xfs_buf_t *agbp; /* agi buffer */ 1311 xfs_buf_t *agbp; /* agi buffer */
1304 int i; /* temp state */ 1312 int i; /* temp state */
1305 1313
@@ -1315,15 +1323,14 @@ xfs_imap(
1315 } 1323 }
1316 1324
1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1325 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); 1326 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1319 if (error) { 1327 if (error) {
1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1321 "xfs_inobt_lookup_le() failed"); 1329 "xfs_inobt_lookup() failed");
1322 goto error0; 1330 goto error0;
1323 } 1331 }
1324 1332
1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1333 error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
1326 &chunk_free, &i);
1327 if (error) { 1334 if (error) {
1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1335 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1329 "xfs_inobt_get_rec() failed"); 1336 "xfs_inobt_get_rec() failed");
@@ -1341,7 +1348,7 @@ xfs_imap(
1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1348 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1342 if (error) 1349 if (error)
1343 return error; 1350 return error;
1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1351 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
1345 offset_agbno = agbno - chunk_agbno; 1352 offset_agbno = agbno - chunk_agbno;
1346 } 1353 }
1347 1354
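The xfs_imap() change is the same consolidation seen throughout this patch: one incore record replaces three loose out-parameters. Condensed from the hunk above:

	xfs_inobt_rec_incore_t	chunk_rec;

	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
	if (!error)
		error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
	/* ir_startino stands in for the old chunk_agino out-parameter */
	chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);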
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,23 +150,15 @@ xfs_ialloc_pagi_init(
150 xfs_agnumber_t agno); /* allocation group number */ 150 xfs_agnumber_t agno); /* allocation group number */
151 151
152/* 152/*
153 * Lookup the first record greater than or equal to ino 153 * Lookup a record by ino in the btree given by cur.
154 * in the btree given by cur.
155 */ 154 */
156int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, 155int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
157 __int32_t fcnt, xfs_inofree_t free, int *stat); 156 xfs_lookup_t dir, int *stat);
158
159/*
160 * Lookup the first record less than or equal to ino
161 * in the btree given by cur.
162 */
163int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
164 __int32_t fcnt, xfs_inofree_t free, int *stat);
165 157
166/* 158/*
167 * Get the data from the pointed-to record. 159 * Get the data from the pointed-to record.
168 */ 160 */
169extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, 161extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
170 __int32_t *fcnt, xfs_inofree_t *free, int *stat); 162 xfs_inobt_rec_incore_t *rec, int *stat);
171 163
172#endif /* __XFS_IALLOC_H__ */ 164#endif /* __XFS_IALLOC_H__ */
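For callers, the header change collapses the two direction-specific wrappers into one entry point; a before/after sketch. The old fcnt/free key arguments are gone: where a full-key lookup is needed, the fields are set through cur->bc_rec.i instead, as the xfs_ialloc_ag_alloc() hunk above shows.

	/* before */
	error = xfs_inobt_lookup_le(cur, agino, 0, 0, &stat);

	/* after: the search direction is an explicit argument */
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);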
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 5fcec6f020a7..80e526489be5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -64,6 +64,10 @@ xfs_inode_alloc(
64 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); 64 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
65 if (!ip) 65 if (!ip)
66 return NULL; 66 return NULL;
67 if (inode_init_always(mp->m_super, VFS_I(ip))) {
68 kmem_zone_free(xfs_inode_zone, ip);
69 return NULL;
70 }
67 71
68 ASSERT(atomic_read(&ip->i_iocount) == 0); 72 ASSERT(atomic_read(&ip->i_iocount) == 0);
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 73 ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -78,7 +82,6 @@ xfs_inode_alloc(
78 memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
79 ip->i_flags = 0; 83 ip->i_flags = 0;
80 ip->i_update_core = 0; 84 ip->i_update_core = 0;
81 ip->i_update_size = 0;
82 ip->i_delayed_blks = 0; 85 ip->i_delayed_blks = 0;
83 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 86 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
84 ip->i_size = 0; 87 ip->i_size = 0;
@@ -105,17 +108,6 @@ xfs_inode_alloc(
105#ifdef XFS_DIR2_TRACE 108#ifdef XFS_DIR2_TRACE
106 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); 109 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
107#endif 110#endif
108 /*
109 * Now initialise the VFS inode. We do this after the xfs_inode
110 * initialisation as internal failures will result in ->destroy_inode
111 * being called and that will pass down through the reclaim path and
112 * free the XFS inode. This path requires the XFS inode to already be
113 * initialised. Hence if this call fails, the xfs_inode has already
114 * been freed and we should not reference it at all in the error
115 * handling.
116 */
117 if (!inode_init_always(mp->m_super, VFS_I(ip)))
118 return NULL;
119 111
120 /* prevent anyone from using this yet */ 112 /* prevent anyone from using this yet */
121 VFS_I(ip)->i_state = I_NEW|I_LOCK; 113 VFS_I(ip)->i_state = I_NEW|I_LOCK;
@@ -123,6 +115,71 @@ xfs_inode_alloc(
123 return ip; 115 return ip;
124} 116}
125 117
118STATIC void
119xfs_inode_free(
120 struct xfs_inode *ip)
121{
122 switch (ip->i_d.di_mode & S_IFMT) {
123 case S_IFREG:
124 case S_IFDIR:
125 case S_IFLNK:
126 xfs_idestroy_fork(ip, XFS_DATA_FORK);
127 break;
128 }
129
130 if (ip->i_afp)
131 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
132
133#ifdef XFS_INODE_TRACE
134 ktrace_free(ip->i_trace);
135#endif
136#ifdef XFS_BMAP_TRACE
137 ktrace_free(ip->i_xtrace);
138#endif
139#ifdef XFS_BTREE_TRACE
140 ktrace_free(ip->i_btrace);
141#endif
142#ifdef XFS_RW_TRACE
143 ktrace_free(ip->i_rwtrace);
144#endif
145#ifdef XFS_ILOCK_TRACE
146 ktrace_free(ip->i_lock_trace);
147#endif
148#ifdef XFS_DIR2_TRACE
149 ktrace_free(ip->i_dir_trace);
150#endif
151
152 if (ip->i_itemp) {
153 /*
154 * Only if we are shutting down the fs will we see an
155 * inode still in the AIL. If it is there, we should remove
156 * it to prevent a use-after-free from occurring.
157 */
158 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
159 struct xfs_ail *ailp = lip->li_ailp;
160
161 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
162 XFS_FORCED_SHUTDOWN(ip->i_mount));
163 if (lip->li_flags & XFS_LI_IN_AIL) {
164 spin_lock(&ailp->xa_lock);
165 if (lip->li_flags & XFS_LI_IN_AIL)
166 xfs_trans_ail_delete(ailp, lip);
167 else
168 spin_unlock(&ailp->xa_lock);
169 }
170 xfs_inode_item_destroy(ip);
171 ip->i_itemp = NULL;
172 }
173
174 /* asserts to verify all state is correct here */
175 ASSERT(atomic_read(&ip->i_iocount) == 0);
176 ASSERT(atomic_read(&ip->i_pincount) == 0);
177 ASSERT(!spin_is_locked(&ip->i_flags_lock));
178 ASSERT(completion_done(&ip->i_flush));
179
180 kmem_zone_free(xfs_inode_zone, ip);
181}
182
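Both teardown paths now funnel through the helper above, as the later hunks in this file show; the cache-miss error path and xfs_ireclaim() each end the same way:

	/* error path at the out_destroy label below */
	__destroy_inode(VFS_I(ip));	/* undo inode_init_always() state */
	xfs_inode_free(ip);

	/* tail of xfs_ireclaim() */
	xfs_inode_free(ip);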
126/* 183/*
127 * Check the validity of the inode we just found in the cache 184 * Check the validity of the inode we just found in the cache
128 */ 185 */
@@ -133,80 +190,82 @@ xfs_iget_cache_hit(
133 int flags, 190 int flags,
134 int lock_flags) __releases(pag->pag_ici_lock) 191 int lock_flags) __releases(pag->pag_ici_lock)
135{ 192{
193 struct inode *inode = VFS_I(ip);
136 struct xfs_mount *mp = ip->i_mount; 194 struct xfs_mount *mp = ip->i_mount;
137 int error = EAGAIN; 195 int error;
196
197 spin_lock(&ip->i_flags_lock);
138 198
139 /* 199 /*
140 * If INEW is set this inode is being set up 200 * If we are racing with another cache hit that is currently
141 * If IRECLAIM is set this inode is being torn down 201 * instantiating this inode or currently recycling it out of
142 * Pause and try again. 202 * reclaimable state, wait for the initialisation to complete
203 * before continuing.
204 *
205 * XXX(hch): eventually we should do something equivalent to
206 * wait_on_inode to wait for these flags to be cleared
207 * instead of polling for it.
143 */ 208 */
144 if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { 209 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
145 XFS_STATS_INC(xs_ig_frecycle); 210 XFS_STATS_INC(xs_ig_frecycle);
211 error = EAGAIN;
146 goto out_error; 212 goto out_error;
147 } 213 }
148 214
149 /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ 215 /*
150 if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { 216 * If lookup is racing with unlink return an error immediately.
151 217 */
152 /* 218 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
153 * If lookup is racing with unlink, then we should return an 219 error = ENOENT;
154 * error immediately so we don't remove it from the reclaim 220 goto out_error;
155 * list and potentially leak the inode. 221 }
156 */
157 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
158 error = ENOENT;
159 goto out_error;
160 }
161 222
223 /*
224 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
225 * Need to carefully get it back into usable state.
226 */
227 if (ip->i_flags & XFS_IRECLAIMABLE) {
162 xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); 228 xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
163 229
164 /* 230 /*
165 * We need to re-initialise the VFS inode as it has been 231 * We need to set XFS_INEW atomically with clearing the
166 * 'freed' by the VFS. Do this here so we can deal with 232 * reclaimable tag so that we do have an indicator of the
167 * errors cleanly, then tag it so it can be set up correctly 233 * inode still being initialized.
168 * later.
169 */ 234 */
170 if (!inode_init_always(mp->m_super, VFS_I(ip))) { 235 ip->i_flags |= XFS_INEW;
171 error = ENOMEM; 236 ip->i_flags &= ~XFS_IRECLAIMABLE;
172 goto out_error; 237 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
173 }
174 238
175 /* 239 spin_unlock(&ip->i_flags_lock);
176 * We must set the XFS_INEW flag before clearing the 240 read_unlock(&pag->pag_ici_lock);
177 * XFS_IRECLAIMABLE flag so that if a racing lookup does
178 * not find the XFS_IRECLAIMABLE above but has the igrab()
179 * below succeed we can safely check XFS_INEW to detect
180 * that this inode is still being initialised.
181 */
182 xfs_iflags_set(ip, XFS_INEW);
183 xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
184 241
185 /* clear the radix tree reclaim flag as well. */ 242 error = -inode_init_always(mp->m_super, inode);
186 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 243 if (error) {
187 } else if (!igrab(VFS_I(ip))) { 244 /*
245 * Re-initializing the inode failed, and we are in deep
246 * trouble. Try to re-add it to the reclaim list.
247 */
248 read_lock(&pag->pag_ici_lock);
249 spin_lock(&ip->i_flags_lock);
250
251 ip->i_flags &= ~XFS_INEW;
252 ip->i_flags |= XFS_IRECLAIMABLE;
253 __xfs_inode_set_reclaim_tag(pag, ip);
254 goto out_error;
255 }
256 inode->i_state = I_LOCK|I_NEW;
257 } else {
188 /* If the VFS inode is being torn down, pause and try again. */ 258 /* If the VFS inode is being torn down, pause and try again. */
189 XFS_STATS_INC(xs_ig_frecycle); 259 if (!igrab(inode)) {
190 goto out_error; 260 error = EAGAIN;
191 } else if (xfs_iflags_test(ip, XFS_INEW)) { 261 goto out_error;
192 /* 262 }
193 * We are racing with another cache hit that is
194 * currently recycling this inode out of the XFS_IRECLAIMABLE
195 * state. Wait for the initialisation to complete before
196 * continuing.
197 */
198 wait_on_inode(VFS_I(ip));
199 }
200 263
201 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { 264 /* We've got a live one. */
202 error = ENOENT; 265 spin_unlock(&ip->i_flags_lock);
203 iput(VFS_I(ip)); 266 read_unlock(&pag->pag_ici_lock);
204 goto out_error;
205 } 267 }
206 268
207 /* We've got a live one. */
208 read_unlock(&pag->pag_ici_lock);
209
210 if (lock_flags != 0) 269 if (lock_flags != 0)
211 xfs_ilock(ip, lock_flags); 270 xfs_ilock(ip, lock_flags);
212 271
@@ -216,6 +275,7 @@ xfs_iget_cache_hit(
216 return 0; 275 return 0;
217 276
218out_error: 277out_error:
278 spin_unlock(&ip->i_flags_lock);
219 read_unlock(&pag->pag_ici_lock); 279 read_unlock(&pag->pag_ici_lock);
220 return error; 280 return error;
221} 281}
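What matters in the recycle path above is the ordering, condensed here from the hunk itself: XFS_INEW is set before XFS_IRECLAIMABLE is cleared, and both flag transitions plus the radix-tree tag update happen under i_flags_lock, so a racing lookup always sees either a reclaimable inode or one marked as being initialised.

	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= XFS_INEW;		/* mark "being set up" first */
	ip->i_flags &= ~XFS_IRECLAIMABLE;	/* then leave reclaimable state */
	__xfs_inode_clear_reclaim_tag(mp, pag, ip);
	spin_unlock(&ip->i_flags_lock);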
@@ -299,7 +359,8 @@ out_preload_end:
299 if (lock_flags) 359 if (lock_flags)
300 xfs_iunlock(ip, lock_flags); 360 xfs_iunlock(ip, lock_flags);
301out_destroy: 361out_destroy:
302 xfs_destroy_inode(ip); 362 __destroy_inode(VFS_I(ip));
363 xfs_inode_free(ip);
303 return error; 364 return error;
304} 365}
305 366
@@ -394,32 +455,6 @@ out_error_or_again:
394 return error; 455 return error;
395} 456}
396 457
397
398/*
399 * Look for the inode corresponding to the given ino in the hash table.
400 * If it is there and its i_transp pointer matches tp, return it.
401 * Otherwise, return NULL.
402 */
403xfs_inode_t *
404xfs_inode_incore(xfs_mount_t *mp,
405 xfs_ino_t ino,
406 xfs_trans_t *tp)
407{
408 xfs_inode_t *ip;
409 xfs_perag_t *pag;
410
411 pag = xfs_get_perag(mp, ino);
412 read_lock(&pag->pag_ici_lock);
413 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
414 read_unlock(&pag->pag_ici_lock);
415 xfs_put_perag(mp, pag);
416
417 /* the returned inode must match the transaction */
418 if (ip && (ip->i_transp != tp))
419 return NULL;
420 return ip;
421}
422
423/* 458/*
424 * Decrement reference count of an inode structure and unlock it. 459 * Decrement reference count of an inode structure and unlock it.
425 * 460 *
@@ -504,62 +539,7 @@ xfs_ireclaim(
504 xfs_qm_dqdetach(ip); 539 xfs_qm_dqdetach(ip);
505 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 540 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
506 541
507 switch (ip->i_d.di_mode & S_IFMT) { 542 xfs_inode_free(ip);
508 case S_IFREG:
509 case S_IFDIR:
510 case S_IFLNK:
511 xfs_idestroy_fork(ip, XFS_DATA_FORK);
512 break;
513 }
514
515 if (ip->i_afp)
516 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
517
518#ifdef XFS_INODE_TRACE
519 ktrace_free(ip->i_trace);
520#endif
521#ifdef XFS_BMAP_TRACE
522 ktrace_free(ip->i_xtrace);
523#endif
524#ifdef XFS_BTREE_TRACE
525 ktrace_free(ip->i_btrace);
526#endif
527#ifdef XFS_RW_TRACE
528 ktrace_free(ip->i_rwtrace);
529#endif
530#ifdef XFS_ILOCK_TRACE
531 ktrace_free(ip->i_lock_trace);
532#endif
533#ifdef XFS_DIR2_TRACE
534 ktrace_free(ip->i_dir_trace);
535#endif
536 if (ip->i_itemp) {
537 /*
538 * Only if we are shutting down the fs will we see an
539 * inode still in the AIL. If it is there, we should remove
540 * it to prevent a use-after-free from occurring.
541 */
542 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
543 struct xfs_ail *ailp = lip->li_ailp;
544
545 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
546 XFS_FORCED_SHUTDOWN(ip->i_mount));
547 if (lip->li_flags & XFS_LI_IN_AIL) {
548 spin_lock(&ailp->xa_lock);
549 if (lip->li_flags & XFS_LI_IN_AIL)
550 xfs_trans_ail_delete(ailp, lip);
551 else
552 spin_unlock(&ailp->xa_lock);
553 }
554 xfs_inode_item_destroy(ip);
555 ip->i_itemp = NULL;
556 }
557 /* asserts to verify all state is correct here */
558 ASSERT(atomic_read(&ip->i_iocount) == 0);
559 ASSERT(atomic_read(&ip->i_pincount) == 0);
560 ASSERT(!spin_is_locked(&ip->i_flags_lock));
561 ASSERT(completion_done(&ip->i_flush));
562 kmem_zone_free(xfs_inode_zone, ip);
563} 543}
564 544
565/* 545/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1f22d65fed0a..c1dc7ef5a1d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -343,6 +343,16 @@ xfs_iformat(
343 return XFS_ERROR(EFSCORRUPTED); 343 return XFS_ERROR(EFSCORRUPTED);
344 } 344 }
345 345
346 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
347 !ip->i_mount->m_rtdev_targp)) {
348 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
349 "corrupt dinode %Lu, has realtime flag set.",
350 ip->i_ino);
351 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
352 XFS_ERRLEVEL_LOW, ip->i_mount, dip);
353 return XFS_ERROR(EFSCORRUPTED);
354 }
355
346 switch (ip->i_d.di_mode & S_IFMT) { 356 switch (ip->i_d.di_mode & S_IFMT) {
347 case S_IFIFO: 357 case S_IFIFO:
348 case S_IFCHR: 358 case S_IFCHR:
@@ -641,7 +651,7 @@ xfs_iformat_btree(
641 return 0; 651 return 0;
642} 652}
643 653
644void 654STATIC void
645xfs_dinode_from_disk( 655xfs_dinode_from_disk(
646 xfs_icdinode_t *to, 656 xfs_icdinode_t *to,
647 xfs_dinode_t *from) 657 xfs_dinode_t *from)
@@ -1237,7 +1247,7 @@ xfs_isize_check(
1237 * In that case the pages will still be in memory, but the inode size 1247 * In that case the pages will still be in memory, but the inode size
1238 * will never have been updated. 1248 * will never have been updated.
1239 */ 1249 */
1240xfs_fsize_t 1250STATIC xfs_fsize_t
1241xfs_file_last_byte( 1251xfs_file_last_byte(
1242 xfs_inode_t *ip) 1252 xfs_inode_t *ip)
1243{ 1253{
@@ -3827,7 +3837,7 @@ xfs_iext_inline_to_direct(
3827/* 3837/*
3828 * Resize an extent indirection array to new_size bytes. 3838 * Resize an extent indirection array to new_size bytes.
3829 */ 3839 */
3830void 3840STATIC void
3831xfs_iext_realloc_indirect( 3841xfs_iext_realloc_indirect(
3832 xfs_ifork_t *ifp, /* inode fork pointer */ 3842 xfs_ifork_t *ifp, /* inode fork pointer */
3833 int new_size) /* new indirection array size */ 3843 int new_size) /* new indirection array size */
@@ -3852,7 +3862,7 @@ xfs_iext_realloc_indirect(
3852/* 3862/*
3853 * Switch from indirection array to linear (direct) extent allocations. 3863 * Switch from indirection array to linear (direct) extent allocations.
3854 */ 3864 */
3855void 3865STATIC void
3856xfs_iext_indirect_to_direct( 3866xfs_iext_indirect_to_direct(
3857 xfs_ifork_t *ifp) /* inode fork pointer */ 3867 xfs_ifork_t *ifp) /* inode fork pointer */
3858{ 3868{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1804f866a71d..0b38b9a869ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
261 /* Miscellaneous state. */ 261 /* Miscellaneous state. */
262 unsigned short i_flags; /* see defined flags below */ 262 unsigned short i_flags; /* see defined flags below */
263 unsigned char i_update_core; /* timestamps/size is dirty */ 263 unsigned char i_update_core; /* timestamps/size is dirty */
264 unsigned char i_update_size; /* di_size field is dirty */
265 unsigned int i_delayed_blks; /* count of delay alloc blks */ 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
266 265
267 xfs_icdinode_t i_d; /* most of ondisk inode */ 266 xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -310,23 +309,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
310} 309}
311 310
312/* 311/*
313 * Get rid of a partially initialized inode.
314 *
315 * We have to go through destroy_inode to make sure allocations
316 * from init_inode_always like the security data are undone.
317 *
318 * We mark the inode bad so that it takes the short cut in
319 * the reclaim path instead of going through the flush path
320 * which doesn't make sense for an inode that has never seen the
321 * light of day.
322 */
323static inline void xfs_destroy_inode(struct xfs_inode *ip)
324{
325 make_bad_inode(VFS_I(ip));
326 return destroy_inode(VFS_I(ip));
327}
328
329/*
330 * i_flags helper functions 312 * i_flags helper functions
331 */ 313 */
332static inline void 314static inline void
@@ -485,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
485/* 467/*
486 * xfs_iget.c prototypes. 468 * xfs_iget.c prototypes.
487 */ 469 */
488xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
489 struct xfs_trans *);
490int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 470int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
491 uint, uint, xfs_inode_t **, xfs_daddr_t); 471 uint, uint, xfs_inode_t **, xfs_daddr_t);
492void xfs_iput(xfs_inode_t *, uint); 472void xfs_iput(xfs_inode_t *, uint);
@@ -521,7 +501,6 @@ void xfs_ipin(xfs_inode_t *);
521void xfs_iunpin(xfs_inode_t *); 501void xfs_iunpin(xfs_inode_t *);
522int xfs_iflush(xfs_inode_t *, uint); 502int xfs_iflush(xfs_inode_t *, uint);
523void xfs_ichgtime(xfs_inode_t *, int); 503void xfs_ichgtime(xfs_inode_t *, int);
524xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
525void xfs_lock_inodes(xfs_inode_t **, int, uint); 504void xfs_lock_inodes(xfs_inode_t **, int, uint);
526void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 505void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
527 506
@@ -589,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
589 struct xfs_buf **, uint); 568 struct xfs_buf **, uint);
590int xfs_iread(struct xfs_mount *, struct xfs_trans *, 569int xfs_iread(struct xfs_mount *, struct xfs_trans *,
591 struct xfs_inode *, xfs_daddr_t, uint); 570 struct xfs_inode *, xfs_daddr_t, uint);
592void xfs_dinode_from_disk(struct xfs_icdinode *,
593 struct xfs_dinode *);
594void xfs_dinode_to_disk(struct xfs_dinode *, 571void xfs_dinode_to_disk(struct xfs_dinode *,
595 struct xfs_icdinode *); 572 struct xfs_icdinode *);
596void xfs_idestroy_fork(struct xfs_inode *, int); 573void xfs_idestroy_fork(struct xfs_inode *, int);
@@ -609,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
609void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); 586void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
610void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); 587void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
611void xfs_iext_realloc_direct(xfs_ifork_t *, int); 588void xfs_iext_realloc_direct(xfs_ifork_t *, int);
612void xfs_iext_realloc_indirect(xfs_ifork_t *, int);
613void xfs_iext_indirect_to_direct(xfs_ifork_t *);
614void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); 589void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
615void xfs_iext_inline_to_direct(xfs_ifork_t *, int); 590void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
616void xfs_iext_destroy(xfs_ifork_t *); 591void xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..47d5b663c37e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -263,14 +263,6 @@ xfs_inode_item_format(
263 } 263 }
264 264
265 /* 265 /*
266 * We don't have to worry about re-ordering here because
267 * the update_size field is protected by the inode lock
268 * and we have that held in exclusive mode.
269 */
270 if (ip->i_update_size)
271 ip->i_update_size = 0;
272
273 /*
274 * Make sure to get the latest atime from the Linux inode. 266 * Make sure to get the latest atime from the Linux inode.
275 */ 267 */
276 xfs_synchronize_atime(ip); 268 xfs_synchronize_atime(ip);
@@ -712,8 +704,6 @@ xfs_inode_item_unlock(
712 * Clear out the fields of the inode log item particular 704 * Clear out the fields of the inode log item particular
713 * to the current transaction. 705 * to the current transaction.
714 */ 706 */
715 iip->ili_ilock_recur = 0;
716 iip->ili_iolock_recur = 0;
717 iip->ili_flags = 0; 707 iip->ili_flags = 0;
718 708
719 /* 709 /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
137 struct xfs_inode *ili_inode; /* inode ptr */ 137 struct xfs_inode *ili_inode; /* inode ptr */
138 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ 138 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
139 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ 139 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
140 unsigned short ili_ilock_recur; /* lock recursion count */
141 unsigned short ili_iolock_recur; /* lock recursion count */
142 unsigned short ili_flags; /* misc flags */ 140 unsigned short ili_flags; /* misc flags */
143 unsigned short ili_logged; /* flushed logged data */ 141 unsigned short ili_logged; /* flushed logged data */
144 unsigned int ili_last_fields; /* fields when flushed */ 142 unsigned int ili_last_fields; /* fields when flushed */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;
72 72
73#if XFS_BIG_INUMS 73#if XFS_BIG_INUMS
74#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) 74#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
75#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32))
76#else 75#else
77#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) 76#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
78#endif 77#endif
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..b68f9107e26c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
39#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_btree.h" 40#include "xfs_btree.h"
41 41
42int 42STATIC int
43xfs_internal_inum( 43xfs_internal_inum(
44 xfs_mount_t *mp, 44 xfs_mount_t *mp,
45 xfs_ino_t ino) 45 xfs_ino_t ino)
@@ -353,9 +353,6 @@ xfs_bulkstat(
353 int end_of_ag; /* set if we've seen the ag end */ 353 int end_of_ag; /* set if we've seen the ag end */
354 int error; /* error code */ 354 int error; /* error code */
355 int fmterror;/* bulkstat formatter result */ 355 int fmterror;/* bulkstat formatter result */
356 __int32_t gcnt; /* current btree rec's count */
357 xfs_inofree_t gfree; /* current btree rec's free mask */
358 xfs_agino_t gino; /* current btree rec's start inode */
359 int i; /* loop index */ 356 int i; /* loop index */
360 int icount; /* count of inodes good in irbuf */ 357 int icount; /* count of inodes good in irbuf */
361 size_t irbsize; /* size of irec buffer in bytes */ 358 size_t irbsize; /* size of irec buffer in bytes */
@@ -442,40 +439,43 @@ xfs_bulkstat(
442 * we need to get the remainder of the chunk we're in. 439 * we need to get the remainder of the chunk we're in.
443 */ 440 */
444 if (agino > 0) { 441 if (agino > 0) {
442 xfs_inobt_rec_incore_t r;
443
445 /* 444 /*
446 * Lookup the inode chunk that this inode lives in. 445 * Lookup the inode chunk that this inode lives in.
447 */ 446 */
448 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp); 447 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
448 &tmp);
449 if (!error && /* no I/O error */ 449 if (!error && /* no I/O error */
450 tmp && /* lookup succeeded */ 450 tmp && /* lookup succeeded */
451 /* got the record, should always work */ 451 /* got the record, should always work */
452 !(error = xfs_inobt_get_rec(cur, &gino, &gcnt, 452 !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
453 &gfree, &i)) &&
454 i == 1 && 453 i == 1 &&
455 /* this is the right chunk */ 454 /* this is the right chunk */
456 agino < gino + XFS_INODES_PER_CHUNK && 455 agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
457 /* lastino was not last in chunk */ 456 /* lastino was not last in chunk */
458 (chunkidx = agino - gino + 1) < 457 (chunkidx = agino - r.ir_startino + 1) <
459 XFS_INODES_PER_CHUNK && 458 XFS_INODES_PER_CHUNK &&
460 /* there are some left allocated */ 459 /* there are some left allocated */
461 xfs_inobt_maskn(chunkidx, 460 xfs_inobt_maskn(chunkidx,
462 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { 461 XFS_INODES_PER_CHUNK - chunkidx) &
462 ~r.ir_free) {
463 /* 463 /*
464 * Grab the chunk record. Mark all the 464 * Grab the chunk record. Mark all the
465 * uninteresting inodes (because they're 465 * uninteresting inodes (because they're
466 * before our start point) free. 466 * before our start point) free.
467 */ 467 */
468 for (i = 0; i < chunkidx; i++) { 468 for (i = 0; i < chunkidx; i++) {
469 if (XFS_INOBT_MASK(i) & ~gfree) 469 if (XFS_INOBT_MASK(i) & ~r.ir_free)
470 gcnt++; 470 r.ir_freecount++;
471 } 471 }
472 gfree |= xfs_inobt_maskn(0, chunkidx); 472 r.ir_free |= xfs_inobt_maskn(0, chunkidx);
473 irbp->ir_startino = gino; 473 irbp->ir_startino = r.ir_startino;
474 irbp->ir_freecount = gcnt; 474 irbp->ir_freecount = r.ir_freecount;
475 irbp->ir_free = gfree; 475 irbp->ir_free = r.ir_free;
476 irbp++; 476 irbp++;
477 agino = gino + XFS_INODES_PER_CHUNK; 477 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
478 icount = XFS_INODES_PER_CHUNK - gcnt; 478 icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
479 } else { 479 } else {
480 /* 480 /*
481 * If any of those tests failed, bump the 481 * If any of those tests failed, bump the
@@ -493,7 +493,7 @@ xfs_bulkstat(
493 /* 493 /*
494 * Start of ag. Lookup the first inode chunk. 494 * Start of ag. Lookup the first inode chunk.
495 */ 495 */
496 error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp); 496 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
497 icount = 0; 497 icount = 0;
498 } 498 }
499 /* 499 /*
@@ -501,6 +501,8 @@ xfs_bulkstat(
501 * until we run out of inodes or space in the buffer. 501 * until we run out of inodes or space in the buffer.
502 */ 502 */
503 while (irbp < irbufend && icount < ubcount) { 503 while (irbp < irbufend && icount < ubcount) {
504 xfs_inobt_rec_incore_t r;
505
504 /* 506 /*
505 * Loop as long as we're unable to read the 507 * Loop as long as we're unable to read the
506 * inode btree. 508 * inode btree.
@@ -510,51 +512,55 @@ xfs_bulkstat(
510 if (XFS_AGINO_TO_AGBNO(mp, agino) >= 512 if (XFS_AGINO_TO_AGBNO(mp, agino) >=
511 be32_to_cpu(agi->agi_length)) 513 be32_to_cpu(agi->agi_length))
512 break; 514 break;
513 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, 515 error = xfs_inobt_lookup(cur, agino,
514 &tmp); 516 XFS_LOOKUP_GE, &tmp);
515 cond_resched(); 517 cond_resched();
516 } 518 }
517 /* 519 /*
518 * If we ran off the end of the ag either with an error, 520 * If we ran off the end of the ag either with an error,
519 * or the normal way, set end and stop collecting. 521 * or the normal way, set end and stop collecting.
520 */ 522 */
521 if (error || 523 if (error) {
522 (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
523 &gfree, &i)) ||
524 i == 0) {
525 end_of_ag = 1; 524 end_of_ag = 1;
526 break; 525 break;
527 } 526 }
527
528 error = xfs_inobt_get_rec(cur, &r, &i);
529 if (error || i == 0) {
530 end_of_ag = 1;
531 break;
532 }
533
528 /* 534 /*
529 * If this chunk has any allocated inodes, save it. 535 * If this chunk has any allocated inodes, save it.
530 * Also start read-ahead now for this chunk. 536 * Also start read-ahead now for this chunk.
531 */ 537 */
532 if (gcnt < XFS_INODES_PER_CHUNK) { 538 if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
533 /* 539 /*
534 * Loop over all clusters in the next chunk. 540 * Loop over all clusters in the next chunk.
535 * Do a readahead if there are any allocated 541 * Do a readahead if there are any allocated
536 * inodes in that cluster. 542 * inodes in that cluster.
537 */ 543 */
538 for (agbno = XFS_AGINO_TO_AGBNO(mp, gino), 544 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
539 chunkidx = 0; 545 for (chunkidx = 0;
540 chunkidx < XFS_INODES_PER_CHUNK; 546 chunkidx < XFS_INODES_PER_CHUNK;
541 chunkidx += nicluster, 547 chunkidx += nicluster,
542 agbno += nbcluster) { 548 agbno += nbcluster) {
543 if (xfs_inobt_maskn(chunkidx, 549 if (xfs_inobt_maskn(chunkidx, nicluster)
544 nicluster) & ~gfree) 550 & ~r.ir_free)
545 xfs_btree_reada_bufs(mp, agno, 551 xfs_btree_reada_bufs(mp, agno,
546 agbno, nbcluster); 552 agbno, nbcluster);
547 } 553 }
548 irbp->ir_startino = gino; 554 irbp->ir_startino = r.ir_startino;
549 irbp->ir_freecount = gcnt; 555 irbp->ir_freecount = r.ir_freecount;
550 irbp->ir_free = gfree; 556 irbp->ir_free = r.ir_free;
551 irbp++; 557 irbp++;
552 icount += XFS_INODES_PER_CHUNK - gcnt; 558 icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
553 } 559 }
554 /* 560 /*
555 * Set agino to after this chunk and bump the cursor. 561 * Set agino to after this chunk and bump the cursor.
556 */ 562 */
557 agino = gino + XFS_INODES_PER_CHUNK; 563 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
558 error = xfs_btree_increment(cur, 0, &tmp); 564 error = xfs_btree_increment(cur, 0, &tmp);
559 cond_resched(); 565 cond_resched();
560 } 566 }
@@ -820,9 +826,7 @@ xfs_inumbers(
 	int		bufidx;
 	xfs_btree_cur_t	*cur;
 	int		error;
-	__int32_t	gcnt;
-	xfs_inofree_t	gfree;
-	xfs_agino_t	gino;
+	xfs_inobt_rec_incore_t r;
 	int		i;
 	xfs_ino_t	ino;
 	int		left;
@@ -855,7 +859,8 @@ xfs_inumbers(
 			continue;
 		}
 		cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
-		error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+		error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+					 &tmp);
 		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 			cur = NULL;
@@ -870,9 +875,8 @@ xfs_inumbers(
 				continue;
 			}
 		}
-		if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
-			&i)) ||
-		    i == 0) {
+		error = xfs_inobt_get_rec(cur, &r, &i);
+		if (error || i == 0) {
 			xfs_buf_relse(agbp);
 			agbp = NULL;
 			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +885,12 @@ xfs_inumbers(
 			agino = 0;
 			continue;
 		}
-		agino = gino + XFS_INODES_PER_CHUNK - 1;
-		buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
-		buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
-		buffer[bufidx].xi_allocmask = ~gfree;
+		agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+		buffer[bufidx].xi_startino =
+			XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+		buffer[bufidx].xi_alloccount =
+			XFS_INODES_PER_CHUNK - r.ir_freecount;
+		buffer[bufidx].xi_allocmask = ~r.ir_free;
 		bufidx++;
 		left--;
 		if (bufidx == bcount) {
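
The xfs_itable.c hunks above switch xfs_bulkstat() and xfs_inumbers() from the old xfs_inobt_lookup_ge()/three-out-parameter xfs_inobt_get_rec() convention to a lookup-direction flag plus a single incore record. A minimal sketch of the new calling pattern, built only from the signatures and field names visible in this diff (illustrative, not a complete kernel function; the function and variable names here are placeholders):

STATIC int
walk_inode_chunks(
	xfs_btree_cur_t		*cur,	/* inobt cursor, already set up */
	xfs_agino_t		agino)	/* first inode to look up */
{
	xfs_inobt_rec_incore_t	r;	/* one inode-chunk record */
	int			stat;	/* 1 while a record is available */
	int			error;

	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, &stat);
	while (!error && stat) {
		error = xfs_inobt_get_rec(cur, &r, &stat);
		if (error || !stat)
			break;
		/*
		 * r.ir_startino, r.ir_freecount and r.ir_free replace the
		 * old gino/gcnt/gfree out-parameters.
		 */
		error = xfs_btree_increment(cur, 0, &stat);
	}
	return error;
}
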
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
 	void		*dibuff,
 	int		*stat);
 
-int
-xfs_internal_inum(
-	xfs_mount_t	*mp,
-	xfs_ino_t	ino);
-
 typedef int (*inumbers_fmt_pf)(
 	void			__user *ubuffer, /* buffer to write to */
 	const xfs_inogrp_t	*buffer,	/* buffer to read from */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3750f04ede0b..9dbdff3ea484 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3180,7 +3180,7 @@ try_again:
 STATIC void
 xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
 {
-	ASSERT(spin_is_locked(&log->l_icloglock));
+	assert_spin_locked(&log->l_icloglock);
 
 	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
 		xlog_state_switch_iclogs(log, iclog, 0);
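
The xfs_log.c change swaps an open-coded ASSERT(spin_is_locked(...)) for the generic assert_spin_locked(). The distinction matters on uniprocessor builds, where spinlocks compile away and spin_is_locked() can report 0 even while the lock is held, so the old assertion could fire spuriously; assert_spin_locked() is the kernel helper defined to behave correctly in both configurations. Illustrative pattern only:

	spin_lock(&log->l_icloglock);
	assert_spin_locked(&log->l_icloglock);	/* safe on SMP and !SMP */
	/* ... critical section ... */
	spin_unlock(&log->l_icloglock);
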
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log,
 extern int	xlog_recover(xlog_t *log);
 extern int	xlog_recover_finish(xlog_t *log);
 extern void	xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern void	xlog_recover_process_iunlinks(xlog_t *log);
-
 extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void	xlog_put_bp(struct xfs_buf *);
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..1099395d7d6c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
  * freeing of the inode and its removal from the list must be
  * atomic.
  */
-void
+STATIC void
 xlog_recover_process_iunlinks(
 	xlog_t		*log)
 {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..8b6c9e807efb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
  *
  * The m_sb_lock must be held when this routine is called.
  */
-int
+STATIC int
 xfs_mod_incore_sb_unlocked(
 	xfs_mount_t	*mp,
 	xfs_sb_field_t	field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {
 
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
 extern int	xfs_mountfs(xfs_mount_t *mp);
-extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
 extern void	xfs_unmountfs(xfs_mount_t *);
 extern int	xfs_unmountfs_writesb(xfs_mount_t *);
 extern int	xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int	xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
-			int64_t, int);
 extern int	xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
 			uint, int);
 extern int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -564,35 +564,6 @@ xfs_mru_cache_lookup(
 }
 
 /*
- * To look up an element using its key, but leave its location in the internal
- * lists alone, call xfs_mru_cache_peek().  If the element isn't found, this
- * function returns NULL.
- *
- * See the comments above the declaration of the xfs_mru_cache_lookup() function
- * for important locking information pertaining to this call.
- */
-void *
-xfs_mru_cache_peek(
-	xfs_mru_cache_t	*mru,
-	unsigned long	key)
-{
-	xfs_mru_cache_elem_t *elem;
-
-	ASSERT(mru && mru->lists);
-	if (!mru || !mru->lists)
-		return NULL;
-
-	spin_lock(&mru->lock);
-	elem = radix_tree_lookup(&mru->store, key);
-	if (!elem)
-		spin_unlock(&mru->lock);
-	else
-		__release(mru_lock);	/* help sparse not be stupid */
-
-	return elem ? elem->value : NULL;
-}
-
-/*
  * To release the internal data structure spinlock after having performed an
  * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
  * with the data store pointer.
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 void *	xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
 void	xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
 void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
 void	xfs_mru_cache_done(struct xfs_mru_cache *mru);
 
 #endif /* __XFS_MRU_CACHE_H__ */
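
With xfs_mru_cache_peek() and its prototype removed above, xfs_mru_cache_lookup() paired with xfs_mru_cache_done() is the surviving read path. Going by the comment retained in xfs_mru_cache.c (done() releases the spinlock that lookup() took) and by the return convention visible in the deleted peek() body, a caller looks roughly like the sketch below; 'mru' and 'key' stand in for a real data store and key:

	void	*value;

	value = xfs_mru_cache_lookup(mru, key);	/* takes the internal lock */
	if (value) {
		/* use 'value'; the data store spinlock is held here */
		xfs_mru_cache_done(mru);	/* drops the lock */
	}
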
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -88,90 +88,6 @@ xfs_write_clear_setuid(
 }
 
 /*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip)
-{
-	int		error = 0;
-
-	/*
-	 * If we're treating this as O_DSYNC and we have not updated the
-	 * size, force the log.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-	    !(ip->i_update_size)) {
-		xfs_inode_log_item_t	*iip = ip->i_itemp;
-
-		/*
-		 * If an allocation transaction occurred
-		 * without extending the size, then we have to force
-		 * the log up the proper point to ensure that the
-		 * allocation is permanent.  We can't count on
-		 * the fact that buffered writes lock out direct I/O
-		 * writes - the direct I/O write could have extended
-		 * the size nontransactionally, then finished before
-		 * we started.  xfs_write_file will think that the file
-		 * didn't grow but the update isn't safe unless the
-		 * size change is logged.
-		 *
-		 * Force the log if we've committed a transaction
-		 * against the inode or if someone else has and
-		 * the commit record hasn't gone to disk (e.g.
-		 * the inode is pinned).  This guarantees that
-		 * all changes affecting the inode are permanent
-		 * when we return.
-		 */
-		if (iip && iip->ili_last_lsn) {
-			error = _xfs_log_force(mp, iip->ili_last_lsn,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		} else if (xfs_ipincount(ip) > 0) {
-			error = _xfs_log_force(mp, (xfs_lsn_t)0,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		}
-
-	} else {
-		xfs_trans_t	*tp;
-
-		/*
-		 * O_SYNC or O_DSYNC _with_ a size update are handled
-		 * the same way.
-		 *
-		 * If the write was synchronous then we need to make
-		 * sure that the inode modification time is permanent.
-		 * We'll have updated the timestamp above, so here
-		 * we use a synchronous transaction to log the inode.
-		 * It's not fast, but it's necessary.
-		 *
-		 * If this a dsync write and the size got changed
-		 * non-transactionally, then we need to ensure that
-		 * the size change gets logged in a synchronous
-		 * transaction.
-		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-		if ((error = xfs_trans_reserve(tp, 0,
-					XFS_SWRITE_LOG_RES(mp),
-					0, 0, 0))) {
-			/* Transaction reserve failed */
-			xfs_trans_cancel(tp, 0);
-		} else {
-			/* Transaction reserve successful */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(tp, ip);
-			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-			xfs_trans_set_sync(tp);
-			error = xfs_trans_commit(tp, 0);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-	}
-
-	return error;
-}
-
-/*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
  * the shop, make sure that absolutely nothing persistent happens to
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -78,10 +77,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 			xfs_buf_t *bp, xfs_daddr_t blkno);
 
-/*
- * Prototypes for functions in xfs_vnodeops.c.
- */
-extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
-			int flags);
-
 #endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFS		14
 #define XFS_TRANS_STRAT_WRITE		15
 #define XFS_TRANS_DIOSTRAT		16
-#define XFS_TRANS_WRITE_SYNC		17
+/* 17 was XFS_TRANS_WRITE_SYNC */
 #define XFS_TRANS_WRITEID		18
 #define XFS_TRANS_ADDAFORK		19
 #define XFS_TRANS_ATTRINVAL		20
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
 			return (flags & XFS_BUF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
 
-		if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+		if (XFS_BUF_GETERROR(bp) != 0) {
 			xfs_ioerror_alert("xfs_trans_read_buf", mp,
 					  bp, blkno);
 			error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
 			return error;
 		}
 #ifdef DEBUG
-		if (xfs_do_error && (bp != NULL)) {
+		if (xfs_do_error) {
 			if (xfs_error_target == target) {
 				if (((xfs_req_num++) % xfs_error_mod) == 0) {
 					xfs_buf_relse(bp);
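
Both xfs_trans_buf.c hunks drop '(bp != NULL)' guards. From the surrounding context, the buffer-acquisition failure path has already returned EAGAIN or ENOMEM before these checks run, so bp cannot be NULL there and the guards were dead. An abridged sketch of the implied flow, with the acquisition step and the error-path cleanup assumed rather than shown in this excerpt (not the complete xfs_trans_read_buf()):

	bp = xfs_buf_read(target, blkno, len, flags);	/* assumed step */
	if (bp == NULL)
		return (flags & XFS_BUF_TRYLOCK) ?
				EAGAIN : XFS_ERROR(ENOMEM);

	if (XFS_BUF_GETERROR(bp) != 0) {	/* bp is non-NULL by now */
		xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno);
		error = XFS_BUF_GETERROR(bp);
		xfs_buf_relse(bp);	/* cleanup assumed; see full source */
		return error;
	}
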
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(
 
 
 /*
- * Get and lock the inode for the caller if it is not already
- * locked within the given transaction.  If it is already locked
- * within the transaction, just increment its lock recursion count
- * and return a pointer to it.
- *
- * For an inode to be locked in a transaction, the inode lock, as
- * opposed to the io lock, must be taken exclusively.  This ensures
- * that the inode can be involved in only 1 transaction at a time.
- * Lock recursion is handled on the io lock, but only for lock modes
- * of equal or lesser strength.  That is, you can recur on the io lock
- * held EXCL with a SHARED request but not vice versa.  Also, if
- * the inode is already a part of the transaction then you cannot
- * go from not holding the io lock to having it EXCL or SHARED.
- *
- * Use the inode cache routine xfs_inode_incore() to find the inode
- * if it is already owned by this transaction.
- *
- * If we don't already own the inode, use xfs_iget() to get it.
- * Since the inode log item structure is embedded in the incore
- * inode structure and is initialized when the inode is brought
- * into memory, there is nothing to do with it here.
- *
- * If the given transaction pointer is NULL, just call xfs_iget().
- * This simplifies code which must handle both cases.
+ * Get an inode and join it to the transaction.
  */
 int
 xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
 	xfs_inode_t	**ipp)
 {
 	int		error;
-	xfs_inode_t	*ip;
-
-	/*
-	 * If the transaction pointer is NULL, just call the normal
-	 * xfs_iget().
-	 */
-	if (tp == NULL)
-		return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
-
-	/*
-	 * If we find the inode in core with this transaction
-	 * pointer in its i_transp field, then we know we already
-	 * have it locked.  In this case we just increment the lock
-	 * recursion count and return the inode to the caller.
-	 * Assert that the inode is already locked in the mode requested
-	 * by the caller.  We cannot do lock promotions yet, so
-	 * die if someone gets this wrong.
-	 */
-	if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
-		/*
-		 * Make sure that the inode lock is held EXCL and
-		 * that the io lock is never upgraded when the inode
-		 * is already a part of the transaction.
-		 */
-		ASSERT(ip->i_itemp != NULL);
-		ASSERT(lock_flags & XFS_ILOCK_EXCL);
-		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
-
-		if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
-			ip->i_itemp->ili_iolock_recur++;
-		}
-		if (lock_flags & XFS_ILOCK_EXCL) {
-			ip->i_itemp->ili_ilock_recur++;
-		}
-		*ipp = ip;
-		return 0;
-	}
-
-	ASSERT(lock_flags & XFS_ILOCK_EXCL);
-	error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
-	if (error) {
-		return error;
-	}
-	ASSERT(ip != NULL);
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	*ipp = ip;
-	return 0;
+	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+	if (!error && tp)
+		xfs_trans_ijoin(tp, *ipp, lock_flags);
+	return error;
 }
 
 /*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
 		xfs_inode_item_init(ip, ip->i_mount);
 	iip = ip->i_itemp;
 	ASSERT(iip->ili_flags == 0);
-	ASSERT(iip->ili_ilock_recur == 0);
-	ASSERT(iip->ili_iolock_recur == 0);
 
 	/*
 	 * Get a log_item_desc to point at the new item.
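
Taken together, the xfs_trans_inode.c hunks collapse xfs_trans_iget() into a thin wrapper: xfs_iget() already copes with a NULL transaction, and the ili_ilock_recur/ili_iolock_recur lock-recursion bookkeeping is gone. For readability only, here is the new function assembled from the '+' lines above; the parameter list between the two hunks is not shown in this excerpt, so the names and types of mp/tp/ino/flags/lock_flags are inferred from the xfs_iget() call and should be treated as assumptions:

int
xfs_trans_iget(
	xfs_mount_t	*mp,		/* inferred from xfs_iget(mp, ...) */
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	uint		flags,		/* types assumed */
	uint		lock_flags,
	xfs_inode_t	**ipp)
{
	int		error;

	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
	if (!error && tp)
		xfs_trans_ijoin(tp, *ipp, lock_flags);
	return error;
}
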
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c4eca5ed5dab..a434f287962d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -538,7 +538,9 @@ xfs_readlink_bmap(
 		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 
-		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+		bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt),
+					XBF_LOCK | XBF_MAPPED |
+					XBF_DONT_BLOCK);
 		error = XFS_BUF_GETERROR(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_readlink",
@@ -609,7 +611,7 @@ xfs_fsync(
 	xfs_inode_t	*ip)
 {
 	xfs_trans_t	*tp;
-	int		error;
+	int		error = 0;
 	int		log_flushed = 0, changed = 1;
 
 	xfs_itrace_entry(ip);
@@ -617,14 +619,9 @@ xfs_fsync(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
 
-	/* capture size updates in I/O completion before writing the inode. */
-	error = xfs_wait_on_pages(ip, 0, -1);
-	if (error)
-		return XFS_ERROR(error);
-
 	/*
 	 * We always need to make sure that the required inode state is safe on
-	 * disk.  The vnode might be clean but we still might need to force the
+	 * disk.  The inode might be clean but we still might need to force the
 	 * log because of committed transactions that haven't hit the disk yet.
 	 * Likewise, there could be unflushed non-transactional changes to the
 	 * inode core that have to go to disk and this requires us to issue
@@ -636,7 +633,7 @@ xfs_fsync(
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	if (!(ip->i_update_size || ip->i_update_core)) {
+	if (!ip->i_update_core) {
 		/*
 		 * Timestamps/size haven't changed since last inode flush or
 		 * inode transaction commit.  That means either nothing got
@@ -716,7 +713,7 @@ xfs_fsync(
  * when the link count isn't zero and by xfs_dm_punch_hole() when
  * punching a hole to EOF.
  */
-int
+STATIC int
 xfs_free_eofblocks(
 	xfs_mount_t	*mp,
 	xfs_inode_t	*ip,
@@ -1474,8 +1471,8 @@ xfs_create(
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
 		xfs_flush_inodes(dp);
-		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+		error = xfs_trans_reserve(tp, resblks, log_res, 0,
+			XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error == ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */