Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 33
-rw-r--r--  fs/9p/v9fs_vfs.h | 6
-rw-r--r--  fs/9p/vfs_dir.c | 14
-rw-r--r--  fs/9p/vfs_file.c | 36
-rw-r--r--  fs/9p/vfs_inode.c | 141
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 86
-rw-r--r--  fs/9p/vfs_super.c | 2
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/attr.c | 5
-rw-r--r--  fs/autofs4/autofs_i.h | 26
-rw-r--r--  fs/autofs4/waitq.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 23
-rw-r--r--  fs/block_dev.c | 7
-rw-r--r--  fs/btrfs/btrfs_inode.h | 6
-rw-r--r--  fs/btrfs/ctree.h | 10
-rw-r--r--  fs/btrfs/extent-tree.c | 77
-rw-r--r--  fs/btrfs/file-item.c | 4
-rw-r--r--  fs/btrfs/file.c | 75
-rw-r--r--  fs/btrfs/free-space-cache.c | 20
-rw-r--r--  fs/btrfs/inode.c | 52
-rw-r--r--  fs/btrfs/ioctl.c | 47
-rw-r--r--  fs/btrfs/transaction.c | 4
-rw-r--r--  fs/btrfs/tree-log.c | 28
-rw-r--r--  fs/btrfs/volumes.c | 51
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/btrfs/xattr.c | 59
-rw-r--r--  fs/buffer.c | 4
-rw-r--r--  fs/ceph/mds_client.c | 2
-rw-r--r--  fs/ceph/super.c | 4
-rw-r--r--  fs/cifs/README | 14
-rw-r--r--  fs/cifs/cifs_debug.c | 11
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 4
-rw-r--r--  fs/cifs/cifsacl.c | 373
-rw-r--r--  fs/cifs/cifsencrypt.c | 159
-rw-r--r--  fs/cifs/cifsfs.c | 36
-rw-r--r--  fs/cifs/cifsfs.h | 6
-rw-r--r--  fs/cifs/cifsglob.h | 111
-rw-r--r--  fs/cifs/cifspdu.h | 48
-rw-r--r--  fs/cifs/cifsproto.h | 48
-rw-r--r--  fs/cifs/cifssmb.c | 457
-rw-r--r--  fs/cifs/connect.c | 706
-rw-r--r--  fs/cifs/dir.c | 26
-rw-r--r--  fs/cifs/export.c | 4
-rw-r--r--  fs/cifs/file.c | 1126
-rw-r--r--  fs/cifs/inode.c | 54
-rw-r--r--  fs/cifs/link.c | 17
-rw-r--r--  fs/cifs/misc.c | 66
-rw-r--r--  fs/cifs/sess.c | 4
-rw-r--r--  fs/cifs/smbencrypt.c | 121
-rw-r--r--  fs/cifs/transport.c | 70
-rw-r--r--  fs/cifs/xattr.c | 42
-rw-r--r--  fs/coda/coda_linux.h | 5
-rw-r--r--  fs/compat.c | 13
-rw-r--r--  fs/compat_ioctl.c | 1
-rw-r--r--  fs/configfs/inode.c | 3
-rw-r--r--  fs/configfs/item.c | 2
-rw-r--r--  fs/debugfs/inode.c | 2
-rw-r--r--  fs/direct-io.c | 646
-rw-r--r--  fs/ecryptfs/Kconfig | 2
-rw-r--r--  fs/ecryptfs/keystore.c | 2
-rw-r--r--  fs/ecryptfs/main.c | 23
-rw-r--r--  fs/ecryptfs/read_write.c | 18
-rw-r--r--  fs/eventpoll.c | 2
-rw-r--r--  fs/exec.c | 17
-rw-r--r--  fs/exofs/Kbuild | 3
-rw-r--r--  fs/exofs/Kconfig | 9
-rw-r--r--  fs/exofs/exofs.h | 26
-rw-r--r--  fs/exofs/inode.c | 233
-rw-r--r--  fs/exofs/ore.c | 656
-rw-r--r--  fs/exofs/ore_raid.c | 660
-rw-r--r--  fs/exofs/ore_raid.h | 79
-rw-r--r--  fs/exofs/super.c | 205
-rw-r--r--  fs/ext2/xattr_security.c | 34
-rw-r--r--  fs/ext3/inode.c | 4
-rw-r--r--  fs/ext3/namei.c | 9
-rw-r--r--  fs/ext3/xattr_security.c | 36
-rw-r--r--  fs/ext4/ext4.h | 1
-rw-r--r--  fs/ext4/ext4_jbd2.h | 4
-rw-r--r--  fs/ext4/file.c | 47
-rw-r--r--  fs/ext4/indirect.c | 9
-rw-r--r--  fs/ext4/inode.c | 27
-rw-r--r--  fs/ext4/namei.c | 9
-rw-r--r--  fs/ext4/page-io.c | 24
-rw-r--r--  fs/ext4/super.c | 1
-rw-r--r--  fs/ext4/xattr_security.c | 36
-rw-r--r--  fs/fat/dir.c | 2
-rw-r--r--  fs/fat/inode.c | 7
-rw-r--r--  fs/fuse/dev.c | 16
-rw-r--r--  fs/fuse/file.c | 84
-rw-r--r--  fs/fuse/fuse_i.h | 8
-rw-r--r--  fs/fuse/inode.c | 13
-rw-r--r--  fs/gfs2/acl.c | 5
-rw-r--r--  fs/gfs2/aops.c | 8
-rw-r--r--  fs/gfs2/bmap.c | 199
-rw-r--r--  fs/gfs2/dir.c | 50
-rw-r--r--  fs/gfs2/file.c | 299
-rw-r--r--  fs/gfs2/glops.c | 89
-rw-r--r--  fs/gfs2/glops.h | 2
-rw-r--r--  fs/gfs2/incore.h | 23
-rw-r--r--  fs/gfs2/inode.c | 150
-rw-r--r--  fs/gfs2/inode.h | 2
-rw-r--r--  fs/gfs2/log.c | 4
-rw-r--r--  fs/gfs2/lops.c | 66
-rw-r--r--  fs/gfs2/meta_io.c | 6
-rw-r--r--  fs/gfs2/ops_fstype.c | 8
-rw-r--r--  fs/gfs2/quota.c | 30
-rw-r--r--  fs/gfs2/rgrp.c | 573
-rw-r--r--  fs/gfs2/rgrp.h | 31
-rw-r--r--  fs/gfs2/super.c | 134
-rw-r--r--  fs/gfs2/trans.c | 5
-rw-r--r--  fs/gfs2/trans.h | 22
-rw-r--r--  fs/gfs2/xattr.c | 28
-rw-r--r--  fs/hfsplus/super.c | 15
-rw-r--r--  fs/hfsplus/wrapper.c | 4
-rw-r--r--  fs/hugetlbfs/inode.c | 1
-rw-r--r--  fs/inode.c | 26
-rw-r--r--  fs/jffs2/security.c | 35
-rw-r--r--  fs/jfs/jfs_umount.c | 4
-rw-r--r--  fs/jfs/xattr.c | 57
-rw-r--r--  fs/lockd/host.c | 25
-rw-r--r--  fs/lockd/svc.c | 2
-rw-r--r--  fs/locks.c | 225
-rw-r--r--  fs/namei.c | 58
-rw-r--r--  fs/namespace.c | 3
-rw-r--r--  fs/nfs/Kconfig | 16
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 59
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 4
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 35
-rw-r--r--  fs/nfs/callback.c | 4
-rw-r--r--  fs/nfs/callback.h | 2
-rw-r--r--  fs/nfs/callback_proc.c | 25
-rw-r--r--  fs/nfs/callback_xdr.c | 24
-rw-r--r--  fs/nfs/client.c | 11
-rw-r--r--  fs/nfs/delegation.c | 2
-rw-r--r--  fs/nfs/file.c | 10
-rw-r--r--  fs/nfs/fscache-index.c | 4
-rw-r--r--  fs/nfs/idmap.c | 25
-rw-r--r--  fs/nfs/inode.c | 16
-rw-r--r--  fs/nfs/internal.h | 10
-rw-r--r--  fs/nfs/nfs4_fs.h | 32
-rw-r--r--  fs/nfs/nfs4filelayout.c | 33
-rw-r--r--  fs/nfs/nfs4proc.c | 113
-rw-r--r--  fs/nfs/nfs4renewd.c | 12
-rw-r--r--  fs/nfs/nfs4state.c | 6
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 28
-rw-r--r--  fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 3
-rw-r--r--  fs/nfs/pnfs.c | 52
-rw-r--r--  fs/nfs/pnfs.h | 5
-rw-r--r--  fs/nfs/read.c | 40
-rw-r--r--  fs/nfs/super.c | 42
-rw-r--r--  fs/nfs/unlink.c | 4
-rw-r--r--  fs/nfs/write.c | 75
-rw-r--r--  fs/nfsd/export.c | 16
-rw-r--r--  fs/nfsd/nfs4callback.c | 20
-rw-r--r--  fs/nfsd/nfs4proc.c | 374
-rw-r--r--  fs/nfsd/nfs4recover.c | 53
-rw-r--r--  fs/nfsd/nfs4state.c | 1794
-rw-r--r--  fs/nfsd/nfs4xdr.c | 380
-rw-r--r--  fs/nfsd/nfsctl.c | 1
-rw-r--r--  fs/nfsd/nfsd.h | 33
-rw-r--r--  fs/nfsd/nfsfh.c | 39
-rw-r--r--  fs/nfsd/state.h | 174
-rw-r--r--  fs/nfsd/vfs.c | 31
-rw-r--r--  fs/nfsd/vfs.h | 29
-rw-r--r--  fs/nfsd/xdr4.h | 28
-rw-r--r--  fs/ocfs2/xattr.c | 38
-rw-r--r--  fs/open.c | 4
-rw-r--r--  fs/posix_acl.c | 2
-rw-r--r--  fs/proc/stat.c | 41
-rw-r--r--  fs/proc/task_mmu.c | 80
-rw-r--r--  fs/quota/quota.c | 2
-rw-r--r--  fs/read_write.c | 74
-rw-r--r--  fs/reiserfs/journal.c | 9
-rw-r--r--  fs/reiserfs/resize.c | 4
-rw-r--r--  fs/reiserfs/xattr_security.c | 4
-rw-r--r--  fs/squashfs/Kconfig | 6
-rw-r--r--  fs/stat.c | 2
-rw-r--r--  fs/sysfs/dir.c | 182
-rw-r--r--  fs/sysfs/file.c | 56
-rw-r--r--  fs/sysfs/inode.c | 16
-rw-r--r--  fs/sysfs/sysfs.h | 17
-rw-r--r--  fs/ubifs/debug.h | 6
-rw-r--r--  fs/xattr.c | 63
-rw-r--r--  fs/xfs/Makefile | 119
-rw-r--r--  fs/xfs/kmem.c (renamed from fs/xfs/linux-2.6/kmem.c) | 0
-rw-r--r--  fs/xfs/kmem.h (renamed from fs/xfs/linux-2.6/kmem.h) | 7
-rw-r--r--  fs/xfs/mrlock.h (renamed from fs/xfs/linux-2.6/mrlock.h) | 0
-rw-r--r--  fs/xfs/time.h (renamed from fs/xfs/linux-2.6/time.h) | 0
-rw-r--r--  fs/xfs/uuid.c (renamed from fs/xfs/support/uuid.c) | 0
-rw-r--r--  fs/xfs/uuid.h (renamed from fs/xfs/support/uuid.h) | 0
-rw-r--r--  fs/xfs/xfs.h | 3
-rw-r--r--  fs/xfs/xfs_acl.c (renamed from fs/xfs/linux-2.6/xfs_acl.c) | 0
-rw-r--r--  fs/xfs/xfs_ag.h | 6
-rw-r--r--  fs/xfs/xfs_alloc.c | 11
-rw-r--r--  fs/xfs/xfs_aops.c (renamed from fs/xfs/linux-2.6/xfs_aops.c) | 118
-rw-r--r--  fs/xfs/xfs_aops.h (renamed from fs/xfs/linux-2.6/xfs_aops.h) | 4
-rw-r--r--  fs/xfs/xfs_attr.c | 90
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 7
-rw-r--r--  fs/xfs/xfs_bmap.c | 2534
-rw-r--r--  fs/xfs/xfs_bmap.h | 318
-rw-r--r--  fs/xfs/xfs_btree.c | 26
-rw-r--r--  fs/xfs/xfs_btree.h | 2
-rw-r--r--  fs/xfs/xfs_buf.c (renamed from fs/xfs/linux-2.6/xfs_buf.c) | 257
-rw-r--r--  fs/xfs/xfs_buf.h (renamed from fs/xfs/linux-2.6/xfs_buf.h) | 69
-rw-r--r--  fs/xfs/xfs_buf_item.c | 37
-rw-r--r--  fs/xfs/xfs_da_btree.c | 66
-rw-r--r--  fs/xfs/xfs_dfrag.c | 6
-rw-r--r--  fs/xfs/xfs_dinode.h | 2
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 6
-rw-r--r--  fs/xfs/xfs_discard.c (renamed from fs/xfs/linux-2.6/xfs_discard.c) | 20
-rw-r--r--  fs/xfs/xfs_discard.h (renamed from fs/xfs/linux-2.6/xfs_discard.h) | 0
-rw-r--r--  fs/xfs/xfs_dquot.c (renamed from fs/xfs/quota/xfs_dquot.c) | 46
-rw-r--r--  fs/xfs/xfs_dquot.h (renamed from fs/xfs/quota/xfs_dquot.h) | 0
-rw-r--r--  fs/xfs/xfs_dquot_item.c (renamed from fs/xfs/quota/xfs_dquot_item.c) | 10
-rw-r--r--  fs/xfs/xfs_dquot_item.h (renamed from fs/xfs/quota/xfs_dquot_item.h) | 0
-rw-r--r--  fs/xfs/xfs_export.c (renamed from fs/xfs/linux-2.6/xfs_export.c) | 12
-rw-r--r--  fs/xfs/xfs_export.h (renamed from fs/xfs/linux-2.6/xfs_export.h) | 0
-rw-r--r--  fs/xfs/xfs_file.c (renamed from fs/xfs/linux-2.6/xfs_file.c) | 168
-rw-r--r--  fs/xfs/xfs_filestream.c | 4
-rw-r--r--  fs/xfs/xfs_fs_subr.c (renamed from fs/xfs/linux-2.6/xfs_fs_subr.c) | 0
-rw-r--r--  fs/xfs/xfs_fsops.c | 60
-rw-r--r--  fs/xfs/xfs_globals.c (renamed from fs/xfs/linux-2.6/xfs_globals.c) | 0
-rw-r--r--  fs/xfs/xfs_ialloc.c | 18
-rw-r--r--  fs/xfs/xfs_iget.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 47
-rw-r--r--  fs/xfs/xfs_inode.h | 1
-rw-r--r--  fs/xfs/xfs_inode_item.c | 14
-rw-r--r--  fs/xfs/xfs_ioctl.c (renamed from fs/xfs/linux-2.6/xfs_ioctl.c) | 2
-rw-r--r--  fs/xfs/xfs_ioctl.h (renamed from fs/xfs/linux-2.6/xfs_ioctl.h) | 0
-rw-r--r--  fs/xfs/xfs_ioctl32.c (renamed from fs/xfs/linux-2.6/xfs_ioctl32.c) | 0
-rw-r--r--  fs/xfs/xfs_ioctl32.h (renamed from fs/xfs/linux-2.6/xfs_ioctl32.h) | 0
-rw-r--r--  fs/xfs/xfs_iomap.c | 39
-rw-r--r--  fs/xfs/xfs_iops.c (renamed from fs/xfs/linux-2.6/xfs_iops.c) | 67
-rw-r--r--  fs/xfs/xfs_iops.h (renamed from fs/xfs/linux-2.6/xfs_iops.h) | 0
-rw-r--r--  fs/xfs/xfs_linux.h (renamed from fs/xfs/linux-2.6/xfs_linux.h) | 29
-rw-r--r--  fs/xfs/xfs_log.c | 34
-rw-r--r--  fs/xfs/xfs_log_recover.c | 81
-rw-r--r--  fs/xfs/xfs_message.c (renamed from fs/xfs/linux-2.6/xfs_message.c) | 0
-rw-r--r--  fs/xfs/xfs_message.h (renamed from fs/xfs/linux-2.6/xfs_message.h) | 0
-rw-r--r--  fs/xfs/xfs_mount.c | 40
-rw-r--r--  fs/xfs/xfs_qm.c (renamed from fs/xfs/quota/xfs_qm.c) | 14
-rw-r--r--  fs/xfs/xfs_qm.h (renamed from fs/xfs/quota/xfs_qm.h) | 0
-rw-r--r--  fs/xfs/xfs_qm_bhv.c (renamed from fs/xfs/quota/xfs_qm_bhv.c) | 0
-rw-r--r--  fs/xfs/xfs_qm_stats.c (renamed from fs/xfs/quota/xfs_qm_stats.c) | 0
-rw-r--r--  fs/xfs/xfs_qm_stats.h (renamed from fs/xfs/quota/xfs_qm_stats.h) | 0
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c (renamed from fs/xfs/quota/xfs_qm_syscalls.c) | 2
-rw-r--r--  fs/xfs/xfs_quota_priv.h (renamed from fs/xfs/quota/xfs_quota_priv.h) | 0
-rw-r--r--  fs/xfs/xfs_quotaops.c (renamed from fs/xfs/linux-2.6/xfs_quotaops.c) | 2
-rw-r--r--  fs/xfs/xfs_rename.c | 8
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 80
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 2
-rw-r--r--  fs/xfs/xfs_rw.c | 27
-rw-r--r--  fs/xfs/xfs_rw.h | 2
-rw-r--r--  fs/xfs/xfs_sb.h | 2
-rw-r--r--  fs/xfs/xfs_stats.c (renamed from fs/xfs/linux-2.6/xfs_stats.c) | 0
-rw-r--r--  fs/xfs/xfs_stats.h (renamed from fs/xfs/linux-2.6/xfs_stats.h) | 0
-rw-r--r--  fs/xfs/xfs_super.c (renamed from fs/xfs/linux-2.6/xfs_super.c) | 60
-rw-r--r--  fs/xfs/xfs_super.h (renamed from fs/xfs/linux-2.6/xfs_super.h) | 0
-rw-r--r--  fs/xfs/xfs_sync.c (renamed from fs/xfs/linux-2.6/xfs_sync.c) | 18
-rw-r--r--  fs/xfs/xfs_sync.h (renamed from fs/xfs/linux-2.6/xfs_sync.h) | 0
-rw-r--r--  fs/xfs/xfs_sysctl.c (renamed from fs/xfs/linux-2.6/xfs_sysctl.c) | 0
-rw-r--r--  fs/xfs/xfs_sysctl.h (renamed from fs/xfs/linux-2.6/xfs_sysctl.h) | 0
-rw-r--r--  fs/xfs/xfs_trace.c (renamed from fs/xfs/linux-2.6/xfs_trace.c) | 4
-rw-r--r--  fs/xfs/xfs_trace.h (renamed from fs/xfs/linux-2.6/xfs_trace.h) | 39
-rw-r--r--  fs/xfs/xfs_trans.c | 13
-rw-r--r--  fs/xfs/xfs_trans.h | 10
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 191
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 50
-rw-r--r--  fs/xfs/xfs_trans_dquot.c (renamed from fs/xfs/quota/xfs_trans_dquot.c) | 0
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 25
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 9
-rw-r--r--  fs/xfs/xfs_vnode.h (renamed from fs/xfs/linux-2.6/xfs_vnode.h) | 0
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 119
-rw-r--r--  fs/xfs/xfs_xattr.c (renamed from fs/xfs/linux-2.6/xfs_xattr.c) | 0
274 files changed, 10699 insertions, 8128 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index ef9661886112..2b78014a124a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	options = tmp_options;
 
 	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
+		int token, r;
 		if (!*p)
 			continue;
 		token = match_token(p, tokens, args);
-		if (token < Opt_uname) {
-			int r = match_int(&args[0], &option);
+		switch (token) {
+		case Opt_debug:
+			r = match_int(&args[0], &option);
 			if (r < 0) {
 				P9_DPRINTK(P9_DEBUG_ERROR,
 					   "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
-		}
-		switch (token) {
-		case Opt_debug:
 			v9ses->debug = option;
 #ifdef CONFIG_NET_9P_DEBUG
 			p9_debug_level = option;
@@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			break;
 
 		case Opt_dfltuid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
 			v9ses->dfltuid = option;
 			break;
 		case Opt_dfltgid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
 			v9ses->dfltgid = option;
 			break;
		case Opt_afid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
 			v9ses->afid = option;
 			break;
 		case Opt_uname:
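The conversion above works because match_int() is only meaningful for tokens
whose match_table_t pattern captured an argument; calling it for string-valued
options such as "uname" parses a substring that was never set up. A minimal
userspace sketch of the same per-option pattern (illustrative only; the option
names and sscanf-based parsing are stand-ins, not the kernel's
match_token()/match_int() API):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char opts[] = "debug=4,uname=glenda,afid=7";
		char *s = opts, *p;

		while ((p = strsep(&s, ",")) != NULL) {
			int val;

			/* only options known to carry an integer try to parse one */
			if (sscanf(p, "debug=%d", &val) == 1)
				printf("debug level %d\n", val);
			else if (sscanf(p, "afid=%d", &val) == 1)
				printf("afid %d\n", val);
			else if (strncmp(p, "uname=", 6) == 0)
				printf("user %s\n", p + 6);
			else
				fprintf(stderr, "unknown option '%s'\n", p);
		}
		return 0;
	}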
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 46ce357ca1ab..410ffd6ceb5f 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache;
 
 struct inode *v9fs_alloc_inode(struct super_block *sb);
 void v9fs_destroy_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t);
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, int mode);
+		    struct inode *inode, int mode, dev_t);
 void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -83,4 +83,6 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
 	v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
 	return;
 }
+
+int v9fs_open_to_dotl_flags(int flags);
 #endif
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 9c2bdda5cd9d..598fff1a54e5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		while (rdir->head < rdir->tail) {
 			p9stat_init(&st);
-			err = p9stat_read(rdir->buf + rdir->head,
-					  rdir->tail - rdir->head, &st,
-					  fid->clnt->proto_version);
+			err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
+					  rdir->tail - rdir->head, &st);
 			if (err) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 				err = -EIO;
@@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = p9_client_readdir(fid, rdir->buf, buflen,
-						filp->f_pos);
+					       filp->f_pos);
 			if (err <= 0)
 				goto unlock_and_exit;
 
@@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 
 		while (rdir->head < rdir->tail) {
 
-			err = p9dirent_read(rdir->buf + rdir->head,
+			err = p9dirent_read(fid->clnt, rdir->buf + rdir->head,
 					    rdir->tail - rdir->head,
-					    &curdirent,
-					    fid->clnt->proto_version);
+					    &curdirent);
 			if (err < 0) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 				err = -EIO;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3c173fcc2c5a..62857a810a79 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -65,7 +65,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	v9inode = V9FS_I(inode);
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9fs_proto_dotl(v9ses))
-		omode = file->f_flags;
+		omode = v9fs_open_to_dotl_flags(file->f_flags);
 	else
 		omode = v9fs_uflags2omode(file->f_flags,
 					v9fs_proto_dotu(v9ses));
@@ -169,7 +169,18 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 
 	/* convert posix lock to p9 tlock args */
 	memset(&flock, 0, sizeof(flock));
-	flock.type = fl->fl_type;
+	/* map the lock type */
+	switch (fl->fl_type) {
+	case F_RDLCK:
+		flock.type = P9_LOCK_TYPE_RDLCK;
+		break;
+	case F_WRLCK:
+		flock.type = P9_LOCK_TYPE_WRLCK;
+		break;
+	case F_UNLCK:
+		flock.type = P9_LOCK_TYPE_UNLCK;
+		break;
+	}
 	flock.start = fl->fl_start;
 	if (fl->fl_end == OFFSET_MAX)
 		flock.length = 0;
@@ -245,7 +256,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 
 	/* convert posix lock to p9 tgetlock args */
 	memset(&glock, 0, sizeof(glock));
-	glock.type = fl->fl_type;
+	glock.type = P9_LOCK_TYPE_UNLCK;
 	glock.start = fl->fl_start;
 	if (fl->fl_end == OFFSET_MAX)
 		glock.length = 0;
@@ -257,17 +268,26 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 	res = p9_client_getlock_dotl(fid, &glock);
 	if (res < 0)
 		return res;
-	if (glock.type != F_UNLCK) {
-		fl->fl_type = glock.type;
+	/* map 9p lock type to os lock type */
+	switch (glock.type) {
+	case P9_LOCK_TYPE_RDLCK:
+		fl->fl_type = F_RDLCK;
+		break;
+	case P9_LOCK_TYPE_WRLCK:
+		fl->fl_type = F_WRLCK;
+		break;
+	case P9_LOCK_TYPE_UNLCK:
+		fl->fl_type = F_UNLCK;
+		break;
+	}
+	if (glock.type != P9_LOCK_TYPE_UNLCK) {
 		fl->fl_start = glock.start;
 		if (glock.length == 0)
 			fl->fl_end = OFFSET_MAX;
 		else
 			fl->fl_end = glock.start + glock.length - 1;
 		fl->fl_pid = glock.proc_id;
-	} else
-		fl->fl_type = F_UNLCK;
-
+	}
 	return res;
 }
 
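The hunks above stop copying Linux's numeric F_RDLCK/F_WRLCK/F_UNLCK values
straight into the wire structures and translate them explicitly in both
directions, since the OS constants are not guaranteed to match the
protocol-defined P9_LOCK_TYPE_* encoding. A standalone sketch of the idea
(the enum values here are assumptions for illustration, not taken from the
9P headers):

	#include <fcntl.h>
	#include <stdio.h>

	enum p9_lock_type { P9_RDLCK = 0, P9_WRLCK = 1, P9_UNLCK = 2 };

	/* translate a POSIX lock type to the on-the-wire value */
	static int posix_to_p9(short fl_type)
	{
		switch (fl_type) {
		case F_RDLCK: return P9_RDLCK;
		case F_WRLCK: return P9_WRLCK;
		case F_UNLCK: return P9_UNLCK;
		default:      return -1;	/* reject unknown lock types */
		}
	}

	int main(void)
	{
		printf("F_WRLCK maps to %d\n", posix_to_p9(F_WRLCK));
		return 0;
	}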
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8bb5507e822f..b5a1076aaa6c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -95,15 +95,18 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
 /**
  * p9mode2unixmode- convert plan9 mode bits to unix mode bits
  * @v9ses: v9fs session information
- * @mode: mode to convert
+ * @stat: p9_wstat from which mode need to be derived
+ * @rdev: major number, minor number in case of device files.
  *
  */
-
-static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
+static int p9mode2unixmode(struct v9fs_session_info *v9ses,
+			   struct p9_wstat *stat, dev_t *rdev)
 {
 	int res;
+	int mode = stat->mode;
 
-	res = mode & 0777;
+	res = mode & S_IALLUGO;
+	*rdev = 0;
 
 	if ((mode & P9_DMDIR) == P9_DMDIR)
 		res |= S_IFDIR;
@@ -116,9 +119,26 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 		 && (v9ses->nodev == 0))
 		res |= S_IFIFO;
 	else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
-		 && (v9ses->nodev == 0))
-		res |= S_IFBLK;
-	else
+		 && (v9ses->nodev == 0)) {
+		char type = 0, ext[32];
+		int major = -1, minor = -1;
+
+		strncpy(ext, stat->extension, sizeof(ext));
+		sscanf(ext, "%c %u %u", &type, &major, &minor);
+		switch (type) {
+		case 'c':
+			res |= S_IFCHR;
+			break;
+		case 'b':
+			res |= S_IFBLK;
+			break;
+		default:
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "Unknown special type %c %s\n", type,
+				   stat->extension);
+		};
+		*rdev = MKDEV(major, minor);
+	} else
 		res |= S_IFREG;
 
 	if (v9fs_proto_dotu(v9ses)) {
@@ -131,7 +151,6 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 		if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
 			res |= S_ISVTX;
 	}
-
 	return res;
 }
 
@@ -242,13 +261,13 @@ void v9fs_destroy_inode(struct inode *inode)
 }
 
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, int mode)
+		    struct inode *inode, int mode, dev_t rdev)
 {
 	int err = 0;
 
 	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
-	inode->i_rdev = 0;
+	inode->i_rdev = rdev;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_mapping->a_ops = &v9fs_addr_operations;
 
@@ -259,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 	case S_IFSOCK:
 		if (v9fs_proto_dotl(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations_dotl;
-			inode->i_fop = &v9fs_file_operations_dotl;
 		} else if (v9fs_proto_dotu(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations;
-			inode->i_fop = &v9fs_file_operations;
 		} else {
 			P9_DPRINTK(P9_DEBUG_ERROR,
 				   "special files without extended mode\n");
@@ -335,7 +352,7 @@ error:
 *
 */
 
-struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev)
 {
 	int err;
 	struct inode *inode;
@@ -348,7 +365,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	err = v9fs_init_inode(v9ses, inode, mode);
+	err = v9fs_init_inode(v9ses, inode, mode, rdev);
 	if (err) {
 		iput(inode);
 		return ERR_PTR(err);
@@ -435,11 +452,12 @@ void v9fs_evict_inode(struct inode *inode)
 static int v9fs_test_inode(struct inode *inode, void *data)
 {
 	int umode;
+	dev_t rdev;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 	struct p9_wstat *st = (struct p9_wstat *)data;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
 
-	umode = p9mode2unixmode(v9ses, st->mode);
+	umode = p9mode2unixmode(v9ses, st, &rdev);
 	/* don't match inode of different type */
 	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
 		return 0;
@@ -473,6 +491,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 				   struct p9_wstat *st,
 				   int new)
 {
+	dev_t rdev;
 	int retval, umode;
 	unsigned long i_ino;
 	struct inode *inode;
@@ -496,8 +515,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 	 * later.
 	 */
 	inode->i_ino = i_ino;
-	umode = p9mode2unixmode(v9ses, st->mode);
-	retval = v9fs_init_inode(v9ses, inode, umode);
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	retval = v9fs_init_inode(v9ses, inode, umode, rdev);
 	if (retval)
 		goto error;
 
@@ -532,6 +551,19 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 }
 
 /**
+ * v9fs_at_to_dotl_flags- convert Linux specific AT flags to
+ * plan 9 AT flag.
+ * @flags: flags to convert
+ */
+static int v9fs_at_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+	if (flags & AT_REMOVEDIR)
+		rflags |= P9_DOTL_AT_REMOVEDIR;
+	return rflags;
+}
+
+/**
 * v9fs_remove - helper function to remove files and directories
 * @dir: directory inode that is being deleted
 * @dentry: dentry that is being deleted
@@ -558,7 +590,8 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
 		return retval;
 	}
 	if (v9fs_proto_dotl(v9ses))
-		retval = p9_client_unlinkat(dfid, dentry->d_name.name, flags);
+		retval = p9_client_unlinkat(dfid, dentry->d_name.name,
+					    v9fs_at_to_dotl_flags(flags));
 	if (retval == -EOPNOTSUPP) {
 		/* Try the one based on path */
 		v9fid = v9fs_fid_clone(dentry);
@@ -645,13 +678,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
 		goto error;
-
+	d_instantiate(dentry, inode);
 	return ofid;
-
 error:
 	if (ofid)
 		p9_client_clunk(ofid);
@@ -792,6 +823,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 			       struct nameidata *nameidata)
 {
+	struct dentry *res;
 	struct super_block *sb;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *dfid, *fid;
@@ -823,22 +855,35 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 		return ERR_PTR(result);
 	}
-
-	inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+	/*
+	 * Make sure we don't use a wrong inode due to parallel
+	 * unlink. For cached mode create calls request for new
+	 * inode. But with cache disabled, lookup should do this.
+	 */
+	if (v9ses->cache)
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+	else
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		result = PTR_ERR(inode);
 		inode = NULL;
 		goto error;
 	}
-
 	result = v9fs_fid_add(dentry, fid);
 	if (result < 0)
 		goto error_iput;
-
 inst_out:
-	d_add(dentry, inode);
-	return NULL;
-
+	/*
+	 * If we had a rename on the server and a parallel lookup
+	 * for the new name, then make sure we instantiate with
+	 * the new name. ie look up for a/b, while on server somebody
+	 * moved b under k and client parallely did a lookup for
+	 * k/b.
+	 */
+	res = d_materialise_unique(dentry, inode);
+	if (!IS_ERR(res))
+		return res;
+	result = PTR_ERR(res);
 error_iput:
 	iput(inode);
 error:
@@ -1002,7 +1047,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		return PTR_ERR(st);
 
 	v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
-		generic_fillattr(dentry->d_inode, stat);
+	generic_fillattr(dentry->d_inode, stat);
 
 	p9stat_free(st);
 	kfree(st);
@@ -1086,6 +1131,7 @@ void
 v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 		 struct super_block *sb)
 {
+	mode_t mode;
 	char ext[32];
 	char tag_name[14];
 	unsigned int i_nlink;
@@ -1121,31 +1167,9 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 			inode->i_nlink = i_nlink;
 		}
 	}
-	inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
-	if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
-		char type = 0;
-		int major = -1;
-		int minor = -1;
-
-		strncpy(ext, stat->extension, sizeof(ext));
-		sscanf(ext, "%c %u %u", &type, &major, &minor);
-		switch (type) {
-		case 'c':
-			inode->i_mode &= ~S_IFBLK;
-			inode->i_mode |= S_IFCHR;
-			break;
-		case 'b':
-			break;
-		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "Unknown special type %c %s\n", type,
-				   stat->extension);
-		};
-		inode->i_rdev = MKDEV(major, minor);
-		init_special_inode(inode, inode->i_mode, inode->i_rdev);
-	} else
-		inode->i_rdev = 0;
-
+	mode = stat->mode & S_IALLUGO;
+	mode |= inode->i_mode & ~S_IALLUGO;
+	inode->i_mode = mode;
 	i_size_write(inode, stat->length);
 
 	/* not real number of blocks, but 512 byte ones ... */
@@ -1411,6 +1435,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 
 int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 {
+	int umode;
+	dev_t rdev;
 	loff_t i_size;
 	struct p9_wstat *st;
 	struct v9fs_session_info *v9ses;
@@ -1419,6 +1445,12 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	st = p9_client_stat(fid);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+		goto out;
 
 	spin_lock(&inode->i_lock);
 	/*
@@ -1430,6 +1462,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	if (v9ses->cache)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
+out:
 	p9stat_free(st);
 	kfree(st);
 	return 0;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index b6c8ed205192..aded79fcd5cf 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -153,7 +153,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
 	 * later.
 	 */
 	inode->i_ino = i_ino;
-	retval = v9fs_init_inode(v9ses, inode, st->st_mode);
+	retval = v9fs_init_inode(v9ses, inode,
+				 st->st_mode, new_decode_dev(st->st_rdev));
 	if (retval)
 		goto error;
 
@@ -190,6 +191,58 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	return inode;
 }
 
+struct dotl_openflag_map {
+	int open_flag;
+	int dotl_flag;
+};
+
+static int v9fs_mapped_dotl_flags(int flags)
+{
+	int i;
+	int rflags = 0;
+	struct dotl_openflag_map dotl_oflag_map[] = {
+		{ O_CREAT,	P9_DOTL_CREATE },
+		{ O_EXCL,	P9_DOTL_EXCL },
+		{ O_NOCTTY,	P9_DOTL_NOCTTY },
+		{ O_TRUNC,	P9_DOTL_TRUNC },
+		{ O_APPEND,	P9_DOTL_APPEND },
+		{ O_NONBLOCK,	P9_DOTL_NONBLOCK },
+		{ O_DSYNC,	P9_DOTL_DSYNC },
+		{ FASYNC,	P9_DOTL_FASYNC },
+		{ O_DIRECT,	P9_DOTL_DIRECT },
+		{ O_LARGEFILE,	P9_DOTL_LARGEFILE },
+		{ O_DIRECTORY,	P9_DOTL_DIRECTORY },
+		{ O_NOFOLLOW,	P9_DOTL_NOFOLLOW },
+		{ O_NOATIME,	P9_DOTL_NOATIME },
+		{ O_CLOEXEC,	P9_DOTL_CLOEXEC },
+		{ O_SYNC,	P9_DOTL_SYNC},
+	};
+	for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) {
+		if (flags & dotl_oflag_map[i].open_flag)
+			rflags |= dotl_oflag_map[i].dotl_flag;
+	}
+	return rflags;
+}
+
+/**
+ * v9fs_open_to_dotl_flags- convert Linux specific open flags to
+ * plan 9 open flag.
+ * @flags: flags to convert
+ */
+int v9fs_open_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+
+	/*
+	 * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY
+	 * and P9_DOTL_NOACCESS
+	 */
+	rflags |= flags & O_ACCMODE;
+	rflags |= v9fs_mapped_dotl_flags(flags);
+
+	return rflags;
+}
+
 /**
 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
 * @dir: directory inode that is being created
@@ -258,7 +311,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			   "Failed to get acl values in creat %d\n", err);
 		goto error;
 	}
-	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+	err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
+				    mode, gid, &qid);
 	if (err < 0) {
 		P9_DPRINTK(P9_DEBUG_VFS,
 			   "p9_client_open_dotl failed in creat %d\n",
@@ -281,10 +335,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
 		goto error;
+	d_instantiate(dentry, inode);
 
 	/* Now set the ACL based on the default value */
 	v9fs_set_create_acl(dentry, &dacl, &pacl);
@@ -403,10 +457,10 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 				 err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/*
@@ -414,7 +468,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 		 * inode with stat. We need to get an inode
 		 * so that we can set the acl with dentry
 		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
+		inode = v9fs_get_inode(dir->i_sb, mode, 0);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -540,6 +594,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 void
 v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 {
+	mode_t mode;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
@@ -552,11 +607,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 		inode->i_uid = stat->st_uid;
 		inode->i_gid = stat->st_gid;
 		inode->i_nlink = stat->st_nlink;
-		inode->i_mode = stat->st_mode;
-		inode->i_rdev = new_decode_dev(stat->st_rdev);
 
-		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
-			init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		mode = stat->st_mode & S_IALLUGO;
+		mode |= inode->i_mode & ~S_IALLUGO;
+		inode->i_mode = mode;
 
 		i_size_write(inode, stat->st_size);
 		inode->i_blocks = stat->st_blocks;
@@ -657,14 +711,14 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 				 err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/* Not in cached mode. No need to populate inode with stat */
-		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+		inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -810,17 +864,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			   err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/*
 		 * Not in cached mode. No need to populate inode with stat.
 		 * socket syscall returns a fd, so we need instantiate
 		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
+		inode = v9fs_get_inode(dir->i_sb, mode, rdev);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -886,6 +940,11 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+		goto out;
 
 	spin_lock(&inode->i_lock);
 	/*
@@ -897,6 +956,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	if (v9ses->cache)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
+out:
 	kfree(st);
 	return 0;
 }
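A short sketch of how the new v9fs_open_to_dotl_flags() composes its result:
the O_ACCMODE bits pass through unchanged (the patch notes the dotl encoding
uses the same low bits), and every other open flag is translated through an
explicit table. The DOTL_* values below are placeholders for illustration,
not the real P9_DOTL_* constants:

	#include <fcntl.h>
	#include <stdio.h>

	#define DOTL_CREATE 0x40	/* placeholder value */
	#define DOTL_TRUNC  0x200	/* placeholder value */

	static int open_to_dotl(int flags)
	{
		int r = flags & O_ACCMODE;	/* access mode passes through */

		if (flags & O_CREAT)
			r |= DOTL_CREATE;
		if (flags & O_TRUNC)
			r |= DOTL_TRUNC;
		return r;
	}

	int main(void)
	{
		printf("0x%x\n", open_to_dotl(O_WRONLY | O_CREAT | O_TRUNC));
		return 0;
	}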
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index feef6cdc1fd2..c70251d47ed1 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -149,7 +149,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	else
 		sb->s_d_op = &v9fs_dentry_operations;
 
-	inode = v9fs_get_inode(sb, S_IFDIR | mode);
+	inode = v9fs_get_inode(sb, S_IFDIR | mode, 0);
 	if (IS_ERR(inode)) {
 		retval = PTR_ERR(inode);
 		goto release_sb;
diff --git a/fs/Makefile b/fs/Makefile
index afc109691a9b..d2c3353d5477 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
-obj-$(CONFIG_EXOFS_FS)		+= exofs/
+obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
diff --git a/fs/attr.c b/fs/attr.c
index 538e27959d3f..7ee7ba488313 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -13,6 +13,7 @@
 #include <linux/fsnotify.h>
 #include <linux/fcntl.h>
 #include <linux/security.h>
+#include <linux/evm.h>
 
 /**
 * inode_change_ok - check if attribute changes to an inode are allowed
@@ -237,8 +238,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	else
 		error = simple_setattr(dentry, attr);
 
-	if (!error)
+	if (!error) {
 		fsnotify_change(dentry, ia_valid);
+		evm_inode_post_setattr(dentry, ia_valid);
+	}
 
 	return error;
 }
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 475f9c597cb7..326dc08d3e3f 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -39,27 +39,17 @@
 
 /* #define DEBUG */
 
-#ifdef DEBUG
-#define DPRINTK(fmt, args...) \
-do { \
-	printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \
-		current->pid, __func__, ##args); \
-} while (0)
-#else
-#define DPRINTK(fmt, args...) do {} while (0)
-#endif
-
-#define AUTOFS_WARN(fmt, args...) \
-do { \
+#define DPRINTK(fmt, ...) \
+	pr_debug("pid %d: %s: " fmt "\n", \
+		current->pid, __func__, ##__VA_ARGS__)
+
+#define AUTOFS_WARN(fmt, ...) \
 	printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
-		current->pid, __func__, ##args); \
-} while (0)
+		current->pid, __func__, ##__VA_ARGS__)
 
-#define AUTOFS_ERROR(fmt, args...) \
-do { \
+#define AUTOFS_ERROR(fmt, ...) \
 	printk(KERN_ERR "pid %d: %s: " fmt "\n", \
-		current->pid, __func__, ##args); \
-} while (0)
+		current->pid, __func__, ##__VA_ARGS__)
 
 /* Unified info structure. This is pointed to by both the dentry and
    inode structures. Each file in the filesystem has an instance of this
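The macro rewrite above moves from GCC's named variadic form ("args...") to
C99 "..." with __VA_ARGS__; the "##" before __VA_ARGS__ (itself a GNU
extension) drops the trailing comma when a macro is invoked with no variadic
arguments, and pr_debug() already compiles to nothing unless DEBUG or dynamic
debug is enabled, which is why the #ifdef DEBUG wrapper can go. A
self-contained demonstration:

	#include <stdio.h>

	#define WARN(fmt, ...) \
		fprintf(stderr, "pid %d: %s: " fmt "\n", \
			1234, __func__, ##__VA_ARGS__)

	int main(void)
	{
		WARN("plain message");	/* no varargs: ## swallows the comma */
		WARN("value = %d", 42);	/* with varargs */
		return 0;
	}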
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 25435987d6ae..e1fbdeef85db 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	size_t pktsz;
 
 	DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
-		wq->wait_queue_token, wq->name.len, wq->name.name, type);
+		(unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
 
 	memset(&pkt,0,sizeof pkt); /* For security reasons */
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 54b8c28bebc8..720d885e8dca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -474,17 +474,22 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		befs_data_stream *data = &befs_ino->i_data.ds;
 		befs_off_t len = data->size;
 
-		befs_debug(sb, "Follow long symlink");
-
-		link = kmalloc(len, GFP_NOFS);
-		if (!link) {
-			link = ERR_PTR(-ENOMEM);
-		} else if (befs_read_lsymlink(sb, data, link, len) != len) {
-			kfree(link);
-			befs_error(sb, "Failed to read entire long symlink");
+		if (len == 0) {
+			befs_error(sb, "Long symlink with illegal length");
 			link = ERR_PTR(-EIO);
 		} else {
-			link[len - 1] = '\0';
+			befs_debug(sb, "Follow long symlink");
+
+			link = kmalloc(len, GFP_NOFS);
+			if (!link) {
+				link = ERR_PTR(-ENOMEM);
+			} else if (befs_read_lsymlink(sb, data, link, len) != len) {
+				kfree(link);
+				befs_error(sb, "Failed to read entire long symlink");
+				link = ERR_PTR(-EIO);
+			} else {
+				link[len - 1] = '\0';
+			}
 		}
 	} else {
 		link = befs_ino->i_data.symlink;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ff77262e887c..95f786ec7f08 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1429,6 +1429,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		WARN_ON_ONCE(bdev->bd_holders);
 		sync_blockdev(bdev);
 		kill_bdev(bdev);
+		/* ->release can cause the old bdi to disappear,
+		 * so must switch it out first
+		 */
+		bdev_inode_switch_bdi(bdev->bd_inode,
+					&default_backing_dev_info);
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
@@ -1442,8 +1447,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
-		bdev_inode_switch_bdi(bdev->bd_inode,
-					&default_backing_dev_info);
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 502b9e988679..d9f99a16edd6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -176,7 +176,11 @@ static inline u64 btrfs_ino(struct inode *inode)
 {
 	u64 ino = BTRFS_I(inode)->location.objectid;
 
-	if (ino <= BTRFS_FIRST_FREE_OBJECTID)
+	/*
+	 * !ino: btree_inode
+	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
+	 */
+	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
 		ino = inode->i_ino;
 	return ino;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0469263e327e..03912c5c6f49 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1415,17 +1415,15 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
 static inline u##bits btrfs_##name(struct extent_buffer *eb) \
 { \
-	type *p = kmap_atomic(eb->first_page, KM_USER0); \
+	type *p = page_address(eb->first_page); \
 	u##bits res = le##bits##_to_cpu(p->member); \
-	kunmap_atomic(p, KM_USER0); \
 	return res; \
 } \
 static inline void btrfs_set_##name(struct extent_buffer *eb, \
 				    u##bits val) \
 { \
-	type *p = kmap_atomic(eb->first_page, KM_USER0); \
+	type *p = page_address(eb->first_page); \
 	p->member = cpu_to_le##bits(val); \
-	kunmap_atomic(p, KM_USER0); \
 }
 
 #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
@@ -2367,8 +2365,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv, int update_ref);
+void btrfs_drop_snapshot(struct btrfs_root *root,
+			 struct btrfs_block_rsv *block_rsv, int update_ref);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct extent_buffer *node,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 66bac226944e..f5be06a2462f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1782,6 +1782,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1782 1782
1783 1783
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1784 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard)
1786 continue;
1787
1785 ret = btrfs_issue_discard(stripe->dev->bdev, 1788 ret = btrfs_issue_discard(stripe->dev->bdev,
1786 stripe->physical, 1789 stripe->physical,
1787 stripe->length); 1790 stripe->length);
@@ -1789,11 +1792,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1789 discarded_bytes += stripe->length; 1792 discarded_bytes += stripe->length;
1790 else if (ret != -EOPNOTSUPP) 1793 else if (ret != -EOPNOTSUPP)
1791 break; 1794 break;
1795
1796 /*
1797 * Just in case we get back EOPNOTSUPP for some reason,
1798 * just ignore the return value so we don't screw up
1799 * people calling discard_extent.
1800 */
1801 ret = 0;
1792 } 1802 }
1793 kfree(multi); 1803 kfree(multi);
1794 } 1804 }
1795 if (discarded_bytes && ret == -EOPNOTSUPP)
1796 ret = 0;
1797 1805
1798 if (actual_bytes) 1806 if (actual_bytes)
1799 *actual_bytes = discarded_bytes; 1807 *actual_bytes = discarded_bytes;
@@ -6269,8 +6277,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6269 * also make sure backrefs for the shared block and all lower level 6277 * also make sure backrefs for the shared block and all lower level
6270 * blocks are properly updated. 6278 * blocks are properly updated.
6271 */ 6279 */
6272int btrfs_drop_snapshot(struct btrfs_root *root, 6280void btrfs_drop_snapshot(struct btrfs_root *root,
6273 struct btrfs_block_rsv *block_rsv, int update_ref) 6281 struct btrfs_block_rsv *block_rsv, int update_ref)
6274{ 6282{
6275 struct btrfs_path *path; 6283 struct btrfs_path *path;
6276 struct btrfs_trans_handle *trans; 6284 struct btrfs_trans_handle *trans;
@@ -6283,13 +6291,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6283 int level; 6291 int level;
6284 6292
6285 path = btrfs_alloc_path(); 6293 path = btrfs_alloc_path();
6286 if (!path) 6294 if (!path) {
6287 return -ENOMEM; 6295 err = -ENOMEM;
6296 goto out;
6297 }
6288 6298
6289 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6299 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6290 if (!wc) { 6300 if (!wc) {
6291 btrfs_free_path(path); 6301 btrfs_free_path(path);
6292 return -ENOMEM; 6302 err = -ENOMEM;
6303 goto out;
6293 } 6304 }
6294 6305
6295 trans = btrfs_start_transaction(tree_root, 0); 6306 trans = btrfs_start_transaction(tree_root, 0);
@@ -6318,7 +6329,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6318 path->lowest_level = 0; 6329 path->lowest_level = 0;
6319 if (ret < 0) { 6330 if (ret < 0) {
6320 err = ret; 6331 err = ret;
6321 goto out; 6332 goto out_free;
6322 } 6333 }
6323 WARN_ON(ret > 0); 6334 WARN_ON(ret > 0);
6324 6335
@@ -6425,11 +6436,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6425 free_extent_buffer(root->commit_root); 6436 free_extent_buffer(root->commit_root);
6426 kfree(root); 6437 kfree(root);
6427 } 6438 }
6428out: 6439out_free:
6429 btrfs_end_transaction_throttle(trans, tree_root); 6440 btrfs_end_transaction_throttle(trans, tree_root);
6430 kfree(wc); 6441 kfree(wc);
6431 btrfs_free_path(path); 6442 btrfs_free_path(path);
6432 return err; 6443out:
6444 if (err)
6445 btrfs_std_error(root->fs_info, err);
6446 return;
6433} 6447}
6434 6448
6435/* 6449/*
@@ -6720,6 +6734,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6720 struct btrfs_space_info *space_info; 6734 struct btrfs_space_info *space_info;
6721 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 6735 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6722 struct btrfs_device *device; 6736 struct btrfs_device *device;
6737 u64 min_free;
6738 u64 dev_min = 1;
6739 u64 dev_nr = 0;
6740 int index;
6723 int full = 0; 6741 int full = 0;
6724 int ret = 0; 6742 int ret = 0;
6725 6743
@@ -6729,8 +6747,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6729 if (!block_group) 6747 if (!block_group)
6730 return -1; 6748 return -1;
6731 6749
6750 min_free = btrfs_block_group_used(&block_group->item);
6751
6732 /* no bytes used, we're good */ 6752 /* no bytes used, we're good */
6733 if (!btrfs_block_group_used(&block_group->item)) 6753 if (!min_free)
6734 goto out; 6754 goto out;
6735 6755
6736 space_info = block_group->space_info; 6756 space_info = block_group->space_info;
@@ -6746,10 +6766,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6746 * all of the extents from this block group. If we can, we're good 6766 * all of the extents from this block group. If we can, we're good
6747 */ 6767 */
6748 if ((space_info->total_bytes != block_group->key.offset) && 6768 if ((space_info->total_bytes != block_group->key.offset) &&
6749 (space_info->bytes_used + space_info->bytes_reserved + 6769 (space_info->bytes_used + space_info->bytes_reserved +
6750 space_info->bytes_pinned + space_info->bytes_readonly + 6770 space_info->bytes_pinned + space_info->bytes_readonly +
6751 btrfs_block_group_used(&block_group->item) < 6771 min_free < space_info->total_bytes)) {
6752 space_info->total_bytes)) {
6753 spin_unlock(&space_info->lock); 6772 spin_unlock(&space_info->lock);
6754 goto out; 6773 goto out;
6755 } 6774 }
@@ -6766,9 +6785,31 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6766 if (full) 6785 if (full)
6767 goto out; 6786 goto out;
6768 6787
6788 /*
6789 * index:
6790 * 0: raid10
6791 * 1: raid1
6792 * 2: dup
6793 * 3: raid0
6794 * 4: single
6795 */
6796 index = get_block_group_index(block_group);
6797 if (index == 0) {
6798 dev_min = 4;
6799 /* Divide by 2 */
6800 min_free >>= 1;
6801 } else if (index == 1) {
6802 dev_min = 2;
6803 } else if (index == 2) {
6804 /* Multiply by 2 */
6805 min_free <<= 1;
6806 } else if (index == 3) {
6807 dev_min = fs_devices->rw_devices;
6808 do_div(min_free, dev_min);
6809 }
6810
6769 mutex_lock(&root->fs_info->chunk_mutex); 6811 mutex_lock(&root->fs_info->chunk_mutex);
6770 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 6812 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
6771 u64 min_free = btrfs_block_group_used(&block_group->item);
6772 u64 dev_offset; 6813 u64 dev_offset;
6773 6814
6774 /* 6815 /*
@@ -6779,7 +6820,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6779 ret = find_free_dev_extent(NULL, device, min_free, 6820 ret = find_free_dev_extent(NULL, device, min_free,
6780 &dev_offset, NULL); 6821 &dev_offset, NULL);
6781 if (!ret) 6822 if (!ret)
6823 dev_nr++;
6824
6825 if (dev_nr >= dev_min)
6782 break; 6826 break;
6827
6783 ret = -1; 6828 ret = -1;
6784 } 6829 }
6785 } 6830 }
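
In effect, the new relocation check derates min_free and raises the required device count per RAID profile before scanning for free dev extents. A minimal userspace sketch of that arithmetic (adjust_for_profile and its parameters are illustrative names, not kernel API; assumes rw_devices >= 1 and the profile index shown in the comment above):

        #include <stdint.h>

        static void adjust_for_profile(int index, uint64_t rw_devices,
                                       uint64_t *min_free, uint64_t *dev_min)
        {
                *dev_min = 1;
                if (index == 0) {               /* raid10: striped mirrors */
                        *dev_min = 4;
                        *min_free >>= 1;        /* each stripe holds half */
                } else if (index == 1) {        /* raid1: two full copies */
                        *dev_min = 2;
                } else if (index == 2) {        /* dup: two copies, one device */
                        *min_free <<= 1;
                } else if (index == 3) {        /* raid0: spread over all rw devices */
                        *dev_min = rw_devices;
                        *min_free /= *dev_min;  /* do_div() in the kernel */
                }
        }

The device loop then counts devices that can fit min_free and declares the block group relocatable once dev_nr reaches dev_min.
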
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b910694f61ed..a1cb7821becd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -183,8 +183,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
183 * read from the commit root and sidestep a nasty deadlock 183 * read from the commit root and sidestep a nasty deadlock
184 * between reading the free space cache and updating the csum tree. 184 * between reading the free space cache and updating the csum tree.
185 */ 185 */
186 if (btrfs_is_free_space_inode(root, inode)) 186 if (btrfs_is_free_space_inode(root, inode)) {
187 path->search_commit_root = 1; 187 path->search_commit_root = 1;
188 path->skip_locking = 1;
189 }
188 190
189 disk_bytenr = (u64)bio->bi_sector << 9; 191 disk_bytenr = (u64)bio->bi_sector << 9;
190 if (dio) 192 if (dio)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 658d66959abe..1266f6e9cdb2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -150,6 +150,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
150 spin_lock(&root->fs_info->defrag_inodes_lock); 150 spin_lock(&root->fs_info->defrag_inodes_lock);
151 if (!BTRFS_I(inode)->in_defrag) 151 if (!BTRFS_I(inode)->in_defrag)
152 __btrfs_add_inode_defrag(inode, defrag); 152 __btrfs_add_inode_defrag(inode, defrag);
153 else
154 kfree(defrag);
153 spin_unlock(&root->fs_info->defrag_inodes_lock); 155 spin_unlock(&root->fs_info->defrag_inodes_lock);
154 return 0; 156 return 0;
155} 157}
@@ -1034,11 +1036,13 @@ out:
1034 * on error we return an unlocked page and the error value 1036 * on error we return an unlocked page and the error value
1035 * on success we return a locked page and 0 1037 * on success we return a locked page and 0
1036 */ 1038 */
1037static int prepare_uptodate_page(struct page *page, u64 pos) 1039static int prepare_uptodate_page(struct page *page, u64 pos,
1040 bool force_uptodate)
1038{ 1041{
1039 int ret = 0; 1042 int ret = 0;
1040 1043
1041 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { 1044 if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1045 !PageUptodate(page)) {
1042 ret = btrfs_readpage(NULL, page); 1046 ret = btrfs_readpage(NULL, page);
1043 if (ret) 1047 if (ret)
1044 return ret; 1048 return ret;
@@ -1059,7 +1063,7 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
1059static noinline int prepare_pages(struct btrfs_root *root, struct file *file, 1063static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1060 struct page **pages, size_t num_pages, 1064 struct page **pages, size_t num_pages,
1061 loff_t pos, unsigned long first_index, 1065 loff_t pos, unsigned long first_index,
1062 size_t write_bytes) 1066 size_t write_bytes, bool force_uptodate)
1063{ 1067{
1064 struct extent_state *cached_state = NULL; 1068 struct extent_state *cached_state = NULL;
1065 int i; 1069 int i;
@@ -1073,12 +1077,6 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1073 start_pos = pos & ~((u64)root->sectorsize - 1); 1077 start_pos = pos & ~((u64)root->sectorsize - 1);
1074 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 1078 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
1075 1079
1076 if (start_pos > inode->i_size) {
1077 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
1078 if (err)
1079 return err;
1080 }
1081
1082again: 1080again:
1083 for (i = 0; i < num_pages; i++) { 1081 for (i = 0; i < num_pages; i++) {
1084 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1082 pages[i] = find_or_create_page(inode->i_mapping, index + i,
@@ -1090,10 +1088,11 @@ again:
1090 } 1088 }
1091 1089
1092 if (i == 0) 1090 if (i == 0)
1093 err = prepare_uptodate_page(pages[i], pos); 1091 err = prepare_uptodate_page(pages[i], pos,
1092 force_uptodate);
1094 if (i == num_pages - 1) 1093 if (i == num_pages - 1)
1095 err = prepare_uptodate_page(pages[i], 1094 err = prepare_uptodate_page(pages[i],
1096 pos + write_bytes); 1095 pos + write_bytes, false);
1097 if (err) { 1096 if (err) {
1098 page_cache_release(pages[i]); 1097 page_cache_release(pages[i]);
1099 faili = i - 1; 1098 faili = i - 1;
@@ -1162,6 +1161,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1162 size_t num_written = 0; 1161 size_t num_written = 0;
1163 int nrptrs; 1162 int nrptrs;
1164 int ret = 0; 1163 int ret = 0;
1164 bool force_page_uptodate = false;
1165 1165
1166 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1166 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1167 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1167 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1204,7 +1204,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1204 * contents of pages from loop to loop 1204 * contents of pages from loop to loop
1205 */ 1205 */
1206 ret = prepare_pages(root, file, pages, num_pages, 1206 ret = prepare_pages(root, file, pages, num_pages,
1207 pos, first_index, write_bytes); 1207 pos, first_index, write_bytes,
1208 force_page_uptodate);
1208 if (ret) { 1209 if (ret) {
1209 btrfs_delalloc_release_space(inode, 1210 btrfs_delalloc_release_space(inode,
1210 num_pages << PAGE_CACHE_SHIFT); 1211 num_pages << PAGE_CACHE_SHIFT);
@@ -1221,12 +1222,15 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1221 if (copied < write_bytes) 1222 if (copied < write_bytes)
1222 nrptrs = 1; 1223 nrptrs = 1;
1223 1224
1224 if (copied == 0) 1225 if (copied == 0) {
1226 force_page_uptodate = true;
1225 dirty_pages = 0; 1227 dirty_pages = 0;
1226 else 1228 } else {
1229 force_page_uptodate = false;
1227 dirty_pages = (copied + offset + 1230 dirty_pages = (copied + offset +
1228 PAGE_CACHE_SIZE - 1) >> 1231 PAGE_CACHE_SIZE - 1) >>
1229 PAGE_CACHE_SHIFT; 1232 PAGE_CACHE_SHIFT;
1233 }
1230 1234
1231 /* 1235 /*
 1232 * If we had a short copy we need to release the excess delalloc 1236
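
The force_page_uptodate dance above handles the pathological short-copy case. A compact userspace model of the accounting (account_copied is a hypothetical helper, not kernel API):

        #include <stdbool.h>
        #include <stddef.h>

        /* Returns the number of pages dirtied by this pass. A copy that made
         * no progress forces the next pass to read the first page fully
         * before retrying, so a later partial copy cannot expose stale page
         * contents.
         */
        static size_t account_copied(size_t copied, size_t offset,
                                     size_t page_size,
                                     bool *force_page_uptodate)
        {
                if (copied == 0) {
                        *force_page_uptodate = true;
                        return 0;
                }
                *force_page_uptodate = false;
                return (copied + offset + page_size - 1) / page_size;
        }
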
@@ -1336,6 +1340,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1336 struct inode *inode = fdentry(file)->d_inode; 1340 struct inode *inode = fdentry(file)->d_inode;
1337 struct btrfs_root *root = BTRFS_I(inode)->root; 1341 struct btrfs_root *root = BTRFS_I(inode)->root;
1338 loff_t *ppos = &iocb->ki_pos; 1342 loff_t *ppos = &iocb->ki_pos;
1343 u64 start_pos;
1339 ssize_t num_written = 0; 1344 ssize_t num_written = 0;
1340 ssize_t err = 0; 1345 ssize_t err = 0;
1341 size_t count, ocount; 1346 size_t count, ocount;
@@ -1384,6 +1389,15 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1384 file_update_time(file); 1389 file_update_time(file);
1385 BTRFS_I(inode)->sequence++; 1390 BTRFS_I(inode)->sequence++;
1386 1391
1392 start_pos = round_down(pos, root->sectorsize);
1393 if (start_pos > i_size_read(inode)) {
1394 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
1395 if (err) {
1396 mutex_unlock(&inode->i_mutex);
1397 goto out;
1398 }
1399 }
1400
1387 if (unlikely(file->f_flags & O_DIRECT)) { 1401 if (unlikely(file->f_flags & O_DIRECT)) {
1388 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1402 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1389 pos, ppos, count, ocount); 1403 pos, ppos, count, ocount);
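
Moving btrfs_cont_expand() out of prepare_pages() and into btrfs_file_aio_write() means the hole-filling decision is made once, against the sector-aligned write start. A one-function sketch of the trigger (write_needs_expand is an illustrative name; assumes a power-of-two sectorsize):

        #include <stdint.h>

        static int write_needs_expand(uint64_t pos, uint64_t i_size,
                                      uint64_t sectorsize)
        {
                uint64_t start_pos = pos & ~(sectorsize - 1);   /* round_down() */

                /* only a start strictly past EOF needs a hole filled first */
                return start_pos > i_size;
        }
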
@@ -1638,11 +1652,15 @@ static long btrfs_fallocate(struct file *file, int mode,
1638 1652
1639 cur_offset = alloc_start; 1653 cur_offset = alloc_start;
1640 while (1) { 1654 while (1) {
1655 u64 actual_end;
1656
1641 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 1657 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1642 alloc_end - cur_offset, 0); 1658 alloc_end - cur_offset, 0);
1643 BUG_ON(IS_ERR_OR_NULL(em)); 1659 BUG_ON(IS_ERR_OR_NULL(em));
1644 last_byte = min(extent_map_end(em), alloc_end); 1660 last_byte = min(extent_map_end(em), alloc_end);
1661 actual_end = min_t(u64, extent_map_end(em), offset + len);
1645 last_byte = (last_byte + mask) & ~mask; 1662 last_byte = (last_byte + mask) & ~mask;
1663
1646 if (em->block_start == EXTENT_MAP_HOLE || 1664 if (em->block_start == EXTENT_MAP_HOLE ||
1647 (cur_offset >= inode->i_size && 1665 (cur_offset >= inode->i_size &&
1648 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
@@ -1655,6 +1673,16 @@ static long btrfs_fallocate(struct file *file, int mode,
1655 free_extent_map(em); 1673 free_extent_map(em);
1656 break; 1674 break;
1657 } 1675 }
1676 } else if (actual_end > inode->i_size &&
1677 !(mode & FALLOC_FL_KEEP_SIZE)) {
1678 /*
1679 * We didn't need to allocate any more space, but we
1680 * still extended the size of the file so we need to
1681 * update i_size.
1682 */
1683 inode->i_ctime = CURRENT_TIME;
1684 i_size_write(inode, actual_end);
1685 btrfs_ordered_update_i_size(inode, actual_end, NULL);
1658 } 1686 }
1659 free_extent_map(em); 1687 free_extent_map(em);
1660 1688
@@ -1793,10 +1821,15 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
1793 switch (origin) { 1821 switch (origin) {
1794 case SEEK_END: 1822 case SEEK_END:
1795 case SEEK_CUR: 1823 case SEEK_CUR:
1796 offset = generic_file_llseek_unlocked(file, offset, origin); 1824 offset = generic_file_llseek(file, offset, origin);
1797 goto out; 1825 goto out;
1798 case SEEK_DATA: 1826 case SEEK_DATA:
1799 case SEEK_HOLE: 1827 case SEEK_HOLE:
1828 if (offset >= i_size_read(inode)) {
1829 mutex_unlock(&inode->i_mutex);
1830 return -ENXIO;
1831 }
1832
1800 ret = find_desired_extent(inode, &offset, origin); 1833 ret = find_desired_extent(inode, &offset, origin);
1801 if (ret) { 1834 if (ret) {
1802 mutex_unlock(&inode->i_mutex); 1835 mutex_unlock(&inode->i_mutex);
@@ -1804,10 +1837,14 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
1804 } 1837 }
1805 } 1838 }
1806 1839
1807 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 1840 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
1808 return -EINVAL; 1841 offset = -EINVAL;
1809 if (offset > inode->i_sb->s_maxbytes) 1842 goto out;
1810 return -EINVAL; 1843 }
1844 if (offset > inode->i_sb->s_maxbytes) {
1845 offset = -EINVAL;
1846 goto out;
1847 }
1811 1848
1812 /* Special lock needed here? */ 1849 /* Special lock needed here? */
1813 if (offset != file->f_pos) { 1850 if (offset != file->f_pos) {
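
The llseek rework replaces early returns (which leaked i_mutex) with error values that fall through to the common unlock path, and rejects SEEK_DATA/SEEK_HOLE beyond EOF up front. Roughly, the validation reduces to this userspace model (validate_seek and the *_OP constants are illustrative; the real code runs find_desired_extent() between the two groups of checks):

        #include <errno.h>
        #include <sys/types.h>

        #define SEEK_DATA_OP    3
        #define SEEK_HOLE_OP    4

        static off_t validate_seek(int origin, off_t offset, off_t i_size,
                                   off_t s_maxbytes, int unsigned_offset_ok)
        {
                if ((origin == SEEK_DATA_OP || origin == SEEK_HOLE_OP) &&
                    offset >= i_size)
                        return -ENXIO;          /* nothing to find past EOF */
                if (offset < 0 && !unsigned_offset_ok)
                        return -EINVAL;
                if (offset > s_maxbytes)
                        return -EINVAL;
                return offset;
        }
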
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6377713f639c..41ac927401d0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -190,9 +190,11 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
190 struct btrfs_path *path, 190 struct btrfs_path *path,
191 struct inode *inode) 191 struct inode *inode)
192{ 192{
193 struct btrfs_block_rsv *rsv;
193 loff_t oldsize; 194 loff_t oldsize;
194 int ret = 0; 195 int ret = 0;
195 196
197 rsv = trans->block_rsv;
196 trans->block_rsv = root->orphan_block_rsv; 198 trans->block_rsv = root->orphan_block_rsv;
197 ret = btrfs_block_rsv_check(trans, root, 199 ret = btrfs_block_rsv_check(trans, root,
198 root->orphan_block_rsv, 200 root->orphan_block_rsv,
@@ -210,6 +212,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
210 */ 212 */
211 ret = btrfs_truncate_inode_items(trans, root, inode, 213 ret = btrfs_truncate_inode_items(trans, root, inode,
212 0, BTRFS_EXTENT_DATA_KEY); 214 0, BTRFS_EXTENT_DATA_KEY);
215
216 trans->block_rsv = rsv;
213 if (ret) { 217 if (ret) {
214 WARN_ON(1); 218 WARN_ON(1);
215 return ret; 219 return ret;
@@ -1168,9 +1172,9 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1168 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1172 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
1169} 1173}
1170 1174
1171static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1175static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1172 struct btrfs_free_space *info, u64 offset, 1176 struct btrfs_free_space *info,
1173 u64 bytes) 1177 u64 offset, u64 bytes)
1174{ 1178{
1175 unsigned long start, count; 1179 unsigned long start, count;
1176 1180
@@ -1181,6 +1185,13 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1181 bitmap_clear(info->bitmap, start, count); 1185 bitmap_clear(info->bitmap, start, count);
1182 1186
1183 info->bytes -= bytes; 1187 info->bytes -= bytes;
1188}
1189
1190static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1191 struct btrfs_free_space *info, u64 offset,
1192 u64 bytes)
1193{
1194 __bitmap_clear_bits(ctl, info, offset, bytes);
1184 ctl->free_space -= bytes; 1195 ctl->free_space -= bytes;
1185} 1196}
1186 1197
@@ -1984,7 +1995,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
1984 return 0; 1995 return 0;
1985 1996
1986 ret = search_start; 1997 ret = search_start;
1987 bitmap_clear_bits(ctl, entry, ret, bytes); 1998 __bitmap_clear_bits(ctl, entry, ret, bytes);
1988 1999
1989 return ret; 2000 return ret;
1990} 2001}
@@ -2039,7 +2050,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2039 continue; 2050 continue;
2040 } 2051 }
2041 } else { 2052 } else {
2042
2043 ret = entry->offset; 2053 ret = entry->offset;
2044 2054
2045 entry->offset += bytes; 2055 entry->offset += bytes;
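
The split into __bitmap_clear_bits() matters for accounting: cluster allocations already had their bytes removed from ctl->free_space when the cluster was set up, so clearing their bits must not subtract again. A minimal model (types and names here are illustrative):

        #include <stdint.h>

        struct free_space_ctl {
                uint64_t free_space;    /* filesystem-wide counter */
        };

        struct free_space_entry {
                uint64_t bytes;         /* bytes tracked by this entry */
        };

        /* Per-entry accounting only; for callers whose space was already
         * subtracted globally, e.g. at cluster setup. */
        static void __clear_bits(struct free_space_entry *info, uint64_t bytes)
        {
                info->bytes -= bytes;
        }

        /* Per-entry plus global accounting, for the normal path. */
        static void clear_bits(struct free_space_ctl *ctl,
                               struct free_space_entry *info, uint64_t bytes)
        {
                __clear_bits(info, bytes);
                ctl->free_space -= bytes;
        }
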
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 15fceefbca0a..b2d004ad66a0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1786,7 +1786,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1786 &ordered_extent->list); 1786 &ordered_extent->list);
1787 1787
1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1789 if (!ret) { 1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1790 ret = btrfs_update_inode(trans, root, inode); 1790 ret = btrfs_update_inode(trans, root, inode);
1791 BUG_ON(ret); 1791 BUG_ON(ret);
1792 } 1792 }
@@ -3510,15 +3510,19 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3510 err = btrfs_drop_extents(trans, inode, cur_offset, 3510 err = btrfs_drop_extents(trans, inode, cur_offset,
3511 cur_offset + hole_size, 3511 cur_offset + hole_size,
3512 &hint_byte, 1); 3512 &hint_byte, 1);
3513 if (err) 3513 if (err) {
3514 btrfs_end_transaction(trans, root);
3514 break; 3515 break;
3516 }
3515 3517
3516 err = btrfs_insert_file_extent(trans, root, 3518 err = btrfs_insert_file_extent(trans, root,
3517 btrfs_ino(inode), cur_offset, 0, 3519 btrfs_ino(inode), cur_offset, 0,
3518 0, hole_size, 0, hole_size, 3520 0, hole_size, 0, hole_size,
3519 0, 0, 0); 3521 0, 0, 0);
3520 if (err) 3522 if (err) {
3523 btrfs_end_transaction(trans, root);
3521 break; 3524 break;
3525 }
3522 3526
3523 btrfs_drop_extent_cache(inode, hole_start, 3527 btrfs_drop_extent_cache(inode, hole_start,
3524 last_byte - 1, 0); 3528 last_byte - 1, 0);
@@ -3952,7 +3956,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3952 struct btrfs_root *root, int *new) 3956 struct btrfs_root *root, int *new)
3953{ 3957{
3954 struct inode *inode; 3958 struct inode *inode;
3955 int bad_inode = 0;
3956 3959
3957 inode = btrfs_iget_locked(s, location->objectid, root); 3960 inode = btrfs_iget_locked(s, location->objectid, root);
3958 if (!inode) 3961 if (!inode)
@@ -3968,15 +3971,12 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3968 if (new) 3971 if (new)
3969 *new = 1; 3972 *new = 1;
3970 } else { 3973 } else {
3971 bad_inode = 1; 3974 unlock_new_inode(inode);
3975 iput(inode);
3976 inode = ERR_PTR(-ESTALE);
3972 } 3977 }
3973 } 3978 }
3974 3979
3975 if (bad_inode) {
3976 iput(inode);
3977 inode = ERR_PTR(-ESTALE);
3978 }
3979
3980 return inode; 3980 return inode;
3981} 3981}
3982 3982
@@ -4018,7 +4018,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4018 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); 4018 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4019 kfree(dentry->d_fsdata); 4019 kfree(dentry->d_fsdata);
4020 dentry->d_fsdata = NULL; 4020 dentry->d_fsdata = NULL;
4021 d_clear_need_lookup(dentry); 4021 /* This thing is hashed, drop it for now */
4022 d_drop(dentry);
4022 } else { 4023 } else {
4023 ret = btrfs_inode_by_name(dir, dentry, &location); 4024 ret = btrfs_inode_by_name(dir, dentry, &location);
4024 } 4025 }
@@ -4085,7 +4086,15 @@ static void btrfs_dentry_release(struct dentry *dentry)
4085static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4086static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4086 struct nameidata *nd) 4087 struct nameidata *nd)
4087{ 4088{
4088 return d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4089 struct dentry *ret;
4090
4091 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4092 if (unlikely(d_need_lookup(dentry))) {
4093 spin_lock(&dentry->d_lock);
4094 dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
4095 spin_unlock(&dentry->d_lock);
4096 }
4097 return ret;
4089} 4098}
4090 4099
4091unsigned char btrfs_filetype_table[] = { 4100unsigned char btrfs_filetype_table[] = {
@@ -4125,7 +4134,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4125 4134
4126 /* special case for "." */ 4135 /* special case for "." */
4127 if (filp->f_pos == 0) { 4136 if (filp->f_pos == 0) {
4128 over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR); 4137 over = filldir(dirent, ".", 1,
4138 filp->f_pos, btrfs_ino(inode), DT_DIR);
4129 if (over) 4139 if (over)
4130 return 0; 4140 return 0;
4131 filp->f_pos = 1; 4141 filp->f_pos = 1;
@@ -4134,7 +4144,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4134 if (filp->f_pos == 1) { 4144 if (filp->f_pos == 1) {
4135 u64 pino = parent_ino(filp->f_path.dentry); 4145 u64 pino = parent_ino(filp->f_path.dentry);
4136 over = filldir(dirent, "..", 2, 4146 over = filldir(dirent, "..", 2,
4137 2, pino, DT_DIR); 4147 filp->f_pos, pino, DT_DIR);
4138 if (over) 4148 if (over)
4139 return 0; 4149 return 0;
4140 filp->f_pos = 2; 4150 filp->f_pos = 2;
@@ -5823,7 +5833,7 @@ again:
5823 5833
5824 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5834 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5825 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5835 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5826 if (!ret) 5836 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5827 btrfs_update_inode(trans, root, inode); 5837 btrfs_update_inode(trans, root, inode);
5828 ret = 0; 5838 ret = 0;
5829out_unlock: 5839out_unlock:
@@ -7354,11 +7364,15 @@ static int btrfs_set_page_dirty(struct page *page)
7354static int btrfs_permission(struct inode *inode, int mask) 7364static int btrfs_permission(struct inode *inode, int mask)
7355{ 7365{
7356 struct btrfs_root *root = BTRFS_I(inode)->root; 7366 struct btrfs_root *root = BTRFS_I(inode)->root;
7367 umode_t mode = inode->i_mode;
7357 7368
7358 if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) 7369 if (mask & MAY_WRITE &&
7359 return -EROFS; 7370 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
7360 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7371 if (btrfs_root_readonly(root))
7361 return -EACCES; 7372 return -EROFS;
7373 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
7374 return -EACCES;
7375 }
7362 return generic_permission(inode, mask); 7376 return generic_permission(inode, mask);
7363} 7377}
7364 7378
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7cf013349941..dae5dfe41ba5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1047,7 +1047,16 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1047 if (!max_to_defrag) 1047 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1048 max_to_defrag = last_index - 1;
1049 1049
1050 while (i <= last_index && defrag_count < max_to_defrag) { 1050 /*
 1051 * make writeback start from i, so the defrag range can be
1052 * written sequentially.
1053 */
1054 if (i < inode->i_mapping->writeback_index)
1055 inode->i_mapping->writeback_index = i;
1056
1057 while (i <= last_index && defrag_count < max_to_defrag &&
1058 (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
1059 PAGE_CACHE_SHIFT)) {
1051 /* 1060 /*
1052 * make sure we stop running if someone unmounts 1061 * make sure we stop running if someone unmounts
1053 * the FS 1062 * the FS
@@ -2177,6 +2186,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2177 if (!(src_file->f_mode & FMODE_READ)) 2186 if (!(src_file->f_mode & FMODE_READ))
2178 goto out_fput; 2187 goto out_fput;
2179 2188
2189 /* don't make the dst file partly checksummed */
2190 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
2191 (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
2192 goto out_fput;
2193
2180 ret = -EISDIR; 2194 ret = -EISDIR;
2181 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 2195 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
2182 goto out_fput; 2196 goto out_fput;
@@ -2220,6 +2234,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2220 !IS_ALIGNED(destoff, bs)) 2234 !IS_ALIGNED(destoff, bs))
2221 goto out_unlock; 2235 goto out_unlock;
2222 2236
2237 if (destoff > inode->i_size) {
2238 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2239 if (ret)
2240 goto out_unlock;
2241 }
2242
2243 /* truncate page cache pages from target inode range */
2244 truncate_inode_pages_range(&inode->i_data, destoff,
2245 PAGE_CACHE_ALIGN(destoff + len) - 1);
2246
2223 /* do any pending delalloc/csum calc on src, one way or 2247 /* do any pending delalloc/csum calc on src, one way or
2224 another, and lock file content */ 2248 another, and lock file content */
2225 while (1) { 2249 while (1) {
@@ -2313,7 +2337,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2313 else 2337 else
2314 new_key.offset = destoff; 2338 new_key.offset = destoff;
2315 2339
2316 trans = btrfs_start_transaction(root, 1); 2340 /*
2341 * 1 - adjusting old extent (we may have to split it)
2342 * 1 - add new extent
2343 * 1 - inode update
2344 */
2345 trans = btrfs_start_transaction(root, 3);
2317 if (IS_ERR(trans)) { 2346 if (IS_ERR(trans)) {
2318 ret = PTR_ERR(trans); 2347 ret = PTR_ERR(trans);
2319 goto out; 2348 goto out;
@@ -2321,14 +2350,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2321 2350
2322 if (type == BTRFS_FILE_EXTENT_REG || 2351 if (type == BTRFS_FILE_EXTENT_REG ||
2323 type == BTRFS_FILE_EXTENT_PREALLOC) { 2352 type == BTRFS_FILE_EXTENT_PREALLOC) {
2353 /*
2354 * a | --- range to clone ---| b
2355 * | ------------- extent ------------- |
2356 */
2357
 2358 /* subtract range b */
2359 if (key.offset + datal > off + len)
2360 datal = off + len - key.offset;
2361
 2362 /* subtract range a */
2324 if (off > key.offset) { 2363 if (off > key.offset) {
2325 datao += off - key.offset; 2364 datao += off - key.offset;
2326 datal -= off - key.offset; 2365 datal -= off - key.offset;
2327 } 2366 }
2328 2367
2329 if (key.offset + datal > off + len)
2330 datal = off + len - key.offset;
2331
2332 ret = btrfs_drop_extents(trans, inode, 2368 ret = btrfs_drop_extents(trans, inode,
2333 new_key.offset, 2369 new_key.offset,
2334 new_key.offset + datal, 2370 new_key.offset + datal,
@@ -2425,7 +2461,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2425 if (endoff > inode->i_size) 2461 if (endoff > inode->i_size)
2426 btrfs_i_size_write(inode, endoff); 2462 btrfs_i_size_write(inode, endoff);
2427 2463
2428 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
2429 ret = btrfs_update_inode(trans, root, inode); 2464 ret = btrfs_update_inode(trans, root, inode);
2430 BUG_ON(ret); 2465 BUG_ON(ret);
2431 btrfs_end_transaction(trans, root); 2466 btrfs_end_transaction(trans, root);
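
The reorder in the clone path matters: the tail of the extent (range b) must be clipped against the original key.offset and datal before the head trim (range a) shifts them. As a standalone sketch (clip_to_clone_range is an illustrative name):

        #include <stdint.h>

        static void clip_to_clone_range(uint64_t key_offset, uint64_t off,
                                        uint64_t len, uint64_t *datao,
                                        uint64_t *datal)
        {
                /* range b: trim the part of the extent past the clone
                 * window, using the still-untrimmed length */
                if (key_offset + *datal > off + len)
                        *datal = off + len - key_offset;

                /* range a: trim the part before the clone window */
                if (off > key_offset) {
                        *datao += off - key_offset;
                        *datal -= off - key_offset;
                }
        }
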
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7dc36fab4afc..e24b7964a155 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -884,6 +884,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
884 struct btrfs_root *tree_root = fs_info->tree_root; 884 struct btrfs_root *tree_root = fs_info->tree_root;
885 struct btrfs_root *root = pending->root; 885 struct btrfs_root *root = pending->root;
886 struct btrfs_root *parent_root; 886 struct btrfs_root *parent_root;
887 struct btrfs_block_rsv *rsv;
887 struct inode *parent_inode; 888 struct inode *parent_inode;
888 struct dentry *parent; 889 struct dentry *parent;
889 struct dentry *dentry; 890 struct dentry *dentry;
@@ -895,6 +896,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
895 u64 objectid; 896 u64 objectid;
896 u64 root_flags; 897 u64 root_flags;
897 898
899 rsv = trans->block_rsv;
900
898 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 901 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
899 if (!new_root_item) { 902 if (!new_root_item) {
900 pending->error = -ENOMEM; 903 pending->error = -ENOMEM;
@@ -1002,6 +1005,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 btrfs_orphan_post_snapshot(trans, pending); 1005 btrfs_orphan_post_snapshot(trans, pending);
1003fail: 1006fail:
1004 kfree(new_root_item); 1007 kfree(new_root_item);
1008 trans->block_rsv = rsv;
1005 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); 1009 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1006 return 0; 1010 return 0;
1007} 1011}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index babee65f8eda..786639fca067 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -799,14 +799,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
799 struct extent_buffer *eb, int slot, 799 struct extent_buffer *eb, int slot,
800 struct btrfs_key *key) 800 struct btrfs_key *key)
801{ 801{
802 struct inode *dir;
803 int ret;
804 struct btrfs_inode_ref *ref; 802 struct btrfs_inode_ref *ref;
803 struct btrfs_dir_item *di;
804 struct inode *dir;
805 struct inode *inode; 805 struct inode *inode;
806 char *name;
807 int namelen;
808 unsigned long ref_ptr; 806 unsigned long ref_ptr;
809 unsigned long ref_end; 807 unsigned long ref_end;
808 char *name;
809 int namelen;
810 int ret;
810 int search_done = 0; 811 int search_done = 0;
811 812
812 /* 813 /*
@@ -909,6 +910,25 @@ again:
909 } 910 }
910 btrfs_release_path(path); 911 btrfs_release_path(path);
911 912
913 /* look for a conflicting sequence number */
914 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
915 btrfs_inode_ref_index(eb, ref),
916 name, namelen, 0);
917 if (di && !IS_ERR(di)) {
918 ret = drop_one_dir_item(trans, root, path, dir, di);
919 BUG_ON(ret);
920 }
921 btrfs_release_path(path);
922
 923 /* look for a conflicting name */
924 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
925 name, namelen, 0);
926 if (di && !IS_ERR(di)) {
927 ret = drop_one_dir_item(trans, root, path, dir, di);
928 BUG_ON(ret);
929 }
930 btrfs_release_path(path);
931
912insert: 932insert:
913 /* insert our name */ 933 /* insert our name */
914 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 934 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
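
Log replay now clears both kinds of directory conflicts, by index and by name, before re-adding the link. Conceptually (a userspace model with a flat table standing in for the dir item and dir index trees; names are illustrative):

        #include <string.h>

        struct dir_entry {
                unsigned long index;    /* directory sequence number */
                char name[64];
                int used;
        };

        /* Drop any live entry that clashes with the replayed ref on either
         * its sequence number or its name, mirroring the two
         * drop_one_dir_item() calls above. */
        static void drop_conflicts(struct dir_entry *table, int n,
                                   unsigned long index, const char *name)
        {
                int i;

                for (i = 0; i < n; i++) {
                        if (!table[i].used)
                                continue;
                        if (table[i].index == index ||
                            strcmp(table[i].name, name) == 0)
                                table[i].used = 0;
                }
        }
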
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 53875ae73ad4..f2a4cc79da61 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -142,6 +142,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
142 unsigned long limit; 142 unsigned long limit;
143 unsigned long last_waited = 0; 143 unsigned long last_waited = 0;
144 int force_reg = 0; 144 int force_reg = 0;
145 int sync_pending = 0;
145 struct blk_plug plug; 146 struct blk_plug plug;
146 147
147 /* 148 /*
@@ -229,6 +230,22 @@ loop_lock:
229 230
230 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 231 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
231 232
233 /*
234 * if we're doing the sync list, record that our
235 * plug has some sync requests on it
236 *
237 * If we're doing the regular list and there are
238 * sync requests sitting around, unplug before
239 * we add more
240 */
241 if (pending_bios == &device->pending_sync_bios) {
242 sync_pending = 1;
243 } else if (sync_pending) {
244 blk_finish_plug(&plug);
245 blk_start_plug(&plug);
246 sync_pending = 0;
247 }
248
232 submit_bio(cur->bi_rw, cur); 249 submit_bio(cur->bi_rw, cur);
233 num_run++; 250 num_run++;
234 batch_run++; 251 batch_run++;
@@ -500,6 +517,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
500 fs_devices->rw_devices--; 517 fs_devices->rw_devices--;
501 } 518 }
502 519
520 if (device->can_discard)
521 fs_devices->num_can_discard--;
522
503 new_device = kmalloc(sizeof(*new_device), GFP_NOFS); 523 new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
504 BUG_ON(!new_device); 524 BUG_ON(!new_device);
505 memcpy(new_device, device, sizeof(*new_device)); 525 memcpy(new_device, device, sizeof(*new_device));
@@ -508,6 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
508 new_device->bdev = NULL; 528 new_device->bdev = NULL;
509 new_device->writeable = 0; 529 new_device->writeable = 0;
510 new_device->in_fs_metadata = 0; 530 new_device->in_fs_metadata = 0;
531 new_device->can_discard = 0;
511 list_replace_rcu(&device->dev_list, &new_device->dev_list); 532 list_replace_rcu(&device->dev_list, &new_device->dev_list);
512 533
513 call_rcu(&device->rcu, free_device); 534 call_rcu(&device->rcu, free_device);
@@ -547,6 +568,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
547static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 568static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
548 fmode_t flags, void *holder) 569 fmode_t flags, void *holder)
549{ 570{
571 struct request_queue *q;
550 struct block_device *bdev; 572 struct block_device *bdev;
551 struct list_head *head = &fs_devices->devices; 573 struct list_head *head = &fs_devices->devices;
552 struct btrfs_device *device; 574 struct btrfs_device *device;
@@ -603,6 +625,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
603 seeding = 0; 625 seeding = 0;
604 } 626 }
605 627
628 q = bdev_get_queue(bdev);
629 if (blk_queue_discard(q)) {
630 device->can_discard = 1;
631 fs_devices->num_can_discard++;
632 }
633
606 device->bdev = bdev; 634 device->bdev = bdev;
607 device->in_fs_metadata = 0; 635 device->in_fs_metadata = 0;
608 device->mode = flags; 636 device->mode = flags;
@@ -835,6 +863,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
835 863
836 max_hole_start = search_start; 864 max_hole_start = search_start;
837 max_hole_size = 0; 865 max_hole_size = 0;
866 hole_size = 0;
838 867
839 if (search_start >= search_end) { 868 if (search_start >= search_end) {
840 ret = -ENOSPC; 869 ret = -ENOSPC;
@@ -917,7 +946,14 @@ next:
917 cond_resched(); 946 cond_resched();
918 } 947 }
919 948
920 hole_size = search_end- search_start; 949 /*
950 * At this point, search_start should be the end of
951 * allocated dev extents, and when shrinking the device,
952 * search_end may be smaller than search_start.
953 */
954 if (search_end > search_start)
955 hole_size = search_end - search_start;
956
921 if (hole_size > max_hole_size) { 957 if (hole_size > max_hole_size) {
922 max_hole_start = search_start; 958 max_hole_start = search_start;
923 max_hole_size = hole_size; 959 max_hole_size = hole_size;
@@ -1543,6 +1579,7 @@ error:
1543 1579
1544int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1580int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1545{ 1581{
1582 struct request_queue *q;
1546 struct btrfs_trans_handle *trans; 1583 struct btrfs_trans_handle *trans;
1547 struct btrfs_device *device; 1584 struct btrfs_device *device;
1548 struct block_device *bdev; 1585 struct block_device *bdev;
@@ -1612,6 +1649,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1612 1649
1613 lock_chunks(root); 1650 lock_chunks(root);
1614 1651
1652 q = bdev_get_queue(bdev);
1653 if (blk_queue_discard(q))
1654 device->can_discard = 1;
1615 device->writeable = 1; 1655 device->writeable = 1;
1616 device->work.func = pending_bios_fn; 1656 device->work.func = pending_bios_fn;
1617 generate_random_uuid(device->uuid); 1657 generate_random_uuid(device->uuid);
@@ -1647,6 +1687,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1647 root->fs_info->fs_devices->num_devices++; 1687 root->fs_info->fs_devices->num_devices++;
1648 root->fs_info->fs_devices->open_devices++; 1688 root->fs_info->fs_devices->open_devices++;
1649 root->fs_info->fs_devices->rw_devices++; 1689 root->fs_info->fs_devices->rw_devices++;
1690 if (device->can_discard)
1691 root->fs_info->fs_devices->num_can_discard++;
1650 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1651 1693
1652 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1694 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
@@ -2413,9 +2455,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2413 total_avail = device->total_bytes - device->bytes_used; 2455 total_avail = device->total_bytes - device->bytes_used;
2414 else 2456 else
2415 total_avail = 0; 2457 total_avail = 0;
2416 /* avail is off by max(alloc_start, 1MB), but that is the same 2458
2417 * for all devices, so it doesn't hurt the sorting later on 2459 /* If there is no space on this device, skip it. */
2418 */ 2460 if (total_avail == 0)
2461 continue;
2419 2462
2420 ret = find_free_dev_extent(trans, device, 2463 ret = find_free_dev_extent(trans, device,
2421 max_stripe_size * dev_stripes, 2464 max_stripe_size * dev_stripes,
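
The can_discard plumbing keeps a per-device flag and a filesystem-wide count in lockstep across open, close, and device add. The invariant is simple enough to state as a model (types and helpers here are illustrative):

        #include <stdbool.h>
        #include <stdint.h>

        struct bdevice {
                bool can_discard;
        };

        struct devlist {
                uint64_t num_can_discard;
        };

        static void device_opened(struct devlist *fs, struct bdevice *dev,
                                  bool queue_supports_discard)
        {
                if (queue_supports_discard) {   /* blk_queue_discard(q) in-kernel */
                        dev->can_discard = true;
                        fs->num_can_discard++;
                }
        }

        static void device_closed(struct devlist *fs, struct bdevice *dev)
        {
                if (dev->can_discard) {
                        fs->num_can_discard--;
                        dev->can_discard = false;
                }
        }
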
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7c12d61ae7ae..6d866db4e177 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -48,6 +48,7 @@ struct btrfs_device {
48 int writeable; 48 int writeable;
49 int in_fs_metadata; 49 int in_fs_metadata;
50 int missing; 50 int missing;
51 int can_discard;
51 52
52 spinlock_t io_lock; 53 spinlock_t io_lock;
53 54
@@ -104,6 +105,7 @@ struct btrfs_fs_devices {
104 u64 rw_devices; 105 u64 rw_devices;
105 u64 missing_devices; 106 u64 missing_devices;
106 u64 total_rw_bytes; 107 u64 total_rw_bytes;
108 u64 num_can_discard;
107 struct block_device *latest_bdev; 109 struct block_device *latest_bdev;
108 110
109 /* all of the devices in the FS, protected by a mutex 111 /* all of the devices in the FS, protected by a mutex
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index d733b9cfea34..426aa464f1af 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -116,6 +116,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
116 if (ret) 116 if (ret)
117 goto out; 117 goto out;
118 btrfs_release_path(path); 118 btrfs_release_path(path);
119
120 /*
121 * remove the attribute
122 */
123 if (!value)
124 goto out;
119 } 125 }
120 126
121again: 127again:
@@ -158,6 +164,9 @@ out:
158 return ret; 164 return ret;
159} 165}
160 166
167/*
 168 * @value: "" sets the attribute to an empty value, NULL removes it
169 */
161int __btrfs_setxattr(struct btrfs_trans_handle *trans, 170int __btrfs_setxattr(struct btrfs_trans_handle *trans,
162 struct inode *inode, const char *name, 171 struct inode *inode, const char *name,
163 const void *value, size_t size, int flags) 172 const void *value, size_t size, int flags)
@@ -374,36 +383,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
374 XATTR_REPLACE); 383 XATTR_REPLACE);
375} 384}
376 385
377int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 386int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
378 struct inode *inode, struct inode *dir, 387 void *fs_info)
379 const struct qstr *qstr)
380{ 388{
381 int err; 389 const struct xattr *xattr;
382 size_t len; 390 struct btrfs_trans_handle *trans = fs_info;
383 void *value;
384 char *suffix;
385 char *name; 391 char *name;
392 int err = 0;
386 393
387 err = security_inode_init_security(inode, dir, qstr, &suffix, &value, 394 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
388 &len); 395 name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
389 if (err) { 396 strlen(xattr->name) + 1, GFP_NOFS);
390 if (err == -EOPNOTSUPP) 397 if (!name) {
391 return 0; 398 err = -ENOMEM;
392 return err; 399 break;
393 } 400 }
394
395 name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
396 GFP_NOFS);
397 if (!name) {
398 err = -ENOMEM;
399 } else {
400 strcpy(name, XATTR_SECURITY_PREFIX); 401 strcpy(name, XATTR_SECURITY_PREFIX);
401 strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); 402 strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
402 err = __btrfs_setxattr(trans, inode, name, value, len, 0); 403 err = __btrfs_setxattr(trans, inode, name,
404 xattr->value, xattr->value_len, 0);
403 kfree(name); 405 kfree(name);
406 if (err < 0)
407 break;
404 } 408 }
405
406 kfree(suffix);
407 kfree(value);
408 return err; 409 return err;
409} 410}
411
412int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
413 struct inode *inode, struct inode *dir,
414 const struct qstr *qstr)
415{
416 return security_inode_init_security(inode, dir, qstr,
417 &btrfs_initxattrs, trans);
418}
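
The xattr rewrite converts btrfs to the callback-based security_inode_init_security(), which hands the filesystem a NULL-terminated array of xattrs instead of a single name/value pair. The iteration pattern, as a self-contained userspace sketch (init_xattrs and its setter callback are illustrative, not the kernel API):

        #include <errno.h>
        #include <stdlib.h>
        #include <string.h>

        #define SEC_PREFIX      "security."
        #define SEC_PREFIX_LEN  (sizeof(SEC_PREFIX) - 1)

        struct xattr_item {
                const char *name;       /* suffix, without the prefix */
                const void *value;
                size_t value_len;
        };

        static int init_xattrs(const struct xattr_item *array,
                               int (*set)(const char *name, const void *value,
                                          size_t len))
        {
                const struct xattr_item *x;
                int err = 0;

                for (x = array; x->name != NULL; x++) {
                        char *name = malloc(SEC_PREFIX_LEN +
                                            strlen(x->name) + 1);

                        if (!name) {
                                err = -ENOMEM;
                                break;
                        }
                        strcpy(name, SEC_PREFIX);
                        strcat(name, x->name);
                        err = set(name, x->value, x->value_len);
                        free(name);
                        if (err < 0)
                                break;  /* stop on the first failure */
                }
                return err;
        }
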
diff --git a/fs/buffer.c b/fs/buffer.c
index 1a80b048ade8..936d6035f6e2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1470,13 +1470,13 @@ static void discard_buffer(struct buffer_head * bh)
1470} 1470}
1471 1471
1472/** 1472/**
1473 * block_invalidatepage - invalidate part of all of a buffer-backed page 1473 * block_invalidatepage - invalidate part or all of a buffer-backed page
1474 * 1474 *
1475 * @page: the page which is affected 1475 * @page: the page which is affected
1476 * @offset: the index of the truncation point 1476 * @offset: the index of the truncation point
1477 * 1477 *
1478 * block_invalidatepage() is called when all or part of the page has become 1478 * block_invalidatepage() is called when all or part of the page has become
1479 * invalidatedby a truncate operation. 1479 * invalidated by a truncate operation.
1480 * 1480 *
1481 * block_invalidatepage() does not have to release all buffers, but it must 1481 * block_invalidatepage() does not have to release all buffers, but it must
1482 * ensure that no dirty buffer is left outside @offset and that no I/O 1482 * ensure that no dirty buffer is left outside @offset and that no I/O
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fee028b5332e..86c59e16ba74 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1595,7 +1595,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1595 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); 1595 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1596 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, 1596 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1597 *ppath); 1597 *ppath);
1598 } else if (rpath) { 1598 } else if (rpath || rino) {
1599 *ino = rino; 1599 *ino = rino;
1600 *ppath = rpath; 1600 *ppath = rpath;
1601 *pathlen = strlen(rpath); 1601 *pathlen = strlen(rpath);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d47c5ec7fb1f..88bacaf385d9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -813,8 +813,8 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
813 fsc = create_fs_client(fsopt, opt); 813 fsc = create_fs_client(fsopt, opt);
814 if (IS_ERR(fsc)) { 814 if (IS_ERR(fsc)) {
815 res = ERR_CAST(fsc); 815 res = ERR_CAST(fsc);
816 kfree(fsopt); 816 destroy_mount_options(fsopt);
817 kfree(opt); 817 ceph_destroy_options(opt);
818 goto out_final; 818 goto out_final;
819 } 819 }
820 820
diff --git a/fs/cifs/README b/fs/cifs/README
index c5c2c5e5f0f2..895da1dc1550 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -745,4 +745,18 @@ installed and something like the following lines should be added to the
745create cifs.spnego * * /usr/local/sbin/cifs.upcall %k 745create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
746create dns_resolver * * /usr/local/sbin/cifs.upcall %k 746create dns_resolver * * /usr/local/sbin/cifs.upcall %k
747 747
748CIFS kernel module parameters
749=============================
750These module parameters can be specified or modified either at module load
751time or at runtime by using the interface
752	/sys/module/cifs/parameters/<param>
753
754e.g. echo "value" > /sys/module/cifs/parameters/<param>
755
7561. echo_retries - The number of echo attempts before giving up and
757 reconnecting to the server. The default is 5. The value 0
758 means never reconnect.
759
7602. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default;
761 to enable use any of [Y/y/1], to disable use any of [N/n/0].
748 762
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 2fe3cf13b2e9..84e8c0724704 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -176,7 +176,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
176 176
177#ifdef CONFIG_CIFS_STATS2 177#ifdef CONFIG_CIFS_STATS2
178 seq_printf(m, " In Send: %d In MaxReq Wait: %d", 178 seq_printf(m, " In Send: %d In MaxReq Wait: %d",
179 atomic_read(&server->inSend), 179 atomic_read(&server->in_send),
180 atomic_read(&server->num_waiters)); 180 atomic_read(&server->num_waiters));
181#endif 181#endif
182 182
@@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = {
511 511
512static int cifs_oplock_proc_show(struct seq_file *m, void *v) 512static int cifs_oplock_proc_show(struct seq_file *m, void *v)
513{ 513{
514 seq_printf(m, "%d\n", oplockEnabled); 514 seq_printf(m, "%d\n", enable_oplocks);
515 return 0; 515 return 0;
516} 516}
517 517
@@ -526,13 +526,16 @@ static ssize_t cifs_oplock_proc_write(struct file *file,
526 char c; 526 char c;
527 int rc; 527 int rc;
528 528
529 printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface "
530 "will be removed in kernel version 3.4. Please migrate to "
531 "using the 'enable_oplocks' module parameter in cifs.ko.\n");
529 rc = get_user(c, buffer); 532 rc = get_user(c, buffer);
530 if (rc) 533 if (rc)
531 return rc; 534 return rc;
532 if (c == '0' || c == 'n' || c == 'N') 535 if (c == '0' || c == 'n' || c == 'N')
533 oplockEnabled = 0; 536 enable_oplocks = false;
534 else if (c == '1' || c == 'y' || c == 'Y') 537 else if (c == '1' || c == 'y' || c == 'Y')
535 oplockEnabled = 1; 538 enable_oplocks = true;
536 539
537 return count; 540 return count;
538} 541}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 7260e11e21f8..500d65859279 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -43,6 +43,8 @@
43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ 43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
44#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ 44#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */
45#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ 45#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
46#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
47#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
46 48
47struct cifs_sb_info { 49struct cifs_sb_info {
48 struct rb_root tlink_tree; 50 struct rb_root tlink_tree;
@@ -55,6 +57,8 @@ struct cifs_sb_info {
55 atomic_t active; 57 atomic_t active;
56 uid_t mnt_uid; 58 uid_t mnt_uid;
57 gid_t mnt_gid; 59 gid_t mnt_gid;
60 uid_t mnt_backupuid;
61 gid_t mnt_backupgid;
58 mode_t mnt_file_mode; 62 mode_t mnt_file_mode;
59 mode_t mnt_dir_mode; 63 mode_t mnt_dir_mode;
60 unsigned int mnt_cifs_flags; 64 unsigned int mnt_cifs_flags;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 21de1d6d5849..72ddf23ef6f7 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
91 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); 91 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
92 spin_unlock(&sidgidlock); 92 spin_unlock(&sidgidlock);
93 93
94 root = &siduidtree;
95 spin_lock(&uidsidlock);
96 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
97 spin_unlock(&uidsidlock);
98
99 root = &sidgidtree;
100 spin_lock(&gidsidlock);
101 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
102 spin_unlock(&gidsidlock);
103
94 return nr_rem; 104 return nr_rem;
95} 105}
96 106
107static void
108sid_rb_insert(struct rb_root *root, unsigned long cid,
109 struct cifs_sid_id **psidid, char *typestr)
110{
111 char *strptr;
112 struct rb_node *node = root->rb_node;
113 struct rb_node *parent = NULL;
114 struct rb_node **linkto = &(root->rb_node);
115 struct cifs_sid_id *lsidid;
116
117 while (node) {
118 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
119 parent = node;
120 if (cid > lsidid->id) {
121 linkto = &(node->rb_left);
122 node = node->rb_left;
123 }
124 if (cid < lsidid->id) {
125 linkto = &(node->rb_right);
126 node = node->rb_right;
127 }
128 }
129
130 (*psidid)->id = cid;
131 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
132 (*psidid)->refcount = 0;
133
134 sprintf((*psidid)->sidstr, "%s", typestr);
135 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
136 sprintf(strptr, "%ld", cid);
137
138 clear_bit(SID_ID_PENDING, &(*psidid)->state);
139 clear_bit(SID_ID_MAPPED, &(*psidid)->state);
140
141 rb_link_node(&(*psidid)->rbnode, parent, linkto);
142 rb_insert_color(&(*psidid)->rbnode, root);
143}
144
145static struct cifs_sid_id *
146sid_rb_search(struct rb_root *root, unsigned long cid)
147{
148 struct rb_node *node = root->rb_node;
149 struct cifs_sid_id *lsidid;
150
151 while (node) {
152 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
153 if (cid > lsidid->id)
154 node = node->rb_left;
155 else if (cid < lsidid->id)
156 node = node->rb_right;
157 else /* node found */
158 return lsidid;
159 }
160
161 return NULL;
162}
163
97static struct shrinker cifs_shrinker = { 164static struct shrinker cifs_shrinker = {
98 .shrink = cifs_idmap_shrinker, 165 .shrink = cifs_idmap_shrinker,
99 .seeks = DEFAULT_SEEKS, 166 .seeks = DEFAULT_SEEKS,
@@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen)
110 177
111 memcpy(payload, data, datalen); 178 memcpy(payload, data, datalen);
112 key->payload.data = payload; 179 key->payload.data = payload;
180 key->datalen = datalen;
113 return 0; 181 return 0;
114} 182}
115 183
@@ -224,6 +292,120 @@ sidid_pending_wait(void *unused)
224} 292}
225 293
226static int 294static int
295id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid)
296{
297 int rc = 0;
298 struct key *sidkey;
299 const struct cred *saved_cred;
300 struct cifs_sid *lsid;
301 struct cifs_sid_id *psidid, *npsidid;
302 struct rb_root *cidtree;
303 spinlock_t *cidlock;
304
305 if (sidtype == SIDOWNER) {
306 cidlock = &siduidlock;
307 cidtree = &uidtree;
308 } else if (sidtype == SIDGROUP) {
309 cidlock = &sidgidlock;
310 cidtree = &gidtree;
311 } else
312 return -EINVAL;
313
314 spin_lock(cidlock);
315 psidid = sid_rb_search(cidtree, cid);
316
317 if (!psidid) { /* node does not exist, allocate one & attempt adding */
318 spin_unlock(cidlock);
319 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
320 if (!npsidid)
321 return -ENOMEM;
322
323 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
324 if (!npsidid->sidstr) {
325 kfree(npsidid);
326 return -ENOMEM;
327 }
328
329 spin_lock(cidlock);
330 psidid = sid_rb_search(cidtree, cid);
331 if (psidid) { /* node happened to get inserted meanwhile */
332 ++psidid->refcount;
333 spin_unlock(cidlock);
334 kfree(npsidid->sidstr);
335 kfree(npsidid);
336 } else {
337 psidid = npsidid;
338 sid_rb_insert(cidtree, cid, &psidid,
339 sidtype == SIDOWNER ? "oi:" : "gi:");
340 ++psidid->refcount;
341 spin_unlock(cidlock);
342 }
343 } else {
344 ++psidid->refcount;
345 spin_unlock(cidlock);
346 }
347
348 /*
349 * If we are here, it is safe to access psidid and its fields
350 * since a reference was taken earlier while holding the spinlock.
351 * A reference on the node is put without holding the spinlock
 352 * and it is OK to do so in this case; the shrinker will not erase
 353 * this node until all references are put, and we do not access
 354 * any fields of the node after a reference is put.
355 */
356 if (test_bit(SID_ID_MAPPED, &psidid->state)) {
357 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
358 psidid->time = jiffies; /* update ts for accessing */
359 goto id_sid_out;
360 }
361
362 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) {
363 rc = -EINVAL;
364 goto id_sid_out;
365 }
366
367 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) {
368 saved_cred = override_creds(root_cred);
369 sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, "");
370 if (IS_ERR(sidkey)) {
371 rc = -EINVAL;
 372 cFYI(1, "%s: Can't map an id to a SID", __func__);
373 } else {
374 lsid = (struct cifs_sid *)sidkey->payload.data;
375 memcpy(&psidid->sid, lsid,
376 sidkey->datalen < sizeof(struct cifs_sid) ?
377 sidkey->datalen : sizeof(struct cifs_sid));
378 memcpy(ssid, &psidid->sid,
379 sidkey->datalen < sizeof(struct cifs_sid) ?
380 sidkey->datalen : sizeof(struct cifs_sid));
381 set_bit(SID_ID_MAPPED, &psidid->state);
382 key_put(sidkey);
383 kfree(psidid->sidstr);
384 }
385 psidid->time = jiffies; /* update ts for accessing */
386 revert_creds(saved_cred);
387 clear_bit(SID_ID_PENDING, &psidid->state);
388 wake_up_bit(&psidid->state, SID_ID_PENDING);
389 } else {
390 rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
391 sidid_pending_wait, TASK_INTERRUPTIBLE);
392 if (rc) {
393 cFYI(1, "%s: sidid_pending_wait interrupted %d",
394 __func__, rc);
395 --psidid->refcount;
396 return rc;
397 }
398 if (test_bit(SID_ID_MAPPED, &psidid->state))
399 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
400 else
401 rc = -EINVAL;
402 }
403id_sid_out:
404 --psidid->refcount;
405 return rc;
406}
407
408static int
227sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, 409sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
228 struct cifs_fattr *fattr, uint sidtype) 410 struct cifs_fattr *fattr, uint sidtype)
229{ 411{
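
id_to_sid() above follows the classic optimistic-insert pattern: search under the lock, drop the lock to allocate, then re-search before inserting in case another thread won the race. Stripped of the SID specifics, the pattern looks like this (a userspace sketch with pthreads and a linked list standing in for the rb-tree; all names are illustrative):

        #include <pthread.h>
        #include <stdlib.h>

        struct id_node {
                unsigned long id;
                int refcount;
                struct id_node *next;
        };

        static pthread_mutex_t maplock = PTHREAD_MUTEX_INITIALIZER;
        static struct id_node *map_head;

        static struct id_node *map_search(unsigned long id)
        {
                struct id_node *n;

                for (n = map_head; n; n = n->next)
                        if (n->id == id)
                                return n;
                return NULL;
        }

        static struct id_node *map_lookup_or_insert(unsigned long id)
        {
                struct id_node *n, *fresh;

                pthread_mutex_lock(&maplock);
                n = map_search(id);
                if (n) {
                        n->refcount++;
                        pthread_mutex_unlock(&maplock);
                        return n;
                }
                pthread_mutex_unlock(&maplock);

                fresh = calloc(1, sizeof(*fresh));      /* allocate unlocked */
                if (!fresh)
                        return NULL;

                pthread_mutex_lock(&maplock);
                n = map_search(id);                     /* lost the race? */
                if (n) {
                        n->refcount++;
                        pthread_mutex_unlock(&maplock);
                        free(fresh);
                        return n;
                }
                fresh->id = id;
                fresh->refcount = 1;
                fresh->next = map_head;
                map_head = fresh;
                pthread_mutex_unlock(&maplock);
                return fresh;
        }
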
@@ -383,6 +565,10 @@ init_cifs_idmap(void)
383 spin_lock_init(&sidgidlock); 565 spin_lock_init(&sidgidlock);
384 gidtree = RB_ROOT; 566 gidtree = RB_ROOT;
385 567
568 spin_lock_init(&uidsidlock);
569 siduidtree = RB_ROOT;
570 spin_lock_init(&gidsidlock);
571 sidgidtree = RB_ROOT;
386 register_shrinker(&cifs_shrinker); 572 register_shrinker(&cifs_shrinker);
387 573
388 cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); 574 cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring));
@@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void)
422 while ((node = rb_first(root))) 608 while ((node = rb_first(root)))
423 rb_erase(node, root); 609 rb_erase(node, root);
424 spin_unlock(&sidgidlock); 610 spin_unlock(&sidgidlock);
611
612 root = &siduidtree;
613 spin_lock(&uidsidlock);
614 while ((node = rb_first(root)))
615 rb_erase(node, root);
616 spin_unlock(&uidsidlock);
617
618 root = &sidgidtree;
619 spin_lock(&gidsidlock);
620 while ((node = rb_first(root)))
621 rb_erase(node, root);
622 spin_unlock(&gidsidlock);
425} 623}
426 624
427/* if the two SIDs (roughly equivalent to a UUID for a user or group) are 625/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
@@ -706,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
706 acl_size = sizeof(struct cifs_acl); 904 acl_size = sizeof(struct cifs_acl);
707 905
708 num_aces = le32_to_cpu(pdacl->num_aces); 906 num_aces = le32_to_cpu(pdacl->num_aces);
709 if (num_aces > 0) { 907 if (num_aces > 0) {
710 umode_t user_mask = S_IRWXU; 908 umode_t user_mask = S_IRWXU;
711 umode_t group_mask = S_IRWXG; 909 umode_t group_mask = S_IRWXG;
712 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; 910 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
@@ -868,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
868 else 1066 else
869 cFYI(1, "no ACL"); /* BB grant all or default perms? */ 1067 cFYI(1, "no ACL"); /* BB grant all or default perms? */
870 1068
871/* cifscred->uid = owner_sid_ptr->rid;
872 cifscred->gid = group_sid_ptr->rid;
873 memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr,
874 sizeof(struct cifs_sid));
875 memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
876 sizeof(struct cifs_sid)); */
877
878 return rc; 1069 return rc;
879} 1070}
880 1071
881
882/* Convert permission bits from mode to equivalent CIFS ACL */ 1072/* Convert permission bits from mode to equivalent CIFS ACL */
 static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
-	struct inode *inode, __u64 nmode)
+	__u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag)
 {
 	int rc = 0;
 	__u32 dacloffset;
 	__u32 ndacloffset;
 	__u32 sidsoffset;
 	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
+	struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
 	struct cifs_acl *dacl_ptr = NULL;  /* no need for SACL ptr */
 	struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
 
-	if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL))
-		return -EIO;
-
-	owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
-				le32_to_cpu(pntsd->osidoffset));
-	group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
-				le32_to_cpu(pntsd->gsidoffset));
-
-	dacloffset = le32_to_cpu(pntsd->dacloffset);
-	dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
-
-	ndacloffset = sizeof(struct cifs_ntsd);
-	ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
-	ndacl_ptr->revision = dacl_ptr->revision;
-	ndacl_ptr->size = 0;
-	ndacl_ptr->num_aces = 0;
-
-	rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode);
-
-	sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
-
-	/* copy security descriptor control portion and owner and group sid */
-	copy_sec_desc(pntsd, pnntsd, sidsoffset);
+	if (nmode != NO_CHANGE_64) { /* chmod */
+		owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->osidoffset));
+		group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->gsidoffset));
+		dacloffset = le32_to_cpu(pntsd->dacloffset);
+		dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+		ndacloffset = sizeof(struct cifs_ntsd);
+		ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
+		ndacl_ptr->revision = dacl_ptr->revision;
+		ndacl_ptr->size = 0;
+		ndacl_ptr->num_aces = 0;
+
+		rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr,
+					nmode);
+		sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
+		/* copy sec desc control portion & owner and group sids */
+		copy_sec_desc(pntsd, pnntsd, sidsoffset);
+		*aclflag = CIFS_ACL_DACL;
+	} else {
+		memcpy(pnntsd, pntsd, secdesclen);
+		if (uid != NO_CHANGE_32) { /* chown */
+			owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+					le32_to_cpu(pnntsd->osidoffset));
+			nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+								GFP_KERNEL);
+			if (!nowner_sid_ptr)
+				return -ENOMEM;
+			rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr);
+			if (rc) {
+				cFYI(1, "%s: Mapping error %d for owner id %d",
+						__func__, rc, uid);
+				kfree(nowner_sid_ptr);
+				return rc;
+			}
+			memcpy(owner_sid_ptr, nowner_sid_ptr,
+					sizeof(struct cifs_sid));
+			kfree(nowner_sid_ptr);
+			*aclflag = CIFS_ACL_OWNER;
+		}
+		if (gid != NO_CHANGE_32) { /* chgrp */
+			group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+					le32_to_cpu(pnntsd->gsidoffset));
+			ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+								GFP_KERNEL);
+			if (!ngroup_sid_ptr)
+				return -ENOMEM;
+			rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr);
+			if (rc) {
+				cFYI(1, "%s: Mapping error %d for group id %d",
+						__func__, rc, gid);
+				kfree(ngroup_sid_ptr);
+				return rc;
+			}
+			memcpy(group_sid_ptr, ngroup_sid_ptr,
+					sizeof(struct cifs_sid));
+			kfree(ngroup_sid_ptr);
+			*aclflag = CIFS_ACL_GROUP;
+		}
+	}
 
 	return rc;
 }
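
The rewritten build_sec_desc() now serves three attribute changes from one entry point, selected by sentinel arguments. A minimal caller sketch (illustrative, not from this patch) of how the sentinels pick a branch:

	/* chmod: mode given, owner/group left alone */
	rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode,
			    NO_CHANGE_32, NO_CHANGE_32, &aclflag);

	/* chown: mode left alone, new owner uid given */
	rc = build_sec_desc(pntsd, pnntsd, secdesclen, NO_CHANGE_64,
			    uid, NO_CHANGE_32, &aclflag);

In the first case aclflag comes back as CIFS_ACL_DACL, in the second as CIFS_ACL_OWNER, which set_cifs_acl() later maps to the access rights it requests on the open.
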
@@ -945,7 +1173,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 {
 	struct cifs_ntsd *pntsd = NULL;
 	int oplock = 0;
-	int xid, rc;
+	int xid, rc, create_options = 0;
 	__u16 fid;
 	struct cifs_tcon *tcon;
 	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -956,9 +1184,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 	tcon = tlink_tcon(tlink);
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
-			 &fid, &oplock, NULL, cifs_sb->local_nls,
-			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL,
+			create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (!rc) {
 		rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
 		CIFSSMBClose(xid, tcon, fid);
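
CREATE_OPEN_BACKUP_INTENT is only requested when backup_cred() approves the current task. The helper's body is not part of this section; a sketch consistent with the backupuid/backupgid mount flags added elsewhere in this series (field names assumed):

	/* Illustrative only: grant backup intent iff the mount nominated
	 * a backup uid the caller matches, or a backup gid it belongs to. */
	bool backup_cred(struct cifs_sb_info *cifs_sb)
	{
		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) &&
		    cifs_sb->mnt_backupuid == current_fsuid())
			return true;
		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) &&
		    in_group_p(cifs_sb->mnt_backupgid))
			return true;
		return false;
	}
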
@@ -991,31 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 	return pntsd;
 }
 
-static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
-		struct cifs_ntsd *pnntsd, u32 acllen)
-{
-	int xid, rc;
-	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
-
-	if (IS_ERR(tlink))
-		return PTR_ERR(tlink);
-
-	xid = GetXid();
-	rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
-	FreeXid(xid);
-	cifs_put_tlink(tlink);
-
-	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
-	return rc;
-}
-
-static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
-		struct cifs_ntsd *pnntsd, u32 acllen)
+/* Set an ACL on the server */
+int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+			struct inode *inode, const char *path, int aclflag)
 {
 	int oplock = 0;
-	int xid, rc;
+	int xid, rc, access_flags, create_options = 0;
 	__u16 fid;
 	struct cifs_tcon *tcon;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
 
 	if (IS_ERR(tlink))
@@ -1024,15 +1239,23 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 	tcon = tlink_tcon(tlink);
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
-			 &fid, &oplock, NULL, cifs_sb->local_nls,
-			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
+		access_flags = WRITE_OWNER;
+	else
+		access_flags = WRITE_DAC;
+
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags,
+			create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
 		cERROR(1, "Unable to open file to set ACL");
 		goto out;
 	}
 
-	rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
+	rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag);
 	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 
 	CIFSSMBClose(xid, tcon, fid);
@@ -1042,25 +1265,6 @@ out:
 	return rc;
 }
 
-/* Set an ACL on the server */
-int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
-				struct inode *inode, const char *path)
-{
-	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsFileInfo *open_file;
-	int rc;
-
-	cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
-
-	open_file = find_readable_file(CIFS_I(inode), true);
-	if (!open_file)
-		return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
-
-	rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
-	cifsFileInfo_put(open_file);
-	return rc;
-}
-
 /* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */
 int
 cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
@@ -1092,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 }
 
 /* Convert mode bits to an ACL so we can update the ACL on the server */
-int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
+int
+id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
+			uid_t uid, gid_t gid)
 {
 	int rc = 0;
+	int aclflag = CIFS_ACL_DACL; /* default flag to set */
 	__u32 secdesclen = 0;
 	struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
 	struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
@@ -1124,13 +1331,15 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
 		return -ENOMEM;
 	}
 
-	rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
+	rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
+				&aclflag);
 
 	cFYI(DBG2, "build_sec_desc rc: %d", rc);
 
 	if (!rc) {
 		/* Set the security descriptor */
-		rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
+		rc = set_cifs_acl(pnntsd, secdesclen, inode,
+					path, aclflag);
 		cFYI(DBG2, "set_cifs_acl rc: %d", rc);
 	}
 
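
A hedged sketch of the intended call site, modeled on a setattr path with the cifsacl mount option (attrs and full_path are placeholder names):

	if (attrs->ia_valid & ATTR_MODE)		/* chmod */
		rc = id_mode_to_cifs_acl(inode, full_path, attrs->ia_mode,
					 NO_CHANGE_32, NO_CHANGE_32);
	else if (attrs->ia_valid & ATTR_UID)		/* chown */
		rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
					 attrs->ia_uid, NO_CHANGE_32);
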
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e76bfeb68267..2cfb695d1f89 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -37,83 +37,8 @@
  * the sequence number before this function is called. Also, this function
  * should be called with the server->srv_mutex held.
  */
-static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
-				struct TCP_Server_Info *server, char *signature)
-{
-	int rc;
-
-	if (cifs_pdu == NULL || signature == NULL || server == NULL)
-		return -EINVAL;
-
-	if (!server->secmech.sdescmd5) {
-		cERROR(1, "%s: Can't generate signature\n", __func__);
-		return -1;
-	}
-
-	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
-	if (rc) {
-		cERROR(1, "%s: Could not init md5\n", __func__);
-		return rc;
-	}
-
-	rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
-		server->session_key.response, server->session_key.len);
-	if (rc) {
-		cERROR(1, "%s: Could not update with response\n", __func__);
-		return rc;
-	}
-
-	rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
-		cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length));
-	if (rc) {
-		cERROR(1, "%s: Could not update with payload\n", __func__);
-		return rc;
-	}
-
-	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
-	if (rc)
-		cERROR(1, "%s: Could not generate md5 hash\n", __func__);
-
-	return rc;
-}
-
-/* must be called with server->srv_mutex held */
-int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
-		  __u32 *pexpected_response_sequence_number)
-{
-	int rc = 0;
-	char smb_signature[20];
-
-	if ((cifs_pdu == NULL) || (server == NULL))
-		return -EINVAL;
-
-	if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
-	    server->tcpStatus == CifsNeedNegotiate)
-		return rc;
-
-	if (!server->session_estab) {
-		strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
-		return rc;
-	}
-
-	cifs_pdu->Signature.Sequence.SequenceNumber =
-			cpu_to_le32(server->sequence_number);
-	cifs_pdu->Signature.Sequence.Reserved = 0;
-
-	*pexpected_response_sequence_number = server->sequence_number++;
-	server->sequence_number++;
-
-	rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
-	if (rc)
-		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
-	else
-		memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8);
-
-	return rc;
-}
-
-static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
+static int cifs_calc_signature(const struct kvec *iov, int n_vec,
 				struct TCP_Server_Info *server, char *signature)
 {
 	int i;
 	int rc;
@@ -179,7 +104,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 {
 	int rc = 0;
 	char smb_signature[20];
-	struct smb_hdr *cifs_pdu = iov[0].iov_base;
+	struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base;
 
 	if ((cifs_pdu == NULL) || (server == NULL))
 		return -EINVAL;
@@ -189,7 +114,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 		return rc;
 
 	if (!server->session_estab) {
-		strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
+		memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
 		return rc;
 	}
 
@@ -200,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 	*pexpected_response_sequence_number = server->sequence_number++;
 	server->sequence_number++;
 
-	rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
+	rc = cifs_calc_signature(iov, n_vec, server, smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
 	else
@@ -209,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 	return rc;
 }
 
-int cifs_verify_signature(struct smb_hdr *cifs_pdu,
+/* must be called with server->srv_mutex held */
+int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
+		  __u32 *pexpected_response_sequence_number)
+{
+	struct kvec iov;
+
+	iov.iov_base = cifs_pdu;
+	iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4;
+
+	return cifs_sign_smb2(&iov, 1, server,
+			      pexpected_response_sequence_number);
+}
+
+int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
 			  struct TCP_Server_Info *server,
 			  __u32 expected_sequence_number)
 {
 	unsigned int rc;
 	char server_response_sig[8];
 	char what_we_think_sig_should_be[20];
+	struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base;
 
 	if (cifs_pdu == NULL || server == NULL)
 		return -EINVAL;
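
Folding cifs_sign_smb() into a one-kvec wrapper also means multi-buffer requests can be signed without first coalescing them. A hypothetical two-vector call (header_buf, header_len, data, and data_len are placeholder names):

	struct kvec iov[2];

	iov[0].iov_base = header_buf;	/* SMB header + fixed part */
	iov[0].iov_len  = header_len;
	iov[1].iov_base = data;		/* payload kept in its own buffer */
	iov[1].iov_len  = data_len;
	rc = cifs_sign_smb2(iov, 2, server, &expected_seq);

The wrapper's iov_len of smb_buf_length + 4 reflects that smb_buf_length excludes the 4-byte RFC1002 length preamble; the kvec is expected to span the frame exactly as it sits in the send buffer.
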
@@ -247,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 	cifs_pdu->Signature.Sequence.Reserved = 0;
 
 	mutex_lock(&server->srv_mutex);
-	rc = cifs_calculate_signature(cifs_pdu, server,
-		what_we_think_sig_should_be);
+	rc = cifs_calc_signature(iov, nr_iov, server,
+				 what_we_think_sig_should_be);
 	mutex_unlock(&server->srv_mutex);
 
 	if (rc)
@@ -351,9 +290,7 @@ static int
 build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
 {
 	unsigned int dlen;
-	unsigned int wlen;
-	unsigned int size = 6 * sizeof(struct ntlmssp2_name);
-	__le64 curtime;
+	unsigned int size = 2 * sizeof(struct ntlmssp2_name);
 	char *defdmname = "WORKGROUP";
 	unsigned char *blobptr;
 	struct ntlmssp2_name *attrptr;
@@ -365,15 +302,14 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	}
 
 	dlen = strlen(ses->domainName);
-	wlen = strlen(ses->server->hostname);
 
-	/* The length of this blob is a size which is
-	 * six times the size of a structure which holds name/size +
-	 * two times the unicode length of a domain name +
-	 * two times the unicode length of a server name +
-	 * size of a timestamp (which is 8 bytes).
+	/*
+	 * The length of this blob is two times the size of a
+	 * structure (av pair) which holds name/size
+	 * ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) +
+	 * unicode length of a netbios domain name
 	 */
-	ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
+	ses->auth_key.len = size + 2 * dlen;
 	ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
 	if (!ses->auth_key.response) {
 		ses->auth_key.len = 0;
@@ -384,44 +320,15 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	blobptr = ses->auth_key.response;
 	attrptr = (struct ntlmssp2_name *) blobptr;
 
+	/*
+	 * As defined in MS-NTLM 3.3.2, just this av pair field
+	 * is sufficient as part of the temp
+	 */
 	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
 	attrptr->length = cpu_to_le16(2 * dlen);
 	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
 	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
 
-	blobptr += 2 * dlen;
-	attrptr = (struct ntlmssp2_name *) blobptr;
-
-	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
-	attrptr->length = cpu_to_le16(2 * wlen);
-	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
-	cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
-
-	blobptr += 2 * wlen;
-	attrptr = (struct ntlmssp2_name *) blobptr;
-
-	attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
-	attrptr->length = cpu_to_le16(2 * dlen);
-	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
-	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
-
-	blobptr += 2 * dlen;
-	attrptr = (struct ntlmssp2_name *) blobptr;
-
-	attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
-	attrptr->length = cpu_to_le16(2 * wlen);
-	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
-	cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
-
-	blobptr += 2 * wlen;
-	attrptr = (struct ntlmssp2_name *) blobptr;
-
-	attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
-	attrptr->length = cpu_to_le16(sizeof(__le64));
-	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
-	curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	memcpy(blobptr, &curtime, sizeof(__le64));
-
 	return 0;
 }
 
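
Assuming struct ntlmssp2_name is the 4-byte av pair header (two __le16 fields), the blob the trimmed function now produces is:

	/*
	 * offset 0          : { type = NTLMSSP_AV_NB_DOMAIN_NAME, len = 2*dlen }
	 * offset 4          : UCS-2 netbios domain name (2*dlen bytes)
	 * offset 4 + 2*dlen : { type = NTLMSSP_AV_EOL, len = 0 }
	 *
	 * total = 2 * sizeof(struct ntlmssp2_name) + 2 * dlen
	 */

The terminating NTLMSSP_AV_EOL pair is never written explicitly: since NTLMSSP_AV_EOL is type 0, it is simply the zeroed tail left by kzalloc().
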
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f93eb948d071..8f1fe324162b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -53,7 +53,7 @@
 int cifsFYI = 0;
 int cifsERROR = 1;
 int traceSMB = 0;
-unsigned int oplockEnabled = 1;
+bool enable_oplocks = true;
 unsigned int linuxExtEnabled = 1;
 unsigned int lookupCacheEnabled = 1;
 unsigned int multiuser_mount = 0;
@@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0);
 MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
 				 "Range: 2 to 256");
 unsigned int cifs_max_pending = CIFS_MAX_REQ;
-module_param(cifs_max_pending, int, 0);
+module_param(cifs_max_pending, int, 0444);
 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
 				   "Default: 50 Range: 2 to 256");
 unsigned short echo_retries = 5;
@@ -82,6 +82,10 @@ module_param(echo_retries, ushort, 0644);
 MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
 			       "reconnecting server. Default: 5. 0 means "
 			       "never reconnect.");
+module_param(enable_oplocks, bool, 0644);
+MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"
+				 "y/Y/1");
+
 extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
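
A hedged sketch of how open paths are expected to consume the new global (the oplock request flags are the ones cifs uses elsewhere; the exact call sites are outside this section):

	if (enable_oplocks)
		oplock = REQ_OPLOCK;	/* ask the server for an oplock */
	else
		oplock = 0;		/* runtime switch: never request one */
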
@@ -132,12 +136,12 @@ cifs_read_super(struct super_block *sb)
 	else
 		sb->s_d_op = &cifs_dentry_ops;
 
-#ifdef CIFS_NFSD_EXPORT
+#ifdef CONFIG_CIFS_NFSD_EXPORT
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 		cFYI(1, "export ops supported");
 		sb->s_export_op = &cifs_export_ops;
 	}
-#endif /* CIFS_NFSD_EXPORT */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
 
 	return 0;
 
@@ -432,6 +436,12 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 		seq_printf(s, ",mfsymlinks");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
 		seq_printf(s, ",fsc");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
+		seq_printf(s, ",nostrictsync");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
+		seq_printf(s, ",noperm");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+		seq_printf(s, ",strictcache");
 
 	seq_printf(s, ",rsize=%d", cifs_sb->rsize);
 	seq_printf(s, ",wsize=%d", cifs_sb->wsize);
@@ -530,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
 	char *full_path = NULL;
 	char *s, *p;
 	char sep;
-	int xid;
 
 	full_path = cifs_build_path_to_root(vol, cifs_sb,
 					    cifs_sb_master_tcon(cifs_sb));
@@ -539,7 +548,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
 
 	cFYI(1, "Get root dentry for %s", full_path);
 
-	xid = GetXid();
 	sep = CIFS_DIR_SEP(cifs_sb);
 	dentry = dget(sb->s_root);
 	p = s = full_path;
@@ -548,6 +556,12 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
 		struct inode *dir = dentry->d_inode;
 		struct dentry *child;
 
+		if (!dir) {
+			dput(dentry);
+			dentry = ERR_PTR(-ENOENT);
+			break;
+		}
+
 		/* skip separators */
 		while (*s == sep)
 			s++;
@@ -563,12 +577,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
 		mutex_unlock(&dir->i_mutex);
 		dput(dentry);
 		dentry = child;
-		if (!dentry->d_inode) {
-			dput(dentry);
-			dentry = ERR_PTR(-ENOENT);
-		}
 	} while (!IS_ERR(dentry));
-	_FreeXid(xid);
 	kfree(full_path);
 	return dentry;
 }
@@ -721,7 +730,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 		if (rc < 0)
 			return (loff_t)rc;
 	}
-	return generic_file_llseek_unlocked(file, offset, origin);
+	return generic_file_llseek(file, offset, origin);
 }
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
@@ -940,7 +949,8 @@ cifs_init_once(void *inode)
 	struct cifsInodeInfo *cifsi = inode;
 
 	inode_init_once(&cifsi->vfs_inode);
-	INIT_LIST_HEAD(&cifsi->lockList);
+	INIT_LIST_HEAD(&cifsi->llist);
+	mutex_init(&cifsi->lock_mutex);
 }
 
 static int
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index cb71dc1f94d1..d9dbaf869cd1 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -121,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
 extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 
-#ifdef CIFS_NFSD_EXPORT
+#ifdef CONFIG_CIFS_NFSD_EXPORT
 extern const struct export_operations cifs_export_ops;
-#endif /* CIFS_NFSD_EXPORT */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "1.74"
+#define CIFS_VERSION   "1.75"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 38ce6d44b145..8238aa13e01c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -167,6 +167,8 @@ struct smb_vol {
 	uid_t cred_uid;
 	uid_t linux_uid;
 	gid_t linux_gid;
+	uid_t backupuid;
+	gid_t backupgid;
 	mode_t file_mode;
 	mode_t dir_mode;
 	unsigned secFlg;
@@ -179,6 +181,8 @@ struct smb_vol {
 	bool noperm:1;
 	bool no_psx_acl:1; /* set if posix acl support should be disabled */
 	bool cifs_acl:1;
+	bool backupuid_specified; /* mount option backupuid is specified */
+	bool backupgid_specified; /* mount option backupgid is specified */
 	bool no_xattr:1;   /* set if xattr (EA) support should be disabled*/
 	bool server_ino:1; /* use inode numbers from server ie UniqueId */
 	bool direct_io:1;
@@ -219,7 +223,8 @@ struct smb_vol {
 			 CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
 			 CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \
 			 CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
-			 CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO)
+			 CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
+			 CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID)
 
 #define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
 		      MS_NODEV | MS_SYNCHRONOUS)
@@ -286,12 +291,18 @@ struct TCP_Server_Info {
 	bool sec_kerberosu2u;		/* supports U2U Kerberos */
 	bool sec_kerberos;		/* supports plain Kerberos */
 	bool sec_mskerberos;		/* supports legacy MS Kerberos */
+	bool large_buf;			/* is current buffer large? */
 	struct delayed_work	echo; /* echo ping workqueue job */
+	struct kvec *iov;	/* reusable kvec array for receives */
+	unsigned int nr_iov;	/* number of kvecs in array */
+	char	*smallbuf;	/* pointer to current "small" buffer */
+	char	*bigbuf;	/* pointer to current "big" buffer */
+	unsigned int total_read; /* total amount of data read in this pass */
 #ifdef CONFIG_CIFS_FSCACHE
 	struct fscache_cookie   *fscache; /* client index cache cookie */
 #endif
 #ifdef CONFIG_CIFS_STATS2
-	atomic_t inSend; /* requests trying to send */
+	atomic_t in_send; /* requests trying to send */
 	atomic_t num_waiters;   /* blocked waiting to get in sendrecv */
 #endif
 };
@@ -485,9 +496,13 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
  */
 struct cifsLockInfo {
 	struct list_head llist;	/* pointer to next cifsLockInfo */
+	struct list_head blist; /* pointer to locks blocked on this */
+	wait_queue_head_t block_q;
 	__u64 offset;
 	__u64 length;
+	__u32 pid;
 	__u8 type;
+	__u16 netfid;
 };
 
 /*
@@ -520,8 +535,6 @@ struct cifsFileInfo {
 	struct dentry *dentry;
 	unsigned int f_flags;
 	struct tcon_link *tlink;
-	struct mutex lock_mutex;
-	struct list_head llist; /* list of byte range locks we have. */
 	bool invalidHandle:1;	/* file closed via session abend */
 	bool oplock_break_cancelled:1;
 	int count;		/* refcount protected by cifs_file_list_lock */
@@ -554,7 +567,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
  */
 
 struct cifsInodeInfo {
-	struct list_head lockList;
+	struct list_head llist;	/* brlocks for this inode */
+	bool can_cache_brlcks;
+	struct mutex lock_mutex; /* protect two fields above */
 	/* BB add in lists for dirty pages i.e. write caching info for oplock */
 	struct list_head openFileList;
 	__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
@@ -643,8 +658,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
 struct mid_q_entry;
 
 /*
- * This is the prototype for the mid callback function. When creating one,
- * take special care to avoid deadlocks. Things to bear in mind:
+ * This is the prototype for the mid receive function. This function is for
+ * receiving the rest of the SMB frame, starting with the WordCount (which is
+ * just after the MID in struct smb_hdr). Note:
+ *
+ * - This will be called by cifsd, with no locks held.
+ * - The mid will still be on the pending_mid_q.
+ * - mid->resp_buf will point to the current buffer.
+ *
+ * Returns zero on a successful receive, or an error. The receive state in
+ * the TCP_Server_Info will also be updated.
+ */
+typedef int (mid_receive_t)(struct TCP_Server_Info *server,
+			    struct mid_q_entry *mid);
+
+/*
+ * This is the prototype for the mid callback function. This is called once the
+ * mid has been received off of the socket. When creating one, take special
+ * care to avoid deadlocks. Things to bear in mind:
  *
  * - it will be called by cifsd, with no locks held
  * - the mid will be removed from any lists
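
The contract above is concrete enough to sketch a minimal receive hook. This is an illustrative skeleton, not code from the patch; it leans on the cifs_read_from_socket()/dequeue_mid() helpers exported later in this diff:

	/* Toy mid_receive_t: drain the rest of the frame into bigbuf,
	 * then take the mid off the pending queue. */
	static int example_receive(struct TCP_Server_Info *server,
				   struct mid_q_entry *mid)
	{
		struct smb_hdr *hdr = (struct smb_hdr *)server->smallbuf;
		unsigned int frame = be32_to_cpu(hdr->smb_buf_length) + 4;
		int len = cifs_read_from_socket(server, server->bigbuf,
						frame - server->total_read);
		if (len < 0)
			return len;
		server->total_read += len;
		dequeue_mid(mid, false);
		return 0;
	}
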
@@ -662,9 +693,10 @@ struct mid_q_entry {
 	unsigned long when_sent; /* time when smb send finished */
 	unsigned long when_received; /* when demux complete (taken off wire) */
 #endif
+	mid_receive_t *receive; /* call receive callback */
 	mid_callback_t *callback; /* call completion callback */
 	void *callback_data;	  /* general purpose pointer for callback */
-	struct smb_hdr *resp_buf;	/* response buffer */
+	struct smb_hdr *resp_buf;	/* pointer to received SMB header */
 	int midState;	/* wish this were enum but can not pass to wait_event */
 	__u8 command;	/* smb command code */
 	bool largeBuf:1;	/* if valid response, is pointer to large buf */
@@ -672,12 +704,54 @@ struct mid_q_entry {
 	bool multiEnd:1;	/* both received */
 };
 
-struct oplock_q_entry {
-	struct list_head qhead;
-	struct inode *pinode;
-	struct cifs_tcon *tcon;
-	__u16 netfid;
-};
+/* Make code in transport.c a little cleaner by moving
+   update of optional stats into function below */
+#ifdef CONFIG_CIFS_STATS2
+
+static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
+{
+	atomic_inc(&server->in_send);
+}
+
+static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
+{
+	atomic_dec(&server->in_send);
+}
+
+static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
+{
+	atomic_inc(&server->num_waiters);
+}
+
+static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
+{
+	atomic_dec(&server->num_waiters);
+}
+
+static inline void cifs_save_when_sent(struct mid_q_entry *mid)
+{
+	mid->when_sent = jiffies;
+}
+#else
+static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
+{
+}
+static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_save_when_sent(struct mid_q_entry *mid)
+{
+}
+#endif
 
 /* for pending dnotify requests */
 struct dir_notify_req {
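
In the transport send path these helpers compile down to nothing when CONFIG_CIFS_STATS2 is off, keeping the fast path branch-free. A sketch of the intended usage around a send (smb_sendv stands in for whatever send helper the call site uses):

	cifs_in_send_inc(server);
	rc = smb_sendv(server, iov, n_vec);
	cifs_in_send_dec(server);
	cifs_save_when_sent(mid);
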
@@ -922,7 +996,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
 				to be established on existing mount if we
 				have the uid/password or Kerberos credential
 				or equivalent for current user */
-GLOBAL_EXTERN unsigned int oplockEnabled;
+/* enable or disable oplocks */
+GLOBAL_EXTERN bool enable_oplocks;
 GLOBAL_EXTERN unsigned int lookupCacheEnabled;
 GLOBAL_EXTERN unsigned int global_secflags;	/* if on, session setup sent
 				with more secure ntlmssp2 challenge/resp */
@@ -936,10 +1011,16 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
 /* reconnect after this many failed echo attempts */
 GLOBAL_EXTERN unsigned short echo_retries;
 
+#ifdef CONFIG_CIFS_ACL
 GLOBAL_EXTERN struct rb_root uidtree;
 GLOBAL_EXTERN struct rb_root gidtree;
 GLOBAL_EXTERN spinlock_t siduidlock;
 GLOBAL_EXTERN spinlock_t sidgidlock;
+GLOBAL_EXTERN struct rb_root siduidtree;
+GLOBAL_EXTERN struct rb_root sidgidtree;
+GLOBAL_EXTERN spinlock_t uidsidlock;
+GLOBAL_EXTERN spinlock_t gidsidlock;
+#endif /* CONFIG_CIFS_ACL */
 
 void cifs_oplock_break(struct work_struct *work);
 
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de3aa285de03..3fb03e2c8e86 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp {
 	__le16 DataLengthHigh;
 	__u64 Reserved2;
 	__u16 ByteCount;
-	__u8 Pad;		/* BB check for whether padded to DWORD
-				   boundary and optimum performance here */
-	char Data[1];
+	/* read response data immediately follows */
 } __attribute__((packed)) READ_RSP;
 
 typedef struct locking_andx_range {
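
Without the Pad/Data members, readers must locate the payload through DataOffset arithmetic rather than a struct field. A sketch mirroring what the new cifs_readv_receive() later in this diff does (buf is a placeholder for the receive buffer):

	READ_RSP *rsp = (READ_RSP *)buf;
	/* +4 for the RFC1002 length field preceding the SMB header */
	unsigned int data_offset = le16_to_cpu(rsp->DataOffset) + 4;
	unsigned int data_len = (le16_to_cpu(rsp->DataLengthHigh) << 16) +
				le16_to_cpu(rsp->DataLength);
	char *data = buf + data_offset;
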
@@ -1913,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */
 
 /* SETFSInfo Levels */
 #define SMB_SET_CIFS_UNIX_INFO    0x200
+/* level 0x203 is defined above in list of QFS info levels */
+/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */
+
+/* Level 0x200 request structure follows */
 typedef struct smb_com_transaction2_setfsi_req {
 	struct smb_hdr hdr;	/* wct = 15 */
 	__le16 TotalParameterCount;
@@ -1940,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req {
 	__le64 ClientUnixCap;   /* Data end */
 } __attribute__((packed)) TRANSACTION2_SETFSI_REQ;
 
+/* level 0x203 request structure follows */
+typedef struct smb_com_transaction2_setfs_enc_req {
+	struct smb_hdr hdr;	/* wct = 15 */
+	__le16 TotalParameterCount;
+	__le16 TotalDataCount;
+	__le16 MaxParameterCount;
+	__le16 MaxDataCount;
+	__u8 MaxSetupCount;
+	__u8 Reserved;
+	__le16 Flags;
+	__le32 Timeout;
+	__u16 Reserved2;
+	__le16 ParameterCount;	/* 4 */
+	__le16 ParameterOffset;
+	__le16 DataCount;	/* 12 */
+	__le16 DataOffset;
+	__u8 SetupCount;	/* one */
+	__u8 Reserved3;
+	__le16 SubCommand;	/* TRANS2_SET_FS_INFORMATION */
+	__le16 ByteCount;
+	__u8 Pad;
+	__u16 Reserved4;	/* Parameters start. */
+	__le16 InformationLevel;/* Parameters end. */
+	/* NTLMSSP Blob, Data start. */
+} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ;
+
+/* response for setfsinfo levels 0x200 and 0x203 */
 typedef struct smb_com_transaction2_setfsi_rsp {
 	struct smb_hdr hdr;	/* wct = 10 */
 	struct trans2_resp t2;
 	__u16 ByteCount;
 } __attribute__((packed)) TRANSACTION2_SETFSI_RSP;
 
-
 typedef struct smb_com_transaction2_get_dfs_refer_req {
 	struct smb_hdr hdr;	/* wct = 15 */
 	__le16 TotalParameterCount;
@@ -2098,13 +2126,13 @@ typedef struct {
 #define CIFS_UNIX_PROXY_CAP             0x00000400 /* Proxy cap: 0xACE ioctl and
 						      QFS PROXY call */
 #ifdef CONFIG_CIFS_POSIX
-/* Can not set pathnames cap yet until we send new posix create SMB since
-   otherwise server can treat such handles opened with older ntcreatex
-   (by a new client which knows how to send posix path ops)
-   as non-posix handles (can affect write behavior with byte range locks.
-   We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */
+/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
+   LockingX instead of posix locking call on unix sess (and we do not expect
+   LockingX to use different (ie Windows) semantics than posix locking on
+   the same session (if WINE needs to do this later, we can add this cap
+   back in later */
 /* #define CIFS_UNIX_CAP_MASK  0x000000fb */
-#define CIFS_UNIX_CAP_MASK              0x000000db
+#define CIFS_UNIX_CAP_MASK              0x000003db
 #else
 #define CIFS_UNIX_CAP_MASK              0x00000013
 #endif /* CONFIG_CIFS_POSIX */
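
A hedged reminder of how this mask is consumed at negotiate time (illustrative only; rsp_capability_word and use_posix_locking are placeholder names, and the real check lives in the unix-extensions setup code):

	/* Accept only the capability bits we have vetted. */
	__u64 caps = le64_to_cpu(rsp_capability_word) & CIFS_UNIX_CAP_MASK;
	if (caps & CIFS_UNIX_FCNTL_LOCKS_CAP)
		/* posix byte-range locking is safe on this session */
		use_posix_locking = true;
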
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8df28e925e5b..ef4f631e4c01 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
 			struct TCP_Server_Info *server);
 extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
 extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
-			   unsigned int nvec, mid_callback_t *callback,
-			   void *cbdata, bool ignore_pend);
+			   unsigned int nvec, mid_receive_t *receive,
+			   mid_callback_t *callback, void *cbdata,
+			   bool ignore_pend);
 extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
 			struct smb_hdr * /* input */ ,
 			struct smb_hdr * /* out */ ,
@@ -90,6 +91,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
 extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern bool is_valid_oplock_break(struct smb_hdr *smb,
 				  struct TCP_Server_Info *);
+extern bool backup_cred(struct cifs_sb_info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
 extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 			    unsigned int bytes_written);
@@ -145,12 +147,19 @@ extern int cifs_get_inode_info_unix(struct inode **pinode,
 extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
 			     struct cifs_fattr *fattr, struct inode *inode,
 			     const char *path, const __u16 *pfid);
-extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
+extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
+					uid_t, gid_t);
 extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
 					const char *, u32 *);
 extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
-				const char *);
+				const char *, int);
 
+extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
+extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+		     unsigned int to_read);
+extern int cifs_readv_from_socket(struct TCP_Server_Info *server,
+		struct kvec *iov_orig, unsigned int nr_segs,
+		unsigned int to_read);
 extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 			       struct cifs_sb_info *cifs_sb);
 extern int cifs_match_super(struct super_block *, void *);
@@ -359,14 +368,17 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
 
+extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid,
+		      const __u8 lock_type, const __u32 num_unlock,
+		      const __u32 num_lock, LOCKING_ANDX_RANGE *buf);
 extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
-			const __u16 netfid, const __u64 len,
+			const __u16 netfid, const __u32 netpid, const __u64 len,
 			const __u64 offset, const __u32 numUnlock,
 			const __u32 numLock, const __u8 lockType,
 			const bool waitFlag, const __u8 oplock_level);
 extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
-			const __u16 smb_file_id, const int get_flag,
-			const __u64 len, struct file_lock *,
+			const __u16 smb_file_id, const __u32 netpid,
+			const int get_flag, const __u64 len, struct file_lock *,
 			const __u16 lock_type, const bool waitFlag);
 extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon);
 extern int CIFSSMBEcho(struct TCP_Server_Info *server);
@@ -380,7 +392,7 @@ extern void tconInfoFree(struct cifs_tcon *);
 extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 			  __u32 *);
-extern int cifs_verify_signature(struct smb_hdr *,
+extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
 				 struct TCP_Server_Info *server,
 				__u32 expected_sequence_number);
 extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
@@ -419,7 +431,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon,
 extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon,
 			__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
 extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16,
-			struct cifs_ntsd *, __u32);
+			struct cifs_ntsd *, __u32, int);
 extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
 		const unsigned char *searchName,
 		char *acl_inf, const int buflen, const int acl_type,
@@ -440,6 +452,24 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
 extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 			unsigned char *p24);
 
+/* asynchronous read support */
+struct cifs_readdata {
+	struct cifsFileInfo		*cfile;
+	struct address_space		*mapping;
+	__u64				offset;
+	unsigned int			bytes;
+	pid_t				pid;
+	int				result;
+	struct list_head		pages;
+	struct work_struct		work;
+	unsigned int			nr_iov;
+	struct kvec			iov[1];
+};
+
+struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages);
+void cifs_readdata_free(struct cifs_readdata *rdata);
+int cifs_async_readv(struct cifs_readdata *rdata);
+
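A hypothetical readpages-style caller of this API, assuming the pages are already locked and chained on a local list (open_file, mapping, offset, and page_list are placeholders):

	struct cifs_readdata *rdata;

	rdata = cifs_readdata_alloc(nr_pages);	/* one kvec slot per page */
	if (rdata == NULL)
		return -ENOMEM;
	rdata->cfile = open_file;	/* real code must hold a reference */
	rdata->mapping = mapping;
	rdata->offset = offset;
	rdata->bytes = nr_pages * PAGE_CACHE_SIZE;
	rdata->pid = current->tgid;
	list_splice_init(&page_list, &rdata->pages);
	rc = cifs_async_readv(rdata);	/* completion via cifs_readv_complete */
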
 /* asynchronous write support */
 struct cifs_writedata {
 	struct kref			refcount;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index aac37d99a487..6600aa2d2ef3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -33,6 +33,8 @@
 #include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/task_io_accounting_ops.h>
 #include <asm/uaccess.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -40,6 +42,7 @@
 #include "cifsproto.h"
 #include "cifs_unicode.h"
 #include "cifs_debug.h"
+#include "fscache.h"
 
 #ifdef CONFIG_CIFS_POSIX
 static struct {
@@ -83,6 +86,9 @@ static struct {
 #endif /* CONFIG_CIFS_WEAK_PW_HASH */
 #endif /* CIFS_POSIX */
 
+/* Forward declarations */
+static void cifs_readv_complete(struct work_struct *work);
+
 /* Mark as invalid, all open files on tree connections since they
    were closed when session to server was lost */
 static void mark_open_files_invalid(struct cifs_tcon *pTcon)
@@ -453,8 +459,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
 	}
 	server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode);
 	server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
-	server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
-			(__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
 	server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
 	/* even though we do not use raw we might as well set this
 	accurately, in case we ever find a need for it */
@@ -561,8 +566,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
 	little endian */
 	server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
 	/* probably no need to store and check maxvcs */
-	server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
-			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+	server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
 	cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
@@ -739,7 +743,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 	iov.iov_base = smb;
 	iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
 
-	rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true);
+	rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback,
+			     server, true);
 	if (rc)
 		cFYI(1, "Echo request failed: %d", rc);
 
@@ -1376,6 +1381,359 @@ openRetry:
1376 return rc; 1381 return rc;
1377} 1382}
1378 1383
1384struct cifs_readdata *
1385cifs_readdata_alloc(unsigned int nr_pages)
1386{
1387 struct cifs_readdata *rdata;
1388
1389 /* readdata + 1 kvec for each page */
1390 rdata = kzalloc(sizeof(*rdata) +
1391 sizeof(struct kvec) * nr_pages, GFP_KERNEL);
1392 if (rdata != NULL) {
1393 INIT_WORK(&rdata->work, cifs_readv_complete);
1394 INIT_LIST_HEAD(&rdata->pages);
1395 }
1396 return rdata;
1397}
1398
1399void
1400cifs_readdata_free(struct cifs_readdata *rdata)
1401{
1402 cifsFileInfo_put(rdata->cfile);
1403 kfree(rdata);
1404}
1405
1406/*
1407 * Discard any remaining data in the current SMB. To do this, we borrow the
1408 * current bigbuf.
1409 */
1410static int
1411cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1412{
1413 READ_RSP *rsp = (READ_RSP *)server->smallbuf;
1414 unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length);
1415 int remaining = rfclen + 4 - server->total_read;
1416 struct cifs_readdata *rdata = mid->callback_data;
1417
1418 while (remaining > 0) {
1419 int length;
1420
1421 length = cifs_read_from_socket(server, server->bigbuf,
1422 min_t(unsigned int, remaining,
1423 CIFSMaxBufSize + MAX_CIFS_HDR_SIZE));
1424 if (length < 0)
1425 return length;
1426 server->total_read += length;
1427 remaining -= length;
1428 }
1429
1430 dequeue_mid(mid, rdata->result);
1431 return 0;
1432}
1433
1434static int
1435cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1436{
1437 int length, len;
1438 unsigned int data_offset, remaining, data_len;
1439 struct cifs_readdata *rdata = mid->callback_data;
1440 READ_RSP *rsp = (READ_RSP *)server->smallbuf;
1441 unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4;
1442 u64 eof;
1443 pgoff_t eof_index;
1444 struct page *page, *tpage;
1445
1446 cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__,
1447 mid->mid, rdata->offset, rdata->bytes);
1448
1449 /*
1450 * read the rest of READ_RSP header (sans Data array), or whatever we
1451 * can if there's not enough data. At this point, we've read down to
1452 * the Mid.
1453 */
1454 len = min_t(unsigned int, rfclen, sizeof(*rsp)) -
1455 sizeof(struct smb_hdr) + 1;
1456
1457 rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1;
1458 rdata->iov[0].iov_len = len;
1459
1460 length = cifs_readv_from_socket(server, rdata->iov, 1, len);
1461 if (length < 0)
1462 return length;
1463 server->total_read += length;
1464
1465 /* Was the SMB read successful? */
1466 rdata->result = map_smb_to_linux_error(&rsp->hdr, false);
1467 if (rdata->result != 0) {
1468 cFYI(1, "%s: server returned error %d", __func__,
1469 rdata->result);
1470 return cifs_readv_discard(server, mid);
1471 }
1472
1473 /* Is there enough to get to the rest of the READ_RSP header? */
1474 if (server->total_read < sizeof(READ_RSP)) {
1475 cFYI(1, "%s: server returned short header. got=%u expected=%zu",
1476 __func__, server->total_read, sizeof(READ_RSP));
1477 rdata->result = -EIO;
1478 return cifs_readv_discard(server, mid);
1479 }
1480
1481 data_offset = le16_to_cpu(rsp->DataOffset) + 4;
1482 if (data_offset < server->total_read) {
1483 /*
1484 * win2k8 sometimes sends an offset of 0 when the read
1485 * is beyond the EOF. Treat it as if the data starts just after
1486 * the header.
1487 */
1488 cFYI(1, "%s: data offset (%u) inside read response header",
1489 __func__, data_offset);
1490 data_offset = server->total_read;
1491 } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
1492 /* data_offset is beyond the end of smallbuf */
1493 cFYI(1, "%s: data offset (%u) beyond end of smallbuf",
1494 __func__, data_offset);
1495 rdata->result = -EIO;
1496 return cifs_readv_discard(server, mid);
1497 }
1498
1499 cFYI(1, "%s: total_read=%u data_offset=%u", __func__,
1500 server->total_read, data_offset);
1501
1502 len = data_offset - server->total_read;
1503 if (len > 0) {
1504 /* read any junk before data into the rest of smallbuf */
1505 rdata->iov[0].iov_base = server->smallbuf + server->total_read;
1506 rdata->iov[0].iov_len = len;
1507 length = cifs_readv_from_socket(server, rdata->iov, 1, len);
1508 if (length < 0)
1509 return length;
1510 server->total_read += length;
1511 }
1512
1513 /* set up first iov for signature check */
1514 rdata->iov[0].iov_base = server->smallbuf;
1515 rdata->iov[0].iov_len = server->total_read;
1516 cFYI(1, "0: iov_base=%p iov_len=%zu",
1517 rdata->iov[0].iov_base, rdata->iov[0].iov_len);
1518
1519 /* how much data is in the response? */
1520 data_len = le16_to_cpu(rsp->DataLengthHigh) << 16;
1521 data_len += le16_to_cpu(rsp->DataLength);
1522 if (data_offset + data_len > rfclen) {
1523 /* data_len is corrupt -- discard frame */
1524 rdata->result = -EIO;
1525 return cifs_readv_discard(server, mid);
1526 }
1527
1528 /* marshal up the page array */
1529 len = 0;
1530 remaining = data_len;
1531 rdata->nr_iov = 1;
1532
1533 /* determine the eof that the server (probably) has */
1534 eof = CIFS_I(rdata->mapping->host)->server_eof;
1535 eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
1536 cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
1537
1538 list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
1539 if (remaining >= PAGE_CACHE_SIZE) {
1540 /* enough data to fill the page */
1541 rdata->iov[rdata->nr_iov].iov_base = kmap(page);
1542 rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE;
1543 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
1544 rdata->nr_iov, page->index,
1545 rdata->iov[rdata->nr_iov].iov_base,
1546 rdata->iov[rdata->nr_iov].iov_len);
1547 ++rdata->nr_iov;
1548 len += PAGE_CACHE_SIZE;
1549 remaining -= PAGE_CACHE_SIZE;
1550 } else if (remaining > 0) {
1551 /* enough for partial page, fill and zero the rest */
1552 rdata->iov[rdata->nr_iov].iov_base = kmap(page);
1553 rdata->iov[rdata->nr_iov].iov_len = remaining;
1554 cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
1555 rdata->nr_iov, page->index,
1556 rdata->iov[rdata->nr_iov].iov_base,
1557 rdata->iov[rdata->nr_iov].iov_len);
1558 memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
1559 '\0', PAGE_CACHE_SIZE - remaining);
1560 ++rdata->nr_iov;
1561 len += remaining;
1562 remaining = 0;
1563 } else if (page->index > eof_index) {
1564 /*
1565 * The VFS will not try to do readahead past the
1566 * i_size, but it's possible that we have outstanding
1567 * writes with gaps in the middle and the i_size hasn't
1568 * caught up yet. Populate those with zeroed out pages
1569 * to prevent the VFS from repeatedly attempting to
1570 * fill them until the writes are flushed.
1571 */
1572 zero_user(page, 0, PAGE_CACHE_SIZE);
1573 list_del(&page->lru);
1574 lru_cache_add_file(page);
1575 flush_dcache_page(page);
1576 SetPageUptodate(page);
1577 unlock_page(page);
1578 page_cache_release(page);
1579 } else {
1580 /* no need to hold page hostage */
1581 list_del(&page->lru);
1582 lru_cache_add_file(page);
1583 unlock_page(page);
1584 page_cache_release(page);
1585 }
1586 }
1587
1588 /* issue the read if we have any iovecs left to fill */
1589 if (rdata->nr_iov > 1) {
1590 length = cifs_readv_from_socket(server, &rdata->iov[1],
1591 rdata->nr_iov - 1, len);
1592 if (length < 0)
1593 return length;
1594 server->total_read += length;
1595 } else {
1596 length = 0;
1597 }
1598
1599 rdata->bytes = length;
1600
1601 cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read,
1602 rfclen, remaining);
1603
1604 /* discard anything left over */
1605 if (server->total_read < rfclen)
1606 return cifs_readv_discard(server, mid);
1607
1608 dequeue_mid(mid, false);
1609 return length;
1610}
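
A rough standalone sketch of the page-marshaling loop above, using userspace stand-ins for kernel pages, kmap() and PAGE_CACHE_SIZE (all names here are illustrative, not the driver's own):

    #include <string.h>
    #include <sys/uio.h>

    #define PAGE_SZ 4096

    /*
     * Split "remaining" payload bytes across page-sized iovecs starting
     * at iov[1] (iov[0] holds the header). For a partial final page only
     * "len" bytes are received and the unused tail is zeroed, mirroring
     * the memset in the receive path. Returns the number of iovecs used.
     */
    static unsigned int marshal_pages(struct iovec *iov, char (*pages)[PAGE_SZ],
                                      unsigned int nr_pages, size_t remaining)
    {
            unsigned int n = 1;

            for (unsigned int i = 0; i < nr_pages && remaining; i++) {
                    size_t len = remaining < PAGE_SZ ? remaining : PAGE_SZ;

                    iov[n].iov_base = pages[i];
                    iov[n].iov_len = len;      /* receive only len bytes */
                    if (len < PAGE_SZ)         /* partial page: zero the tail */
                            memset(pages[i] + len, 0, PAGE_SZ - len);
                    n++;
                    remaining -= len;
            }
            return n;
    }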
1611
1612static void
1613cifs_readv_complete(struct work_struct *work)
1614{
1615 struct cifs_readdata *rdata = container_of(work,
1616 struct cifs_readdata, work);
1617 struct page *page, *tpage;
1618
1619 list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
1620 list_del(&page->lru);
1621 lru_cache_add_file(page);
1622
1623 if (rdata->result == 0) {
1624 kunmap(page);
1625 flush_dcache_page(page);
1626 SetPageUptodate(page);
1627 }
1628
1629 unlock_page(page);
1630
1631 if (rdata->result == 0)
1632 cifs_readpage_to_fscache(rdata->mapping->host, page);
1633
1634 page_cache_release(page);
1635 }
1636 cifs_readdata_free(rdata);
1637}
1638
1639static void
1640cifs_readv_callback(struct mid_q_entry *mid)
1641{
1642 struct cifs_readdata *rdata = mid->callback_data;
1643 struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
1644 struct TCP_Server_Info *server = tcon->ses->server;
1645
1646 cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__,
1647 mid->mid, mid->midState, rdata->result, rdata->bytes);
1648
1649 switch (mid->midState) {
1650 case MID_RESPONSE_RECEIVED:
1651 /* result already set, check signature */
1652 if (server->sec_mode &
1653 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1654 if (cifs_verify_signature(rdata->iov, rdata->nr_iov,
1655 server, mid->sequence_number + 1))
1656 cERROR(1, "Unexpected SMB signature");
1657 }
1658 /* FIXME: should this be counted toward the initiating task? */
1659 task_io_account_read(rdata->bytes);
1660 cifs_stats_bytes_read(tcon, rdata->bytes);
1661 break;
1662 case MID_REQUEST_SUBMITTED:
1663 case MID_RETRY_NEEDED:
1664 rdata->result = -EAGAIN;
1665 break;
1666 default:
1667 rdata->result = -EIO;
1668 }
1669
1670 queue_work(system_nrt_wq, &rdata->work);
1671 DeleteMidQEntry(mid);
1672 atomic_dec(&server->inFlight);
1673 wake_up(&server->request_q);
1674}
1675
/* cifs_async_readv - send an async read, and set up mid to handle result */
1677int
1678cifs_async_readv(struct cifs_readdata *rdata)
1679{
1680 int rc;
1681 READ_REQ *smb = NULL;
1682 int wct;
1683 struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
1684
1685 cFYI(1, "%s: offset=%llu bytes=%u", __func__,
1686 rdata->offset, rdata->bytes);
1687
1688 if (tcon->ses->capabilities & CAP_LARGE_FILES)
1689 wct = 12;
1690 else {
1691 wct = 10; /* old style read */
1692 if ((rdata->offset >> 32) > 0) {
1693 /* can not handle this big offset for old */
1694 return -EIO;
1695 }
1696 }
1697
1698 rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb);
1699 if (rc)
1700 return rc;
1701
1702 smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid);
1703 smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
1704
1705 smb->AndXCommand = 0xFF; /* none */
1706 smb->Fid = rdata->cfile->netfid;
1707 smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
1708 if (wct == 12)
1709 smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
1710 smb->Remaining = 0;
1711 smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF);
1712 smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16);
1713 if (wct == 12)
1714 smb->ByteCount = 0;
1715 else {
1716 /* old style read */
1717 struct smb_com_readx_req *smbr =
1718 (struct smb_com_readx_req *)smb;
1719 smbr->ByteCount = 0;
1720 }
1721
1722 /* 4 for RFC1001 length + 1 for BCC */
1723 rdata->iov[0].iov_base = smb;
1724 rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
1725
1726 rc = cifs_call_async(tcon->ses->server, rdata->iov, 1,
1727 cifs_readv_receive, cifs_readv_callback,
1728 rdata, false);
1729
1730 if (rc == 0)
1731 cifs_stats_inc(&tcon->num_reads);
1732
1733 cifs_small_buf_release(smb);
1734 return rc;
1735}
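
The 64-bit offset handling above condenses to a small standalone sketch (function and parameter names are made up for illustration):

    #include <stdint.h>

    /*
     * wct 12 (CAP_LARGE_FILES) requests carry both offset halves; an
     * old-style wct 10 request has no OffsetHigh, so any offset needing
     * the upper 32 bits must be rejected, mirroring the -EIO above.
     */
    static int split_read_offset(uint64_t off, int large_files,
                                 uint32_t *low, uint32_t *high)
    {
            if (!large_files && (off >> 32))
                    return -1;
            *low = (uint32_t)(off & 0xFFFFFFFF);
            *high = (uint32_t)(off >> 32);
            return 0;
    }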
1736
1379int 1737int
1380CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, 1738CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes,
1381 char **buf, int *pbuf_type) 1739 char **buf, int *pbuf_type)
@@ -1836,7 +2194,7 @@ cifs_async_writev(struct cifs_writedata *wdata)
1836 2194
1837 kref_get(&wdata->refcount); 2195 kref_get(&wdata->refcount);
1838 rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, 2196 rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1,
1839 cifs_writev_callback, wdata, false); 2197 NULL, cifs_writev_callback, wdata, false);
1840 2198
1841 if (rc == 0) 2199 if (rc == 0)
1842 cifs_stats_inc(&tcon->num_writes); 2200 cifs_stats_inc(&tcon->num_writes);
@@ -1962,10 +2320,50 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms,
1962 return rc; 2320 return rc;
1963} 2321}
1964 2322
2323int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid,
2324 const __u8 lock_type, const __u32 num_unlock,
2325 const __u32 num_lock, LOCKING_ANDX_RANGE *buf)
2326{
2327 int rc = 0;
2328 LOCK_REQ *pSMB = NULL;
2329 struct kvec iov[2];
2330 int resp_buf_type;
2331 __u16 count;
2332
2333 cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock);
2334
2335 rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
2336 if (rc)
2337 return rc;
2338
2339 pSMB->Timeout = 0;
2340 pSMB->NumberOfLocks = cpu_to_le16(num_lock);
2341 pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock);
2342 pSMB->LockType = lock_type;
2343 pSMB->AndXCommand = 0xFF; /* none */
2344 pSMB->Fid = netfid; /* netfid stays le */
2345
2346 count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
2347 inc_rfc1001_len(pSMB, count);
2348 pSMB->ByteCount = cpu_to_le16(count);
2349
2350 iov[0].iov_base = (char *)pSMB;
2351 iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 -
2352 (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
2353 iov[1].iov_base = (char *)buf;
2354 iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
2355
2356 cifs_stats_inc(&tcon->num_locks);
2357 rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
2358 if (rc)
2359 cFYI(1, "Send error in cifs_lockv = %d", rc);
2360
2361 return rc;
2362}
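
The iovec arithmetic in cifs_lockv is easy to misread: iov[0] must stop short of the range array so that iov[1] can point at the caller's buffer, even though inc_rfc1001_len() already counted those bytes into the frame length. A minimal sketch of the split, with RANGE_SZ as a stand-in for sizeof(LOCKING_ANDX_RANGE):

    #include <stddef.h>
    #include <stdint.h>
    #include <sys/uio.h>

    #define RANGE_SZ 10 /* stand-in for sizeof(LOCKING_ANDX_RANGE) */

    /*
     * frame_len mirrors be32_to_cpu(hdr.smb_buf_length) + 4: the full
     * frame including the range bytes announced via inc_rfc1001_len(),
     * which live in the caller's buffer rather than the request.
     */
    static void lockv_build_iovs(struct iovec iov[2], void *req,
                                 size_t frame_len, void *ranges,
                                 uint32_t nranges)
    {
            size_t count = (size_t)nranges * RANGE_SZ;

            iov[0].iov_base = req;
            iov[0].iov_len = frame_len - count; /* fixed part of the SMB */
            iov[1].iov_base = ranges;
            iov[1].iov_len = count;             /* lock/unlock range array */
    }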
1965 2363
1966int 2364int
1967CIFSSMBLock(const int xid, struct cifs_tcon *tcon, 2365CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
1968 const __u16 smb_file_id, const __u64 len, 2366 const __u16 smb_file_id, const __u32 netpid, const __u64 len,
1969 const __u64 offset, const __u32 numUnlock, 2367 const __u64 offset, const __u32 numUnlock,
1970 const __u32 numLock, const __u8 lockType, 2368 const __u32 numLock, const __u8 lockType,
1971 const bool waitFlag, const __u8 oplock_level) 2369 const bool waitFlag, const __u8 oplock_level)
@@ -2001,7 +2399,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
2001 pSMB->Fid = smb_file_id; /* netfid stays le */ 2399 pSMB->Fid = smb_file_id; /* netfid stays le */
2002 2400
2003 if ((numLock != 0) || (numUnlock != 0)) { 2401 if ((numLock != 0) || (numUnlock != 0)) {
2004 pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); 2402 pSMB->Locks[0].Pid = cpu_to_le16(netpid);
2005 /* BB where to store pid high? */ 2403 /* BB where to store pid high? */
2006 pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); 2404 pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len);
2007 pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); 2405 pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32));
@@ -2035,9 +2433,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
2035 2433
2036int 2434int
2037CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, 2435CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
2038 const __u16 smb_file_id, const int get_flag, const __u64 len, 2436 const __u16 smb_file_id, const __u32 netpid, const int get_flag,
2039 struct file_lock *pLockData, const __u16 lock_type, 2437 const __u64 len, struct file_lock *pLockData,
2040 const bool waitFlag) 2438 const __u16 lock_type, const bool waitFlag)
2041{ 2439{
2042 struct smb_com_transaction2_sfi_req *pSMB = NULL; 2440 struct smb_com_transaction2_sfi_req *pSMB = NULL;
2043 struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; 2441 struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
@@ -2095,7 +2493,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
2095 } else 2493 } else
2096 pSMB->Timeout = 0; 2494 pSMB->Timeout = 0;
2097 2495
2098 parm_data->pid = cpu_to_le32(current->tgid); 2496 parm_data->pid = cpu_to_le32(netpid);
2099 parm_data->start = cpu_to_le64(pLockData->fl_start); 2497 parm_data->start = cpu_to_le64(pLockData->fl_start);
2100 parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ 2498 parm_data->length = cpu_to_le64(len); /* normalize negative numbers */
2101 2499
@@ -2812,8 +3210,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon,
2812 pSMB->TotalDataCount = 0; 3210 pSMB->TotalDataCount = 0;
2813 pSMB->MaxParameterCount = cpu_to_le32(2); 3211 pSMB->MaxParameterCount = cpu_to_le32(2);
2814 /* BB find exact data count max from sess structure BB */ 3212 /* BB find exact data count max from sess structure BB */
2815 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - 3213 pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
2816 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
2817 pSMB->MaxSetupCount = 4; 3214 pSMB->MaxSetupCount = 4;
2818 pSMB->Reserved = 0; 3215 pSMB->Reserved = 0;
2819 pSMB->ParameterOffset = 0; 3216 pSMB->ParameterOffset = 0;
@@ -3306,8 +3703,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count,
3306 pSMB->Reserved = 0; 3703 pSMB->Reserved = 0;
3307 pSMB->TotalParameterCount = cpu_to_le32(parm_len); 3704 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
3308 pSMB->TotalDataCount = 0; 3705 pSMB->TotalDataCount = 0;
3309 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - 3706 pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
3310 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
3311 pSMB->ParameterCount = pSMB->TotalParameterCount; 3707 pSMB->ParameterCount = pSMB->TotalParameterCount;
3312 pSMB->DataCount = pSMB->TotalDataCount; 3708 pSMB->DataCount = pSMB->TotalDataCount;
3313 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + 3709 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
@@ -3467,7 +3863,7 @@ qsec_out:
3467 3863
3468int 3864int
3469CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, 3865CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid,
3470 struct cifs_ntsd *pntsd, __u32 acllen) 3866 struct cifs_ntsd *pntsd, __u32 acllen, int aclflag)
3471{ 3867{
3472 __u16 byte_count, param_count, data_count, param_offset, data_offset; 3868 __u16 byte_count, param_count, data_count, param_offset, data_offset;
3473 int rc = 0; 3869 int rc = 0;
@@ -3504,7 +3900,7 @@ setCifsAclRetry:
3504 3900
3505 pSMB->Fid = fid; /* file handle always le */ 3901 pSMB->Fid = fid; /* file handle always le */
3506 pSMB->Reserved2 = 0; 3902 pSMB->Reserved2 = 0;
3507 pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); 3903 pSMB->AclFlags = cpu_to_le32(aclflag);
3508 3904
3509 if (pntsd && acllen) { 3905 if (pntsd && acllen) {
3510 memcpy((char *) &pSMBr->hdr.Protocol + data_offset, 3906 memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
@@ -3977,8 +4373,7 @@ findFirstRetry:
3977 params = 12 + name_len /* includes null */ ; 4373 params = 12 + name_len /* includes null */ ;
3978 pSMB->TotalDataCount = 0; /* no EAs */ 4374 pSMB->TotalDataCount = 0; /* no EAs */
3979 pSMB->MaxParameterCount = cpu_to_le16(10); 4375 pSMB->MaxParameterCount = cpu_to_le16(10);
3980 pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - 4376 pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
3981 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
3982 pSMB->MaxSetupCount = 0; 4377 pSMB->MaxSetupCount = 0;
3983 pSMB->Reserved = 0; 4378 pSMB->Reserved = 0;
3984 pSMB->Flags = 0; 4379 pSMB->Flags = 0;
@@ -4052,8 +4447,7 @@ findFirstRetry:
4052 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + 4447 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
4053 psrch_inf->entries_in_buffer; 4448 psrch_inf->entries_in_buffer;
4054 lnoff = le16_to_cpu(parms->LastNameOffset); 4449 lnoff = le16_to_cpu(parms->LastNameOffset);
4055 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 4450 if (CIFSMaxBufSize < lnoff) {
4056 lnoff) {
4057 cERROR(1, "ignoring corrupt resume name"); 4451 cERROR(1, "ignoring corrupt resume name");
4058 psrch_inf->last_entry = NULL; 4452 psrch_inf->last_entry = NULL;
4059 return rc; 4453 return rc;
@@ -4079,7 +4473,8 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
4079 T2_FNEXT_RSP_PARMS *parms; 4473 T2_FNEXT_RSP_PARMS *parms;
4080 char *response_data; 4474 char *response_data;
4081 int rc = 0; 4475 int rc = 0;
4082 int bytes_returned, name_len; 4476 int bytes_returned;
4477 unsigned int name_len;
4083 __u16 params, byte_count; 4478 __u16 params, byte_count;
4084 4479
4085 cFYI(1, "In FindNext"); 4480 cFYI(1, "In FindNext");
@@ -4096,9 +4491,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
4096 byte_count = 0; 4491 byte_count = 0;
4097 pSMB->TotalDataCount = 0; /* no EAs */ 4492 pSMB->TotalDataCount = 0; /* no EAs */
4098 pSMB->MaxParameterCount = cpu_to_le16(8); 4493 pSMB->MaxParameterCount = cpu_to_le16(8);
4099 pSMB->MaxDataCount = 4494 pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
4100 cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) &
4101 0xFFFFFF00);
4102 pSMB->MaxSetupCount = 0; 4495 pSMB->MaxSetupCount = 0;
4103 pSMB->Reserved = 0; 4496 pSMB->Reserved = 0;
4104 pSMB->Flags = 0; 4497 pSMB->Flags = 0;
@@ -4180,8 +4573,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
4180 psrch_inf->index_of_last_entry += 4573 psrch_inf->index_of_last_entry +=
4181 psrch_inf->entries_in_buffer; 4574 psrch_inf->entries_in_buffer;
4182 lnoff = le16_to_cpu(parms->LastNameOffset); 4575 lnoff = le16_to_cpu(parms->LastNameOffset);
4183 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 4576 if (CIFSMaxBufSize < lnoff) {
4184 lnoff) {
4185 cERROR(1, "ignoring corrupt resume name"); 4577 cERROR(1, "ignoring corrupt resume name");
4186 psrch_inf->last_entry = NULL; 4578 psrch_inf->last_entry = NULL;
4187 return rc; 4579 return rc;
@@ -5839,7 +6231,7 @@ QAllEAsRetry:
5839 6231
5840 if (ea_name) { 6232 if (ea_name) {
5841 if (ea_name_len == name_len && 6233 if (ea_name_len == name_len &&
5842 strncmp(ea_name, temp_ptr, name_len) == 0) { 6234 memcmp(ea_name, temp_ptr, name_len) == 0) {
5843 temp_ptr += name_len + 1; 6235 temp_ptr += name_len + 1;
5844 rc = value_len; 6236 rc = value_len;
5845 if (buf_size == 0) 6237 if (buf_size == 0)
@@ -6034,12 +6426,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon,
6034 pSMB->TotalParameterCount = 0 ; 6426 pSMB->TotalParameterCount = 0 ;
6035 pSMB->TotalDataCount = 0; 6427 pSMB->TotalDataCount = 0;
6036 pSMB->MaxParameterCount = cpu_to_le32(2); 6428 pSMB->MaxParameterCount = cpu_to_le32(2);
6037 /* BB find exact data count max from sess structure BB */ 6429 pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
6038 pSMB->MaxDataCount = 0; /* same in little endian or be */
6039/* BB VERIFY verify which is correct for above BB */
6040 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
6041 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
6042
6043 pSMB->MaxSetupCount = 4; 6430 pSMB->MaxSetupCount = 4;
6044 pSMB->Reserved = 0; 6431 pSMB->Reserved = 0;
6045 pSMB->ParameterOffset = 0; 6432 pSMB->ParameterOffset = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 80c2e3add3a2..d545a95c30ed 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -181,7 +181,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
181 -EINVAL = invalid transact2 181 -EINVAL = invalid transact2
182 182
183 */ 183 */
184static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) 184static int check2ndT2(struct smb_hdr *pSMB)
185{ 185{
186 struct smb_t2_rsp *pSMBt; 186 struct smb_t2_rsp *pSMBt;
187 int remaining; 187 int remaining;
@@ -214,9 +214,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
214 214
215 cFYI(1, "missing %d bytes from transact2, check next response", 215 cFYI(1, "missing %d bytes from transact2, check next response",
216 remaining); 216 remaining);
217 if (total_data_size > maxBufSize) { 217 if (total_data_size > CIFSMaxBufSize) {
218 cERROR(1, "TotalDataSize %d is over maximum buffer %d", 218 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
219 total_data_size, maxBufSize); 219 total_data_size, CIFSMaxBufSize);
220 return -EINVAL; 220 return -EINVAL;
221 } 221 }
222 return remaining; 222 return remaining;
@@ -320,27 +320,24 @@ requeue_echo:
320} 320}
321 321
322static bool 322static bool
323allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, 323allocate_buffers(struct TCP_Server_Info *server)
324 bool is_large_buf)
325{ 324{
326 char *bbuf = *bigbuf, *sbuf = *smallbuf; 325 if (!server->bigbuf) {
327 326 server->bigbuf = (char *)cifs_buf_get();
328 if (bbuf == NULL) { 327 if (!server->bigbuf) {
329 bbuf = (char *)cifs_buf_get();
330 if (!bbuf) {
331 cERROR(1, "No memory for large SMB response"); 328 cERROR(1, "No memory for large SMB response");
332 msleep(3000); 329 msleep(3000);
333 /* retry will check if exiting */ 330 /* retry will check if exiting */
334 return false; 331 return false;
335 } 332 }
336 } else if (is_large_buf) { 333 } else if (server->large_buf) {
337 /* we are reusing a dirty large buf, clear its start */ 334 /* we are reusing a dirty large buf, clear its start */
338 memset(bbuf, 0, size); 335 memset(server->bigbuf, 0, sizeof(struct smb_hdr));
339 } 336 }
340 337
341 if (sbuf == NULL) { 338 if (!server->smallbuf) {
342 sbuf = (char *)cifs_small_buf_get(); 339 server->smallbuf = (char *)cifs_small_buf_get();
343 if (!sbuf) { 340 if (!server->smallbuf) {
344 cERROR(1, "No memory for SMB response"); 341 cERROR(1, "No memory for SMB response");
345 msleep(1000); 342 msleep(1000);
346 /* retry will check if exiting */ 343 /* retry will check if exiting */
@@ -349,36 +346,116 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
349 /* beginning of smb buffer is cleared in our buf_get */ 346 /* beginning of smb buffer is cleared in our buf_get */
350 } else { 347 } else {
351 /* if existing small buf clear beginning */ 348 /* if existing small buf clear beginning */
352 memset(sbuf, 0, size); 349 memset(server->smallbuf, 0, sizeof(struct smb_hdr));
353 } 350 }
354 351
355 *bigbuf = bbuf;
356 *smallbuf = sbuf;
357
358 return true; 352 return true;
359} 353}
360 354
361static int 355static bool
362read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, 356server_unresponsive(struct TCP_Server_Info *server)
363 struct kvec *iov, unsigned int to_read, 357{
364 unsigned int *ptotal_read, bool is_header_read) 358 if (echo_retries > 0 && server->tcpStatus == CifsGood &&
359 time_after(jiffies, server->lstrp +
360 (echo_retries * SMB_ECHO_INTERVAL))) {
361 cERROR(1, "Server %s has not responded in %d seconds. "
362 "Reconnecting...", server->hostname,
363 (echo_retries * SMB_ECHO_INTERVAL / HZ));
364 cifs_reconnect(server);
365 wake_up(&server->response_q);
366 return true;
367 }
368
369 return false;
370}
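
For a sense of scale: with the module defaults of that era (echo_retries = 5 and SMB_ECHO_INTERVAL = 60 * HZ, both of which may differ by version or tuning), the check above forces a reconnect after roughly 5 * 60 = 300 seconds without any traffic from the server.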
371
372/*
373 * kvec_array_init - clone a kvec array, and advance into it
374 * @new: pointer to memory for cloned array
375 * @iov: pointer to original array
376 * @nr_segs: number of members in original array
377 * @bytes: number of bytes to advance into the cloned array
378 *
379 * This function will copy the array provided in iov to a section of memory
380 * and advance the specified number of bytes into the new array. It returns
381 * the number of segments in the new array. "new" must be at least as big as
382 * the original iov array.
383 */
384static unsigned int
385kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs,
386 size_t bytes)
387{
388 size_t base = 0;
389
390 while (bytes || !iov->iov_len) {
391 int copy = min(bytes, iov->iov_len);
392
393 bytes -= copy;
394 base += copy;
395 if (iov->iov_len == base) {
396 iov++;
397 nr_segs--;
398 base = 0;
399 }
400 }
401 memcpy(new, iov, sizeof(*iov) * nr_segs);
402 new->iov_base += base;
403 new->iov_len -= base;
404 return nr_segs;
405}
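
A hypothetical call, with made-up segment sizes, showing how the clone-and-advance behaves after a short receive:

    /*
     * iov_orig[0] = { .iov_base = hdr,  .iov_len = 4   };
     * iov_orig[1] = { .iov_base = body, .iov_len = 100 };
     *
     * segs = kvec_array_init(new, iov_orig, 2, 5);
     *
     * Result: segs == 1 and new[0] = { body + 1, 99 }: the first segment
     * was consumed entirely plus one byte of the second, so a subsequent
     * kernel_recvmsg() resumes exactly where the last one left off.
     */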
406
407static struct kvec *
408get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs)
409{
410 struct kvec *new_iov;
411
412 if (server->iov && nr_segs <= server->nr_iov)
413 return server->iov;
414
415 /* not big enough -- allocate a new one and release the old */
416 new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS);
417 if (new_iov) {
418 kfree(server->iov);
419 server->iov = new_iov;
420 server->nr_iov = nr_segs;
421 }
422 return new_iov;
423}
424
425int
426cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
427 unsigned int nr_segs, unsigned int to_read)
365{ 428{
366 int length, rc = 0; 429 int length = 0;
367 unsigned int total_read; 430 int total_read;
368 char *buf = iov->iov_base; 431 unsigned int segs;
432 struct msghdr smb_msg;
433 struct kvec *iov;
434
435 iov = get_server_iovec(server, nr_segs);
436 if (!iov)
437 return -ENOMEM;
438
439 smb_msg.msg_control = NULL;
440 smb_msg.msg_controllen = 0;
441
442 for (total_read = 0; to_read; total_read += length, to_read -= length) {
443 if (server_unresponsive(server)) {
444 total_read = -EAGAIN;
445 break;
446 }
447
448 segs = kvec_array_init(iov, iov_orig, nr_segs, total_read);
449
450 length = kernel_recvmsg(server->ssocket, &smb_msg,
451 iov, segs, to_read, 0);
369 452
370 for (total_read = 0; total_read < to_read; total_read += length) {
371 length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
372 to_read - total_read, 0);
373 if (server->tcpStatus == CifsExiting) { 453 if (server->tcpStatus == CifsExiting) {
374 /* then will exit */ 454 total_read = -ESHUTDOWN;
375 rc = 2;
376 break; 455 break;
377 } else if (server->tcpStatus == CifsNeedReconnect) { 456 } else if (server->tcpStatus == CifsNeedReconnect) {
378 cifs_reconnect(server); 457 cifs_reconnect(server);
379 /* Reconnect wakes up rspns q */ 458 total_read = -EAGAIN;
380 /* Now we will reread sock */
381 rc = 1;
382 break; 459 break;
383 } else if (length == -ERESTARTSYS || 460 } else if (length == -ERESTARTSYS ||
384 length == -EAGAIN || 461 length == -EAGAIN ||
@@ -390,56 +467,54 @@ read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
390 */ 467 */
391 usleep_range(1000, 2000); 468 usleep_range(1000, 2000);
392 length = 0; 469 length = 0;
393 if (!is_header_read) 470 continue;
394 continue;
395 /* Special handling for header read */
396 if (total_read) {
397 iov->iov_base = (to_read - total_read) +
398 buf;
399 iov->iov_len = to_read - total_read;
400 smb_msg->msg_control = NULL;
401 smb_msg->msg_controllen = 0;
402 rc = 3;
403 } else
404 rc = 1;
405 break;
406 } else if (length <= 0) { 471 } else if (length <= 0) {
407 cERROR(1, "Received no data, expecting %d", 472 cFYI(1, "Received no data or error: expecting %d "
408 to_read - total_read); 473 "got %d", to_read, length);
409 cifs_reconnect(server); 474 cifs_reconnect(server);
410 rc = 1; 475 total_read = -EAGAIN;
411 break; 476 break;
412 } 477 }
413 } 478 }
479 return total_read;
480}
414 481
415 *ptotal_read = total_read; 482int
416 return rc; 483cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
484 unsigned int to_read)
485{
486 struct kvec iov;
487
488 iov.iov_base = buf;
489 iov.iov_len = to_read;
490
491 return cifs_readv_from_socket(server, &iov, 1, to_read);
417} 492}
418 493
419static bool 494static bool
420check_rfc1002_header(struct TCP_Server_Info *server, char *buf) 495is_smb_response(struct TCP_Server_Info *server, unsigned char type)
421{ 496{
422 char temp = *buf;
423 unsigned int pdu_length = be32_to_cpu(
424 ((struct smb_hdr *)buf)->smb_buf_length);
425
426	 /* 497
427	 * The first byte of the length field, big endian on 498
428	 * the wire, is not part of the length at all but the 499
429	 * frame type, with the most common value, zero, meaning 500
430	 * regular data. */ 501
431 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { 502 switch (type) {
432 return false; 503 case RFC1002_SESSION_MESSAGE:
433 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { 504 /* Regular SMB response */
434 cFYI(1, "Good RFC 1002 session rsp"); 505 return true;
435 return false; 506 case RFC1002_SESSION_KEEP_ALIVE:
436 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { 507 cFYI(1, "RFC 1002 session keep alive");
508 break;
509 case RFC1002_POSITIVE_SESSION_RESPONSE:
510 cFYI(1, "RFC 1002 positive session response");
511 break;
512 case RFC1002_NEGATIVE_SESSION_RESPONSE:
437 /* 513 /*
438 * We get this from Windows 98 instead of an error on 514 * We get this from Windows 98 instead of an error on
439 * SMB negprot response. 515 * SMB negprot response.
440 */ 516 */
441 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", 517 cFYI(1, "RFC 1002 negative session response");
442 pdu_length);
443 /* give server a second to clean up */ 518 /* give server a second to clean up */
444 msleep(1000); 519 msleep(1000);
445 /* 520 /*
@@ -448,87 +523,89 @@ check_rfc1002_header(struct TCP_Server_Info *server, char *buf)
448 * is since we do not begin with RFC1001 session 523 * is since we do not begin with RFC1001 session
449 * initialize frame). 524 * initialize frame).
450 */ 525 */
451 cifs_set_port((struct sockaddr *) 526 cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
452 &server->dstaddr, CIFS_PORT);
453 cifs_reconnect(server); 527 cifs_reconnect(server);
454 wake_up(&server->response_q); 528 wake_up(&server->response_q);
455 return false; 529 break;
456 } else if (temp != (char) 0) { 530 default:
457 cERROR(1, "Unknown RFC 1002 frame"); 531 cERROR(1, "RFC 1002 unknown response type 0x%x", type);
458 cifs_dump_mem(" Received Data: ", buf, 4);
459 cifs_reconnect(server);
460 return false;
461 }
462
463 /* else we have an SMB response */
464 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
465 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
466 cERROR(1, "Invalid size SMB length %d pdu_length %d",
467 4, pdu_length+4);
468 cifs_reconnect(server); 532 cifs_reconnect(server);
469 wake_up(&server->response_q);
470 return false;
471 } 533 }
472 534
473 return true; 535 return false;
474} 536}
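
(For reference, the RFC 1002 session packet types switched on here are single-byte values: 0x00 session message, 0x81 session request, 0x82 positive session response, 0x83 negative session response, and 0x85 session keep-alive.)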
475 537
476static struct mid_q_entry * 538static struct mid_q_entry *
477find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, 539find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf)
478 int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf)
479{ 540{
480 struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL; 541 struct mid_q_entry *mid;
481 542
482 spin_lock(&GlobalMid_Lock); 543 spin_lock(&GlobalMid_Lock);
483 list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) { 544 list_for_each_entry(mid, &server->pending_mid_q, qhead) {
484 if (mid->mid != buf->Mid || 545 if (mid->mid == buf->Mid &&
485 mid->midState != MID_REQUEST_SUBMITTED || 546 mid->midState == MID_REQUEST_SUBMITTED &&
486 mid->command != buf->Command) 547 mid->command == buf->Command) {
487 continue; 548 spin_unlock(&GlobalMid_Lock);
488 549 return mid;
489 if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) {
490 /* We have a multipart transact2 resp */
491 *is_multi_rsp = true;
492 if (mid->resp_buf) {
493 /* merge response - fix up 1st*/
494 *length = coalesce_t2(buf, mid->resp_buf);
495 if (*length > 0) {
496 *length = 0;
497 mid->multiRsp = true;
498 break;
499 }
500 /* All parts received or packet is malformed. */
501 mid->multiEnd = true;
502 goto multi_t2_fnd;
503 }
504 if (!is_large_buf) {
505 /*FIXME: switch to already allocated largebuf?*/
506 cERROR(1, "1st trans2 resp needs bigbuf");
507 } else {
508 /* Have first buffer */
509 mid->resp_buf = buf;
510 mid->largeBuf = true;
511 *bigbuf = NULL;
512 }
513 break;
514 } 550 }
515 mid->resp_buf = buf; 551 }
516 mid->largeBuf = is_large_buf; 552 spin_unlock(&GlobalMid_Lock);
517multi_t2_fnd: 553 return NULL;
518 if (*length == 0) 554}
519 mid->midState = MID_RESPONSE_RECEIVED; 555
520 else 556void
521 mid->midState = MID_RESPONSE_MALFORMED; 557dequeue_mid(struct mid_q_entry *mid, bool malformed)
558{
522#ifdef CONFIG_CIFS_STATS2 559#ifdef CONFIG_CIFS_STATS2
523 mid->when_received = jiffies; 560 mid->when_received = jiffies;
524#endif 561#endif
525 list_del_init(&mid->qhead); 562 spin_lock(&GlobalMid_Lock);
526 ret = mid; 563 if (!malformed)
527 break; 564 mid->midState = MID_RESPONSE_RECEIVED;
528 } 565 else
566 mid->midState = MID_RESPONSE_MALFORMED;
567 list_del_init(&mid->qhead);
529 spin_unlock(&GlobalMid_Lock); 568 spin_unlock(&GlobalMid_Lock);
569}
530 570
531 return ret; 571static void
572handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
573 struct smb_hdr *buf, int malformed)
574{
575 if (malformed == 0 && check2ndT2(buf) > 0) {
576 mid->multiRsp = true;
577 if (mid->resp_buf) {
578 /* merge response - fix up 1st*/
579 malformed = coalesce_t2(buf, mid->resp_buf);
580 if (malformed > 0)
581 return;
582
583 /* All parts received or packet is malformed. */
584 mid->multiEnd = true;
585 return dequeue_mid(mid, malformed);
586 }
587 if (!server->large_buf) {
588 /*FIXME: switch to already allocated largebuf?*/
589 cERROR(1, "1st trans2 resp needs bigbuf");
590 } else {
591 /* Have first buffer */
592 mid->resp_buf = buf;
593 mid->largeBuf = true;
594 server->bigbuf = NULL;
595 }
596 return;
597 }
598 mid->resp_buf = buf;
599 mid->largeBuf = server->large_buf;
600 /* Was previous buf put in mpx struct for multi-rsp? */
601 if (!mid->multiRsp) {
602 /* smb buffer will be freed by user thread */
603 if (server->large_buf)
604 server->bigbuf = NULL;
605 else
606 server->smallbuf = NULL;
607 }
608 dequeue_mid(mid, malformed);
532} 609}
533 610
534static void clean_demultiplex_info(struct TCP_Server_Info *server) 611static void clean_demultiplex_info(struct TCP_Server_Info *server)
@@ -618,6 +695,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
618 } 695 }
619 696
620 kfree(server->hostname); 697 kfree(server->hostname);
698 kfree(server->iov);
621 kfree(server); 699 kfree(server);
622 700
623 length = atomic_dec_return(&tcpSesAllocCount); 701 length = atomic_dec_return(&tcpSesAllocCount);
@@ -627,20 +705,70 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
627} 705}
628 706
629static int 707static int
708standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
709{
710 int length;
711 char *buf = server->smallbuf;
712 struct smb_hdr *smb_buffer = (struct smb_hdr *)buf;
713 unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
714
715 /* make sure this will fit in a large buffer */
716 if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
717 cERROR(1, "SMB response too long (%u bytes)",
718 pdu_length);
719 cifs_reconnect(server);
720 wake_up(&server->response_q);
721 return -EAGAIN;
722 }
723
724 /* switch to large buffer if too big for a small one */
725 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
726 server->large_buf = true;
727 memcpy(server->bigbuf, server->smallbuf, server->total_read);
728 buf = server->bigbuf;
729 smb_buffer = (struct smb_hdr *)buf;
730 }
731
732 /* now read the rest */
733 length = cifs_read_from_socket(server,
734 buf + sizeof(struct smb_hdr) - 1,
735 pdu_length - sizeof(struct smb_hdr) + 1 + 4);
736 if (length < 0)
737 return length;
738 server->total_read += length;
739
740 dump_smb(smb_buffer, server->total_read);
741
742 /*
743 * We know that we received enough to get to the MID as we
744 * checked the pdu_length earlier. Now check to see
745 * if the rest of the header is OK. We borrow the length
746 * var for the rest of the function to avoid a new stack var.
747 *
748 * 48 bytes is enough to display the header and a little bit
749 * into the payload for debugging purposes.
750 */
751 length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read);
752 if (length != 0)
753 cifs_dump_mem("Bad SMB: ", buf,
754 min_t(unsigned int, server->total_read, 48));
755
756 if (mid)
757 handle_mid(mid, server, smb_buffer, length);
758
759 return length;
760}
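
To put numbers on the two thresholds above (assuming the usual defaults of MAX_CIFS_SMALL_BUFFER_SIZE = 448 and CIFSMaxBufSize = 16384, both tunable): a PDU of up to 444 bytes stays in the small buffer, anything larger is copied into the big buffer, and a PDU over 16384 + MAX_CIFS_HDR_SIZE - 4 is treated as bogus and triggers a reconnect.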
761
762static int
630cifs_demultiplex_thread(void *p) 763cifs_demultiplex_thread(void *p)
631{ 764{
632 int length; 765 int length;
633 struct TCP_Server_Info *server = p; 766 struct TCP_Server_Info *server = p;
634 unsigned int pdu_length, total_read; 767 unsigned int pdu_length;
635 char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; 768 char *buf = NULL;
636 struct smb_hdr *smb_buffer = NULL; 769 struct smb_hdr *smb_buffer = NULL;
637 struct msghdr smb_msg;
638 struct kvec iov;
639 struct task_struct *task_to_wake = NULL; 770 struct task_struct *task_to_wake = NULL;
640 struct mid_q_entry *mid_entry; 771 struct mid_q_entry *mid_entry;
641 bool isLargeBuf = false;
642 bool isMultiRsp = false;
643 int rc;
644 772
645 current->flags |= PF_MEMALLOC; 773 current->flags |= PF_MEMALLOC;
646 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); 774 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
@@ -655,111 +783,65 @@ cifs_demultiplex_thread(void *p)
655 if (try_to_freeze()) 783 if (try_to_freeze())
656 continue; 784 continue;
657 785
658 if (!allocate_buffers(&bigbuf, &smallbuf, 786 if (!allocate_buffers(server))
659 sizeof(struct smb_hdr), isLargeBuf))
660 continue; 787 continue;
661 788
662 isLargeBuf = false; 789 server->large_buf = false;
663 isMultiRsp = false; 790 smb_buffer = (struct smb_hdr *)server->smallbuf;
664 smb_buffer = (struct smb_hdr *)smallbuf; 791 buf = server->smallbuf;
665 buf = smallbuf;
666 iov.iov_base = buf;
667 iov.iov_len = 4;
668 smb_msg.msg_control = NULL;
669 smb_msg.msg_controllen = 0;
670 pdu_length = 4; /* enough to get RFC1001 header */ 792 pdu_length = 4; /* enough to get RFC1001 header */
671 793
672incomplete_rcv: 794 length = cifs_read_from_socket(server, buf, pdu_length);
673 if (echo_retries > 0 && server->tcpStatus == CifsGood && 795 if (length < 0)
674 time_after(jiffies, server->lstrp +
675 (echo_retries * SMB_ECHO_INTERVAL))) {
676 cERROR(1, "Server %s has not responded in %d seconds. "
677 "Reconnecting...", server->hostname,
678 (echo_retries * SMB_ECHO_INTERVAL / HZ));
679 cifs_reconnect(server);
680 wake_up(&server->response_q);
681 continue;
682 }
683
684 rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
685 &total_read, true /* header read */);
686 if (rc == 3)
687 goto incomplete_rcv;
688 else if (rc == 2)
689 break;
690 else if (rc == 1)
691 continue; 796 continue;
797 server->total_read = length;
692 798
693 /* 799 /*
694 * The right amount was read from socket - 4 bytes, 800 * The right amount was read from socket - 4 bytes,
695 * so we can now interpret the length field. 801 * so we can now interpret the length field.
696 */ 802 */
697
698 /*
699 * Note that RFC 1001 length is big endian on the wire,
700 * but we convert it here so it is always manipulated
701 * as host byte order.
702 */
703 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); 803 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
704 804
705 cFYI(1, "rfc1002 length 0x%x", pdu_length+4); 805 cFYI(1, "RFC1002 header 0x%x", pdu_length);
706 if (!check_rfc1002_header(server, buf)) 806 if (!is_smb_response(server, buf[0]))
707 continue; 807 continue;
708 808
709 /* else length ok */ 809 /* make sure we have enough to get to the MID */
710 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { 810 if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) {
711 isLargeBuf = true; 811 cERROR(1, "SMB response too short (%u bytes)",
712 memcpy(bigbuf, smallbuf, 4); 812 pdu_length);
713 smb_buffer = (struct smb_hdr *)bigbuf; 813 cifs_reconnect(server);
714 buf = bigbuf; 814 wake_up(&server->response_q);
815 continue;
715 } 816 }
716 817
717 iov.iov_base = 4 + buf; 818 /* read down to the MID */
718 iov.iov_len = pdu_length; 819 length = cifs_read_from_socket(server, buf + 4,
719 rc = read_from_socket(server, &smb_msg, &iov, pdu_length, 820 sizeof(struct smb_hdr) - 1 - 4);
720 &total_read, false); 821 if (length < 0)
721 if (rc == 2)
722 break;
723 else if (rc == 1)
724 continue; 822 continue;
823 server->total_read += length;
725 824
726 total_read += 4; /* account for rfc1002 hdr */ 825 mid_entry = find_mid(server, smb_buffer);
727 826
728 dump_smb(smb_buffer, total_read); 827 if (!mid_entry || !mid_entry->receive)
828 length = standard_receive3(server, mid_entry);
829 else
830 length = mid_entry->receive(server, mid_entry);
729 831
730 /* 832 if (length < 0)
731 * We know that we received enough to get to the MID as we 833 continue;
732 * checked the pdu_length earlier. Now check to see
733 * if the rest of the header is OK. We borrow the length
734 * var for the rest of the loop to avoid a new stack var.
735 *
736 * 48 bytes is enough to display the header and a little bit
737 * into the payload for debugging purposes.
738 */
739 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
740 if (length != 0)
741 cifs_dump_mem("Bad SMB: ", buf,
742 min_t(unsigned int, total_read, 48));
743 834
744 server->lstrp = jiffies; 835 if (server->large_buf) {
836 buf = server->bigbuf;
837 smb_buffer = (struct smb_hdr *)buf;
838 }
745 839
746 mid_entry = find_cifs_mid(server, smb_buffer, &length, 840 server->lstrp = jiffies;
747 isLargeBuf, &isMultiRsp, &bigbuf);
748 if (mid_entry != NULL) { 841 if (mid_entry != NULL) {
749 mid_entry->callback(mid_entry); 842 if (!mid_entry->multiRsp || mid_entry->multiEnd)
750 /* Was previous buf put in mpx struct for multi-rsp? */ 843 mid_entry->callback(mid_entry);
751 if (!isMultiRsp) { 844 } else if (!is_valid_oplock_break(smb_buffer, server)) {
752 /* smb buffer will be freed by user thread */
753 if (isLargeBuf)
754 bigbuf = NULL;
755 else
756 smallbuf = NULL;
757 }
758 } else if (length != 0) {
759 /* response sanity checks failed */
760 continue;
761 } else if (!is_valid_oplock_break(smb_buffer, server) &&
762 !isMultiRsp) {
763 cERROR(1, "No task to wake, unknown frame received! " 845 cERROR(1, "No task to wake, unknown frame received! "
764 "NumMids %d", atomic_read(&midCount)); 846 "NumMids %d", atomic_read(&midCount));
765 cifs_dump_mem("Received Data is: ", buf, 847 cifs_dump_mem("Received Data is: ", buf,
@@ -773,9 +855,9 @@ incomplete_rcv:
773 } /* end while !EXITING */ 855 } /* end while !EXITING */
774 856
775 /* buffer usually freed in free_mid - need to free it here on exit */ 857 /* buffer usually freed in free_mid - need to free it here on exit */
776 cifs_buf_release(bigbuf); 858 cifs_buf_release(server->bigbuf);
777 if (smallbuf) /* no sense logging a debug message if NULL */ 859 if (server->smallbuf) /* no sense logging a debug message if NULL */
778 cifs_small_buf_release(smallbuf); 860 cifs_small_buf_release(server->smallbuf);
779 861
780 task_to_wake = xchg(&server->tsk, NULL); 862 task_to_wake = xchg(&server->tsk, NULL);
781 clean_demultiplex_info(server); 863 clean_demultiplex_info(server);
@@ -827,6 +909,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
827{ 909{
828 char *value, *data, *end; 910 char *value, *data, *end;
829 char *mountdata_copy = NULL, *options; 911 char *mountdata_copy = NULL, *options;
912 int err;
830 unsigned int temp_len, i, j; 913 unsigned int temp_len, i, j;
831 char separator[2]; 914 char separator[2];
832 short int override_uid = -1; 915 short int override_uid = -1;
@@ -883,6 +966,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
883 cFYI(1, "Null separator not allowed"); 966 cFYI(1, "Null separator not allowed");
884 } 967 }
885 } 968 }
969 vol->backupuid_specified = false; /* no backup intent for a user */
970 vol->backupgid_specified = false; /* no backup intent for a group */
886 971
887 while ((data = strsep(&options, separator)) != NULL) { 972 while ((data = strsep(&options, separator)) != NULL) {
888 if (!*data) 973 if (!*data)
@@ -1298,7 +1383,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1298 /* ignore */ 1383 /* ignore */
1299 } else if (strnicmp(data, "guest", 5) == 0) { 1384 } else if (strnicmp(data, "guest", 5) == 0) {
1300 /* ignore */ 1385 /* ignore */
1301 } else if (strnicmp(data, "rw", 2) == 0) { 1386 } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) {
1302 /* ignore */ 1387 /* ignore */
1303 } else if (strnicmp(data, "ro", 2) == 0) { 1388 } else if (strnicmp(data, "ro", 2) == 0) {
1304 /* ignore */ 1389 /* ignore */
@@ -1401,7 +1486,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1401 vol->server_ino = 1; 1486 vol->server_ino = 1;
1402 } else if (strnicmp(data, "noserverino", 9) == 0) { 1487 } else if (strnicmp(data, "noserverino", 9) == 0) {
1403 vol->server_ino = 0; 1488 vol->server_ino = 0;
1404 } else if (strnicmp(data, "rwpidforward", 4) == 0) { 1489 } else if (strnicmp(data, "rwpidforward", 12) == 0) {
1405 vol->rwpidforward = 1; 1490 vol->rwpidforward = 1;
1406 } else if (strnicmp(data, "cifsacl", 7) == 0) { 1491 } else if (strnicmp(data, "cifsacl", 7) == 0) {
1407 vol->cifs_acl = 1; 1492 vol->cifs_acl = 1;
@@ -1442,6 +1527,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1442 vol->mfsymlinks = true; 1527 vol->mfsymlinks = true;
1443 } else if (strnicmp(data, "multiuser", 8) == 0) { 1528 } else if (strnicmp(data, "multiuser", 8) == 0) {
1444 vol->multiuser = true; 1529 vol->multiuser = true;
1530 } else if (!strnicmp(data, "backupuid", 9) && value && *value) {
1531 err = kstrtouint(value, 0, &vol->backupuid);
1532 if (err < 0) {
1533 cERROR(1, "%s: Invalid backupuid value",
1534 __func__);
1535 goto cifs_parse_mount_err;
1536 }
1537 vol->backupuid_specified = true;
1538 } else if (!strnicmp(data, "backupgid", 9) && value && *value) {
1539 err = kstrtouint(value, 0, &vol->backupgid);
1540 if (err < 0) {
1541 cERROR(1, "%s: Invalid backupgid value",
1542 __func__);
1543 goto cifs_parse_mount_err;
1544 }
1545 vol->backupgid_specified = true;
1445 } else 1546 } else
1446 printk(KERN_WARNING "CIFS: Unknown mount option %s\n", 1547 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
1447 data); 1548 data);
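
In practice the new options would be exercised with something like mount -t cifs //server/share /mnt -o user=admin,backupuid=0,backupgid=0 (a made-up invocation), after which backup_cred() lets opens by the matching uid/gid request CREATE_OPEN_BACKUP_INTENT.
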
@@ -2018,7 +2119,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2018 warned_on_ntlm = true; 2119 warned_on_ntlm = true;
2019 cERROR(1, "default security mechanism requested. The default " 2120 cERROR(1, "default security mechanism requested. The default "
2020 "security mechanism will be upgraded from ntlm to " 2121 "security mechanism will be upgraded from ntlm to "
2021 "ntlmv2 in kernel release 3.1"); 2122 "ntlmv2 in kernel release 3.2");
2022 } 2123 }
2023 ses->overrideSecFlg = volume_info->secFlg; 2124 ses->overrideSecFlg = volume_info->secFlg;
2024 2125
@@ -2209,16 +2310,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
2209 (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) 2310 (new->mnt_cifs_flags & CIFS_MOUNT_MASK))
2210 return 0; 2311 return 0;
2211 2312
2212 if (old->rsize != new->rsize)
2213 return 0;
2214
2215 /* 2313 /*
2216 * We want to share sb only if we don't specify wsize or specified wsize 2314 * We want to share sb only if we don't specify an r/wsize or
2217 * is greater or equal than existing one. 2315 * specified r/wsize is greater than or equal to existing one.
2218 */ 2316 */
2219 if (new->wsize && new->wsize < old->wsize) 2317 if (new->wsize && new->wsize < old->wsize)
2220 return 0; 2318 return 0;
2221 2319
2320 if (new->rsize && new->rsize < old->rsize)
2321 return 0;
2322
2222 if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) 2323 if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid)
2223 return 0; 2324 return 0;
2224 2325
@@ -2656,14 +2757,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
2656 CIFS_MOUNT_POSIX_PATHS; 2757 CIFS_MOUNT_POSIX_PATHS;
2657 } 2758 }
2658 2759
2659 if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) {
2660 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
2661 cifs_sb->rsize = 127 * 1024;
2662 cFYI(DBG2, "larger reads not supported by srv");
2663 }
2664 }
2665
2666
2667 cFYI(1, "Negotiate caps 0x%x", (int)cap); 2760 cFYI(1, "Negotiate caps 0x%x", (int)cap);
2668#ifdef CONFIG_CIFS_DEBUG2 2761#ifdef CONFIG_CIFS_DEBUG2
2669 if (cap & CIFS_UNIX_FCNTL_CAP) 2762 if (cap & CIFS_UNIX_FCNTL_CAP)
@@ -2708,31 +2801,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
2708 spin_lock_init(&cifs_sb->tlink_tree_lock); 2801 spin_lock_init(&cifs_sb->tlink_tree_lock);
2709 cifs_sb->tlink_tree = RB_ROOT; 2802 cifs_sb->tlink_tree = RB_ROOT;
2710 2803
2711 if (pvolume_info->rsize > CIFSMaxBufSize) {
2712 cERROR(1, "rsize %d too large, using MaxBufSize",
2713 pvolume_info->rsize);
2714 cifs_sb->rsize = CIFSMaxBufSize;
2715 } else if ((pvolume_info->rsize) &&
2716 (pvolume_info->rsize <= CIFSMaxBufSize))
2717 cifs_sb->rsize = pvolume_info->rsize;
2718 else /* default */
2719 cifs_sb->rsize = CIFSMaxBufSize;
2720
2721 if (cifs_sb->rsize < 2048) {
2722 cifs_sb->rsize = 2048;
2723 /* Windows ME may prefer this */
2724 cFYI(1, "readsize set to minimum: 2048");
2725 }
2726
2727 /* 2804 /*
2728 * Temporarily set wsize for matching superblock. If we end up using 2805 * Temporarily set r/wsize for matching superblock. If we end up using
2729 * new sb then cifs_negotiate_wsize will later negotiate it downward 2806 * new sb then client will later negotiate it downward if needed.
2730 * if needed.
2731 */ 2807 */
2808 cifs_sb->rsize = pvolume_info->rsize;
2732 cifs_sb->wsize = pvolume_info->wsize; 2809 cifs_sb->wsize = pvolume_info->wsize;
2733 2810
2734 cifs_sb->mnt_uid = pvolume_info->linux_uid; 2811 cifs_sb->mnt_uid = pvolume_info->linux_uid;
2735 cifs_sb->mnt_gid = pvolume_info->linux_gid; 2812 cifs_sb->mnt_gid = pvolume_info->linux_gid;
2813 if (pvolume_info->backupuid_specified)
2814 cifs_sb->mnt_backupuid = pvolume_info->backupuid;
2815 if (pvolume_info->backupgid_specified)
2816 cifs_sb->mnt_backupgid = pvolume_info->backupgid;
2736 cifs_sb->mnt_file_mode = pvolume_info->file_mode; 2817 cifs_sb->mnt_file_mode = pvolume_info->file_mode;
2737 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; 2818 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
2738 cFYI(1, "file mode: 0x%x dir mode: 0x%x", 2819 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
@@ -2763,6 +2844,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
2763 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; 2844 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
2764 if (pvolume_info->cifs_acl) 2845 if (pvolume_info->cifs_acl)
2765 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; 2846 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
2847 if (pvolume_info->backupuid_specified)
2848 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID;
2849 if (pvolume_info->backupgid_specified)
2850 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID;
2766 if (pvolume_info->override_uid) 2851 if (pvolume_info->override_uid)
2767 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; 2852 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
2768 if (pvolume_info->override_gid) 2853 if (pvolume_info->override_gid)
@@ -2795,29 +2880,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
2795} 2880}
2796 2881
2797/* 2882/*
2798 * When the server supports very large writes via POSIX extensions, we can 2883 * When the server supports very large reads and writes via POSIX extensions,
2799 * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including 2884 * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
2800 * the RFC1001 length. 2885 * including the RFC1001 length.
2801 * 2886 *
2802 * Note that this might make for "interesting" allocation problems during 2887 * Note that this might make for "interesting" allocation problems during
2803 * writeback however as we have to allocate an array of pointers for the 2888 * writeback however as we have to allocate an array of pointers for the
2804 * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. 2889 * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
2890 *
2891 * For reads, there is a similar problem as we need to allocate an array
2892 * of kvecs to handle the receive, though that should only need to be done
2893 * once.
2805 */ 2894 */
2806#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) 2895#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
2896#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4)
2807 2897
2808/* 2898/*
2809 * When the server doesn't allow large posix writes, only allow a wsize of 2899 * When the server doesn't allow large posix writes, only allow a rsize/wsize
2810 * 128k minus the size of the WRITE_AND_X header. That allows for a write up 2900 * of 2^17-1 minus the size of the call header. That allows for a read or
2811 * to the maximum size described by RFC1002. 2901 * write up to the maximum size described by RFC1002.
2812 */ 2902 */
2813#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) 2903#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
2904#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
2814 2905
2815/* 2906/*
2816 * The default wsize is 1M. find_get_pages seems to return a maximum of 256 2907 * The default wsize is 1M. find_get_pages seems to return a maximum of 256
2817 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill 2908 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
2818 * a single wsize request with a single call. 2909 * a single wsize request with a single call.
2819 */ 2910 */
2820#define CIFS_DEFAULT_WSIZE (1024 * 1024) 2911#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
2912
2913/*
2914 * Windows only supports a max of 60k reads. Default to that when posix
2915 * extensions aren't in force.
2916 */
2917#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
2821 2918
2822static unsigned int 2919static unsigned int
2823cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) 2920cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
@@ -2825,7 +2922,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
2825 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); 2922 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
2826 struct TCP_Server_Info *server = tcon->ses->server; 2923 struct TCP_Server_Info *server = tcon->ses->server;
2827 unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : 2924 unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
2828 CIFS_DEFAULT_WSIZE; 2925 CIFS_DEFAULT_IOSIZE;
2829 2926
2830 /* can server support 24-bit write sizes? (via UNIX extensions) */ 2927 /* can server support 24-bit write sizes? (via UNIX extensions) */
2831 if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) 2928 if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -2848,6 +2945,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
2848 return wsize; 2945 return wsize;
2849} 2946}
2850 2947
2948static unsigned int
2949cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
2950{
2951 __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
2952 struct TCP_Server_Info *server = tcon->ses->server;
2953 unsigned int rsize, defsize;
2954
2955 /*
2956 * Set default value...
2957 *
2958 * HACK alert! Ancient servers have very small buffers. Even though
2959 * MS-CIFS indicates that servers are only limited by the client's
2960 * bufsize for reads, testing against win98se shows that it throws
2961 * INVALID_PARAMETER errors if you try to request too large a read.
2962 *
2963 * If the server advertises a MaxBufferSize of less than one page,
2964 * assume that it also can't satisfy reads larger than that either.
2965 *
2966 * FIXME: Is there a better heuristic for this?
2967 */
2968 if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
2969 defsize = CIFS_DEFAULT_IOSIZE;
2970 else if (server->capabilities & CAP_LARGE_READ_X)
2971 defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
2972 else if (server->maxBuf >= PAGE_CACHE_SIZE)
2973 defsize = CIFSMaxBufSize;
2974 else
2975 defsize = server->maxBuf - sizeof(READ_RSP);
2976
2977 rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize;
2978
2979 /*
2980 * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
2981 * the client's MaxBufferSize.
2982 */
2983 if (!(server->capabilities & CAP_LARGE_READ_X))
2984 rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
2985
2986 /* hard limit of CIFS_MAX_RSIZE */
2987 rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
2988
2989 return rsize;
2990}
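
Worked through with typical values: a server granting CIFS_UNIX_LARGE_READ_CAP defaults to the 1M CIFS_DEFAULT_IOSIZE; a Windows server with CAP_LARGE_READ_X but no POSIX extensions gets the 60k default; and an ancient server advertising, say, maxBuf = 2920 with neither capability ends up with defsize = 2920 - sizeof(READ_RSP). A user-supplied rsize is then clamped to CIFSMaxBufSize when CAP_LARGE_READ_X is absent, and always to the hard CIFS_MAX_RSIZE ceiling.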
2991
2851static int 2992static int
2852is_path_accessible(int xid, struct cifs_tcon *tcon, 2993is_path_accessible(int xid, struct cifs_tcon *tcon,
2853 struct cifs_sb_info *cifs_sb, const char *full_path) 2994 struct cifs_sb_info *cifs_sb, const char *full_path)
@@ -2877,8 +3018,9 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
2877{ 3018{
2878 kfree(volume_info->username); 3019 kfree(volume_info->username);
2879 kzfree(volume_info->password); 3020 kzfree(volume_info->password);
3021 if (volume_info->UNCip != volume_info->UNC + 2)
3022 kfree(volume_info->UNCip);
2880 kfree(volume_info->UNC); 3023 kfree(volume_info->UNC);
2881 kfree(volume_info->UNCip);
2882 kfree(volume_info->domainname); 3024 kfree(volume_info->domainname);
2883 kfree(volume_info->iocharset); 3025 kfree(volume_info->iocharset);
2884 kfree(volume_info->prepath); 3026 kfree(volume_info->prepath);
@@ -3040,6 +3182,22 @@ cifs_get_volume_info(char *mount_data, const char *devname)
3040 return volume_info; 3182 return volume_info;
3041} 3183}
3042 3184
3185/* make sure ra_pages is a multiple of rsize */
3186static inline unsigned int
3187cifs_ra_pages(struct cifs_sb_info *cifs_sb)
3188{
3189 unsigned int reads;
3190 unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
3191
3192 if (rsize_pages >= default_backing_dev_info.ra_pages)
3193 return default_backing_dev_info.ra_pages;
3194 else if (rsize_pages == 0)
3195 return rsize_pages;
3196
3197 reads = default_backing_dev_info.ra_pages / rsize_pages;
3198 return reads * rsize_pages;
3199}
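
For example, with PAGE_CACHE_SIZE = 4096, the Windows-default rsize of 61440 (15 pages) and the then-default readahead window of 32 pages: 32 / 15 = 2 whole requests, so ra_pages becomes 30, rounding readahead down to an exact multiple of rsize so each readahead pass maps onto whole read requests.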
3200
3043int 3201int
3044cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) 3202cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3045{ 3203{
@@ -3058,8 +3216,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3058 if (rc) 3216 if (rc)
3059 return rc; 3217 return rc;
3060 3218
3061 cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
3062
3063#ifdef CONFIG_CIFS_DFS_UPCALL 3219#ifdef CONFIG_CIFS_DFS_UPCALL
3064try_mount_again: 3220try_mount_again:
3065 /* cleanup activities if we're chasing a referral */ 3221 /* cleanup activities if we're chasing a referral */
@@ -3124,15 +3280,11 @@ try_mount_again:
3124 CIFSSMBQFSAttributeInfo(xid, tcon); 3280 CIFSSMBQFSAttributeInfo(xid, tcon);
3125 } 3281 }
3126 3282
3127 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
3128 cifs_sb->rsize = 1024 * 127;
3129 cFYI(DBG2, "no very large read support, rsize now 127K");
3130 }
3131 if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
3132 cifs_sb->rsize = min(cifs_sb->rsize,
3133 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
3134
3135 cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); 3283 cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info);
3284 cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info);
3285
3286 /* tune readahead according to rsize */
3287 cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb);
3136 3288
3137remote_path_check: 3289remote_path_check:
3138#ifdef CONFIG_CIFS_DFS_UPCALL 3290#ifdef CONFIG_CIFS_DFS_UPCALL
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index ae576fbb5142..d7eeb9d3ed6f 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -105,8 +105,8 @@ cifs_bp_rename_retry:
105 } 105 }
106 rcu_read_unlock(); 106 rcu_read_unlock();
107 if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { 107 if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
108 cERROR(1, "did not end path lookup where expected namelen is %d", 108 cFYI(1, "did not end path lookup where expected. namelen=%d "
109 namelen); 109 "dfsplen=%d", namelen, dfsplen);
110 /* presumably this is only possible if racing with a rename 110 /* presumably this is only possible if racing with a rename
111 of one of the parent directories (we can not lock the dentries 111 of one of the parent directories (we can not lock the dentries
112 above us to prevent this, but retrying should be harmless) */ 112 above us to prevent this, but retrying should be harmless) */
@@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
171 } 171 }
172 tcon = tlink_tcon(tlink); 172 tcon = tlink_tcon(tlink);
173 173
174 if (oplockEnabled) 174 if (enable_oplocks)
175 oplock = REQ_OPLOCK; 175 oplock = REQ_OPLOCK;
176 176
177 if (nd) 177 if (nd)
@@ -244,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
244 if (!tcon->unix_ext && (mode & S_IWUGO) == 0) 244 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
245 create_options |= CREATE_OPTION_READONLY; 245 create_options |= CREATE_OPTION_READONLY;
246 246
247 if (backup_cred(cifs_sb))
248 create_options |= CREATE_OPEN_BACKUP_INTENT;
249
247 if (tcon->ses->capabilities & CAP_NT_SMBS) 250 if (tcon->ses->capabilities & CAP_NT_SMBS)
248 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 251 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
249 desiredAccess, create_options, 252 desiredAccess, create_options,
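This two-line backup_cred pattern recurs in the hunks that follow (mknod, open, reopen, MF symlink creation): when the mount uses backup credentials, CREATE_OPEN_BACKUP_INTENT is OR'd into the open's create_options. The shape in isolation; the flag values here are assumptions for illustration:

#include <stdbool.h>

#define CREATE_NOT_DIR            0x0040  /* assumed values, illustration only */
#define CREATE_OPEN_BACKUP_INTENT 0x4000

static int open_create_options(bool backup_cred)
{
        int create_options = CREATE_NOT_DIR;

        if (backup_cred)        /* mount carries backup intent credentials */
                create_options |= CREATE_OPEN_BACKUP_INTENT;
        return create_options;
}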
@@ -357,6 +360,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
357{ 360{
358 int rc = -EPERM; 361 int rc = -EPERM;
359 int xid; 362 int xid;
363 int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
360 struct cifs_sb_info *cifs_sb; 364 struct cifs_sb_info *cifs_sb;
361 struct tcon_link *tlink; 365 struct tcon_link *tlink;
362 struct cifs_tcon *pTcon; 366 struct cifs_tcon *pTcon;
@@ -431,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
431 return rc; 435 return rc;
432 } 436 }
433 437
434 /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ 438 if (backup_cred(cifs_sb))
439 create_options |= CREATE_OPEN_BACKUP_INTENT;
440
435 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, 441 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
436 GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, 442 GENERIC_WRITE, create_options,
437 &fileHandle, &oplock, buf, cifs_sb->local_nls, 443 &fileHandle, &oplock, buf, cifs_sb->local_nls,
438 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 444 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
439 if (rc) 445 if (rc)
@@ -642,8 +648,16 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
642 if (direntry->d_inode) { 648 if (direntry->d_inode) {
643 if (cifs_revalidate_dentry(direntry)) 649 if (cifs_revalidate_dentry(direntry))
644 return 0; 650 return 0;
645 else 651 else {
652 /*
653 * Forcibly invalidate automounting directory inodes
 654 * (remote DFS directories) so that they are
 655 * instantiated again for automount
656 */
657 if (IS_AUTOMOUNT(direntry->d_inode))
658 return 0;
646 return 1; 659 return 1;
660 }
647 } 661 }
648 662
649 /* 663 /*
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 55d87ac52000..9c7ecdccf2f3 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -45,7 +45,7 @@
45#include "cifs_debug.h" 45#include "cifs_debug.h"
46#include "cifsfs.h" 46#include "cifsfs.h"
47 47
48#ifdef CIFS_NFSD_EXPORT 48#ifdef CONFIG_CIFS_NFSD_EXPORT
49static struct dentry *cifs_get_parent(struct dentry *dentry) 49static struct dentry *cifs_get_parent(struct dentry *dentry)
50{ 50{
51 /* BB need to add code here eventually to enable export via NFSD */ 51 /* BB need to add code here eventually to enable export via NFSD */
@@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = {
63 .encode_fs = */ 63 .encode_fs = */
64}; 64};
65 65
66#endif /* CIFS_NFSD_EXPORT */ 66#endif /* CONFIG_CIFS_NFSD_EXPORT */
67 67
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9f41a10523a1..ea096ce5d4f7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -32,6 +32,7 @@
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/swap.h>
35#include <asm/div64.h> 36#include <asm/div64.h>
36#include "cifsfs.h" 37#include "cifsfs.h"
37#include "cifspdu.h" 38#include "cifspdu.h"
@@ -174,6 +175,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
174 int rc; 175 int rc;
175 int desiredAccess; 176 int desiredAccess;
176 int disposition; 177 int disposition;
178 int create_options = CREATE_NOT_DIR;
177 FILE_ALL_INFO *buf; 179 FILE_ALL_INFO *buf;
178 180
179 desiredAccess = cifs_convert_flags(f_flags); 181 desiredAccess = cifs_convert_flags(f_flags);
@@ -210,9 +212,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
210 if (!buf) 212 if (!buf)
211 return -ENOMEM; 213 return -ENOMEM;
212 214
215 if (backup_cred(cifs_sb))
216 create_options |= CREATE_OPEN_BACKUP_INTENT;
217
213 if (tcon->ses->capabilities & CAP_NT_SMBS) 218 if (tcon->ses->capabilities & CAP_NT_SMBS)
214 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 219 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
215 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, 220 desiredAccess, create_options, pnetfid, poplock, buf,
216 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags 221 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
217 & CIFS_MOUNT_MAP_SPECIAL_CHR); 222 & CIFS_MOUNT_MAP_SPECIAL_CHR);
218 else 223 else
@@ -258,8 +263,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
258 pCifsFile->invalidHandle = false; 263 pCifsFile->invalidHandle = false;
259 pCifsFile->tlink = cifs_get_tlink(tlink); 264 pCifsFile->tlink = cifs_get_tlink(tlink);
260 mutex_init(&pCifsFile->fh_mutex); 265 mutex_init(&pCifsFile->fh_mutex);
261 mutex_init(&pCifsFile->lock_mutex);
262 INIT_LIST_HEAD(&pCifsFile->llist);
263 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); 266 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
264 267
265 spin_lock(&cifs_file_list_lock); 268 spin_lock(&cifs_file_list_lock);
@@ -272,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
272 spin_unlock(&cifs_file_list_lock); 275 spin_unlock(&cifs_file_list_lock);
273 276
274 cifs_set_oplock_level(pCifsInode, oplock); 277 cifs_set_oplock_level(pCifsInode, oplock);
278 pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll;
275 279
276 file->private_data = pCifsFile; 280 file->private_data = pCifsFile;
277 return pCifsFile; 281 return pCifsFile;
278} 282}
279 283
284static void cifs_del_lock_waiters(struct cifsLockInfo *lock);
285
280/* 286/*
281 * Release a reference on the file private data. This may involve closing 287 * Release a reference on the file private data. This may involve closing
282 * the filehandle out on the server. Must be called without holding 288 * the filehandle out on the server. Must be called without holding
@@ -327,12 +333,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
327 /* Delete any outstanding lock records. We'll lose them when the file 333 /* Delete any outstanding lock records. We'll lose them when the file
328 * is closed anyway. 334 * is closed anyway.
329 */ 335 */
330 mutex_lock(&cifs_file->lock_mutex); 336 mutex_lock(&cifsi->lock_mutex);
331 list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { 337 list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) {
338 if (li->netfid != cifs_file->netfid)
339 continue;
332 list_del(&li->llist); 340 list_del(&li->llist);
341 cifs_del_lock_waiters(li);
333 kfree(li); 342 kfree(li);
334 } 343 }
335 mutex_unlock(&cifs_file->lock_mutex); 344 mutex_unlock(&cifsi->lock_mutex);
336 345
337 cifs_put_tlink(cifs_file->tlink); 346 cifs_put_tlink(cifs_file->tlink);
338 dput(cifs_file->dentry); 347 dput(cifs_file->dentry);
@@ -371,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file)
371 cFYI(1, "inode = 0x%p file flags are 0x%x for %s", 380 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
372 inode, file->f_flags, full_path); 381 inode, file->f_flags, full_path);
373 382
374 if (oplockEnabled) 383 if (enable_oplocks)
375 oplock = REQ_OPLOCK; 384 oplock = REQ_OPLOCK;
376 else 385 else
377 oplock = 0; 386 oplock = 0;
@@ -465,6 +474,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
465 char *full_path = NULL; 474 char *full_path = NULL;
466 int desiredAccess; 475 int desiredAccess;
467 int disposition = FILE_OPEN; 476 int disposition = FILE_OPEN;
477 int create_options = CREATE_NOT_DIR;
468 __u16 netfid; 478 __u16 netfid;
469 479
470 xid = GetXid(); 480 xid = GetXid();
@@ -495,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
495 cFYI(1, "inode = 0x%p file flags 0x%x for %s", 505 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
496 inode, pCifsFile->f_flags, full_path); 506 inode, pCifsFile->f_flags, full_path);
497 507
498 if (oplockEnabled) 508 if (enable_oplocks)
499 oplock = REQ_OPLOCK; 509 oplock = REQ_OPLOCK;
500 else 510 else
501 oplock = 0; 511 oplock = 0;
@@ -524,6 +534,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
524 534
525 desiredAccess = cifs_convert_flags(pCifsFile->f_flags); 535 desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
526 536
537 if (backup_cred(cifs_sb))
538 create_options |= CREATE_OPEN_BACKUP_INTENT;
539
527 /* Can not refresh inode by passing in file_info buf to be returned 540 /* Can not refresh inode by passing in file_info buf to be returned
528 by SMBOpen and then calling get_inode_info with returned buf 541 by SMBOpen and then calling get_inode_info with returned buf
529 since file might have write behind data that needs to be flushed 542 since file might have write behind data that needs to be flushed
@@ -531,7 +544,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
531 that inode was not dirty locally we could do this */ 544 that inode was not dirty locally we could do this */
532 545
533 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, 546 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
534 CREATE_NOT_DIR, &netfid, &oplock, NULL, 547 create_options, &netfid, &oplock, NULL,
535 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 548 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
536 CIFS_MOUNT_MAP_SPECIAL_CHR); 549 CIFS_MOUNT_MAP_SPECIAL_CHR);
537 if (rc) { 550 if (rc) {
@@ -631,219 +644,687 @@ int cifs_closedir(struct inode *inode, struct file *file)
631 return rc; 644 return rc;
632} 645}
633 646
634static int store_file_lock(struct cifsFileInfo *fid, __u64 len, 647static struct cifsLockInfo *
635 __u64 offset, __u8 lockType) 648cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid)
636{ 649{
637 struct cifsLockInfo *li = 650 struct cifsLockInfo *li =
638 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); 651 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
639 if (li == NULL) 652 if (!li)
640 return -ENOMEM; 653 return li;
654 li->netfid = netfid;
641 li->offset = offset; 655 li->offset = offset;
642 li->length = len; 656 li->length = len;
643 li->type = lockType; 657 li->type = type;
644 mutex_lock(&fid->lock_mutex); 658 li->pid = current->tgid;
645 list_add(&li->llist, &fid->llist); 659 INIT_LIST_HEAD(&li->blist);
646 mutex_unlock(&fid->lock_mutex); 660 init_waitqueue_head(&li->block_q);
661 return li;
662}
663
664static void
665cifs_del_lock_waiters(struct cifsLockInfo *lock)
666{
667 struct cifsLockInfo *li, *tmp;
668 list_for_each_entry_safe(li, tmp, &lock->blist, blist) {
669 list_del_init(&li->blist);
670 wake_up(&li->block_q);
671 }
672}
673
674static bool
675cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
676 __u64 length, __u8 type, __u16 netfid,
677 struct cifsLockInfo **conf_lock)
678{
679 struct cifsLockInfo *li, *tmp;
680
681 list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
682 if (offset + length <= li->offset ||
683 offset >= li->offset + li->length)
684 continue;
685 else if ((type & LOCKING_ANDX_SHARED_LOCK) &&
686 ((netfid == li->netfid && current->tgid == li->pid) ||
687 type == li->type))
688 continue;
689 else {
690 *conf_lock = li;
691 return true;
692 }
693 }
694 return false;
695}
696
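cifs_find_lock_conflict skips a stored lock exactly when the two byte ranges are disjoint, i.e. one ends at or before the other begins; everything else is a candidate conflict unless the shared-lock and ownership exemptions apply. The overlap predicate on its own, with a small self-test (names assumed):

#include <assert.h>

typedef unsigned long long u64;

/* [o1, o1+l1) and [o2, o2+l2) overlap iff each starts before the other
 * ends -- the negation of the 'continue' test in the loop above. */
static int ranges_overlap(u64 o1, u64 l1, u64 o2, u64 l2)
{
        return o1 < o2 + l2 && o2 < o1 + l1;
}

int main(void)
{
        assert(!ranges_overlap(0, 10, 10, 5));  /* touching, not overlapping */
        assert(ranges_overlap(0, 10, 9, 5));    /* one byte shared */
        return 0;
}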
697static int
698cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
699 __u8 type, __u16 netfid, struct file_lock *flock)
700{
701 int rc = 0;
702 struct cifsLockInfo *conf_lock;
703 bool exist;
704
705 mutex_lock(&cinode->lock_mutex);
706
707 exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid,
708 &conf_lock);
709 if (exist) {
710 flock->fl_start = conf_lock->offset;
711 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
712 flock->fl_pid = conf_lock->pid;
713 if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK)
714 flock->fl_type = F_RDLCK;
715 else
716 flock->fl_type = F_WRLCK;
717 } else if (!cinode->can_cache_brlcks)
718 rc = 1;
719 else
720 flock->fl_type = F_UNLCK;
721
722 mutex_unlock(&cinode->lock_mutex);
723 return rc;
724}
725
726static int
727cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset,
728 __u8 type, __u16 netfid)
729{
730 struct cifsLockInfo *li;
731
732 li = cifs_lock_init(len, offset, type, netfid);
733 if (!li)
734 return -ENOMEM;
735
736 mutex_lock(&cinode->lock_mutex);
737 list_add_tail(&li->llist, &cinode->llist);
738 mutex_unlock(&cinode->lock_mutex);
647 return 0; 739 return 0;
648} 740}
649 741
650int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) 742static int
743cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
744 __u8 type, __u16 netfid, bool wait)
651{ 745{
652 int rc, xid; 746 struct cifsLockInfo *lock, *conf_lock;
653 __u32 numLock = 0; 747 bool exist;
654 __u32 numUnlock = 0; 748 int rc = 0;
655 __u64 length; 749
656 bool wait_flag = false; 750 lock = cifs_lock_init(length, offset, type, netfid);
657 struct cifs_sb_info *cifs_sb; 751 if (!lock)
752 return -ENOMEM;
753
754try_again:
755 exist = false;
756 mutex_lock(&cinode->lock_mutex);
757
758 exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid,
759 &conf_lock);
760 if (!exist && cinode->can_cache_brlcks) {
761 list_add_tail(&lock->llist, &cinode->llist);
762 mutex_unlock(&cinode->lock_mutex);
763 return rc;
764 }
765
766 if (!exist)
767 rc = 1;
768 else if (!wait)
769 rc = -EACCES;
770 else {
771 list_add_tail(&lock->blist, &conf_lock->blist);
772 mutex_unlock(&cinode->lock_mutex);
773 rc = wait_event_interruptible(lock->block_q,
774 (lock->blist.prev == &lock->blist) &&
775 (lock->blist.next == &lock->blist));
776 if (!rc)
777 goto try_again;
778 else {
779 mutex_lock(&cinode->lock_mutex);
780 list_del_init(&lock->blist);
781 mutex_unlock(&cinode->lock_mutex);
782 }
783 }
784
785 kfree(lock);
786 mutex_unlock(&cinode->lock_mutex);
787 return rc;
788}
789
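The wait_event_interruptible condition in cifs_lock_add_if -- blist.prev and blist.next both pointing back at blist -- is an open-coded list_empty(): the blocked waiter sleeps until cifs_del_lock_waiters unlinks it with list_del_init and wakes block_q, then jumps back to try_again to rescan for conflicts. The emptiness test reduced to userspace:

#include <assert.h>

struct list_head { struct list_head *next, *prev; };

static void init_node(struct list_head *n) { n->next = n->prev = n; }

/* The condition the waiter sleeps on: the node points at itself. */
static int node_unlinked(const struct list_head *n)
{
        return n->prev == n && n->next == n;
}

int main(void)
{
        struct list_head waiter;
        init_node(&waiter);
        assert(node_unlinked(&waiter)); /* state after list_del_init() */
        return 0;
}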
790static int
791cifs_posix_lock_test(struct file *file, struct file_lock *flock)
792{
793 int rc = 0;
794 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
795 unsigned char saved_type = flock->fl_type;
796
797 mutex_lock(&cinode->lock_mutex);
798 posix_test_lock(file, flock);
799
800 if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
801 flock->fl_type = saved_type;
802 rc = 1;
803 }
804
805 mutex_unlock(&cinode->lock_mutex);
806 return rc;
807}
808
809static int
810cifs_posix_lock_set(struct file *file, struct file_lock *flock)
811{
812 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
813 int rc;
814
815 mutex_lock(&cinode->lock_mutex);
816 if (!cinode->can_cache_brlcks) {
817 mutex_unlock(&cinode->lock_mutex);
818 return 1;
819 }
820 rc = posix_lock_file_wait(file, flock);
821 mutex_unlock(&cinode->lock_mutex);
822 return rc;
823}
824
825static int
826cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
827{
828 int xid, rc = 0, stored_rc;
829 struct cifsLockInfo *li, *tmp;
658 struct cifs_tcon *tcon; 830 struct cifs_tcon *tcon;
659 __u16 netfid; 831 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
660 __u8 lockType = LOCKING_ANDX_LARGE_FILES; 832 unsigned int num, max_num;
661 bool posix_locking = 0; 833 LOCKING_ANDX_RANGE *buf, *cur;
834 int types[] = {LOCKING_ANDX_LARGE_FILES,
835 LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
836 int i;
837
838 xid = GetXid();
839 tcon = tlink_tcon(cfile->tlink);
840
841 mutex_lock(&cinode->lock_mutex);
842 if (!cinode->can_cache_brlcks) {
843 mutex_unlock(&cinode->lock_mutex);
844 FreeXid(xid);
845 return rc;
846 }
847
848 max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
849 sizeof(LOCKING_ANDX_RANGE);
850 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
851 if (!buf) {
852 mutex_unlock(&cinode->lock_mutex);
853 FreeXid(xid);
854 return rc;
855 }
856
857 for (i = 0; i < 2; i++) {
858 cur = buf;
859 num = 0;
860 list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
861 if (li->type != types[i])
862 continue;
863 cur->Pid = cpu_to_le16(li->pid);
864 cur->LengthLow = cpu_to_le32((u32)li->length);
865 cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
866 cur->OffsetLow = cpu_to_le32((u32)li->offset);
867 cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
868 if (++num == max_num) {
869 stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
870 li->type, 0, num, buf);
871 if (stored_rc)
872 rc = stored_rc;
873 cur = buf;
874 num = 0;
875 } else
876 cur++;
877 }
878
879 if (num) {
880 stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
881 types[i], 0, num, buf);
882 if (stored_rc)
883 rc = stored_rc;
884 }
885 }
886
887 cinode->can_cache_brlcks = false;
888 mutex_unlock(&cinode->lock_mutex);
889
890 kfree(buf);
891 FreeXid(xid);
892 return rc;
893}
894
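cifs_push_mandatory_locks packs lock ranges into a LOCKING_ANDX_RANGE array sized from the server's maxBuf and flushes whenever it fills, looping once per lock type so shared and exclusive ranges travel in separate requests. The fill-and-flush skeleton with the CIFS specifics stripped out; send_batch stands in for cifs_lockv:

/* Fill-and-flush batching sketch; send_batch stands in for cifs_lockv. */
static int push_all(const int *items, int count, int max_num,
                    int (*send_batch)(const int *buf, int num))
{
        int buf[64];
        int num = 0, rc = 0, stored_rc, i;

        if (max_num > 64)
                max_num = 64;   /* keep the sketch's buffer in bounds */

        for (i = 0; i < count; i++) {
                buf[num++] = items[i];
                if (num == max_num) {
                        stored_rc = send_batch(buf, num);
                        if (stored_rc)
                                rc = stored_rc; /* record it, keep pushing */
                        num = 0;
                }
        }
        if (num) {                              /* final partial batch */
                stored_rc = send_batch(buf, num);
                if (stored_rc)
                        rc = stored_rc;
        }
        return rc;
}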
895/* copied from fs/locks.c with a name change */
896#define cifs_for_each_lock(inode, lockp) \
897 for (lockp = &inode->i_flock; *lockp != NULL; \
898 lockp = &(*lockp)->fl_next)
899
900static int
901cifs_push_posix_locks(struct cifsFileInfo *cfile)
902{
903 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
904 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
905 struct file_lock *flock, **before;
906 struct cifsLockInfo *lck, *tmp;
907 int rc = 0, xid, type;
908 __u64 length;
909 struct list_head locks_to_send;
662 910
663 length = 1 + pfLock->fl_end - pfLock->fl_start;
664 rc = -EACCES;
665 xid = GetXid(); 911 xid = GetXid();
666 912
667 cFYI(1, "Lock parm: 0x%x flockflags: " 913 mutex_lock(&cinode->lock_mutex);
668 "0x%x flocktype: 0x%x start: %lld end: %lld", 914 if (!cinode->can_cache_brlcks) {
669 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, 915 mutex_unlock(&cinode->lock_mutex);
670 pfLock->fl_end); 916 FreeXid(xid);
917 return rc;
918 }
919
920 INIT_LIST_HEAD(&locks_to_send);
671 921
672 if (pfLock->fl_flags & FL_POSIX) 922 lock_flocks();
923 cifs_for_each_lock(cfile->dentry->d_inode, before) {
924 flock = *before;
925 length = 1 + flock->fl_end - flock->fl_start;
926 if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
927 type = CIFS_RDLCK;
928 else
929 type = CIFS_WRLCK;
930
931 lck = cifs_lock_init(length, flock->fl_start, type,
932 cfile->netfid);
933 if (!lck) {
934 rc = -ENOMEM;
935 goto send_locks;
936 }
937 lck->pid = flock->fl_pid;
938
939 list_add_tail(&lck->llist, &locks_to_send);
940 }
941
942send_locks:
943 unlock_flocks();
944
945 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
946 struct file_lock tmp_lock;
947 int stored_rc;
948
949 tmp_lock.fl_start = lck->offset;
950 stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid,
951 0, lck->length, &tmp_lock,
952 lck->type, 0);
953 if (stored_rc)
954 rc = stored_rc;
955 list_del(&lck->llist);
956 kfree(lck);
957 }
958
959 cinode->can_cache_brlcks = false;
960 mutex_unlock(&cinode->lock_mutex);
961
962 FreeXid(xid);
963 return rc;
964}
965
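cifs_push_posix_locks cannot issue blocking SMB calls while lock_flocks() is held, so it first duplicates each flock onto the private locks_to_send list, drops the lock, and only then talks to the server. A sketch of that snapshot-then-send pattern, with hypothetical names:

#include <stdlib.h>

struct snap { long long start, len; struct snap *next; };

/* Called while the non-sleeping lock is held: copy one entry. */
static struct snap *snap_dup(long long start, long long len, struct snap *head)
{
        struct snap *s = malloc(sizeof(*s));
        if (!s)
                return head;    /* the kernel version records -ENOMEM here */
        s->start = start;
        s->len = len;
        s->next = head;
        return s;
}

/* After dropping the lock: send each copy over the wire, then free it. */
static int snap_send_all(struct snap *head, int (*send)(long long, long long))
{
        int rc = 0;
        while (head) {
                struct snap *next = head->next;
                int stored_rc = send(head->start, head->len);
                if (stored_rc)
                        rc = stored_rc;
                free(head);
                head = next;
        }
        return rc;
}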
966static int
967cifs_push_locks(struct cifsFileInfo *cfile)
968{
969 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
970 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
971
972 if ((tcon->ses->capabilities & CAP_UNIX) &&
973 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
974 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
975 return cifs_push_posix_locks(cfile);
976
977 return cifs_push_mandatory_locks(cfile);
978}
979
980static void
981cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock,
982 bool *wait_flag)
983{
984 if (flock->fl_flags & FL_POSIX)
673 cFYI(1, "Posix"); 985 cFYI(1, "Posix");
674 if (pfLock->fl_flags & FL_FLOCK) 986 if (flock->fl_flags & FL_FLOCK)
675 cFYI(1, "Flock"); 987 cFYI(1, "Flock");
676 if (pfLock->fl_flags & FL_SLEEP) { 988 if (flock->fl_flags & FL_SLEEP) {
677 cFYI(1, "Blocking lock"); 989 cFYI(1, "Blocking lock");
678 wait_flag = true; 990 *wait_flag = true;
679 } 991 }
680 if (pfLock->fl_flags & FL_ACCESS) 992 if (flock->fl_flags & FL_ACCESS)
681 cFYI(1, "Process suspended by mandatory locking - " 993 cFYI(1, "Process suspended by mandatory locking - "
682 "not implemented yet"); 994 "not implemented yet");
683 if (pfLock->fl_flags & FL_LEASE) 995 if (flock->fl_flags & FL_LEASE)
684 cFYI(1, "Lease on file - not implemented yet"); 996 cFYI(1, "Lease on file - not implemented yet");
685 if (pfLock->fl_flags & 997 if (flock->fl_flags &
686 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) 998 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
687 cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); 999 cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags);
688 1000
689 if (pfLock->fl_type == F_WRLCK) { 1001 *type = LOCKING_ANDX_LARGE_FILES;
1002 if (flock->fl_type == F_WRLCK) {
690 cFYI(1, "F_WRLCK "); 1003 cFYI(1, "F_WRLCK ");
691 numLock = 1; 1004 *lock = 1;
692 } else if (pfLock->fl_type == F_UNLCK) { 1005 } else if (flock->fl_type == F_UNLCK) {
693 cFYI(1, "F_UNLCK"); 1006 cFYI(1, "F_UNLCK");
694 numUnlock = 1; 1007 *unlock = 1;
695 /* Check if unlock includes more than 1008 /* Check if unlock includes more than one lock range */
696 one lock range */ 1009 } else if (flock->fl_type == F_RDLCK) {
697 } else if (pfLock->fl_type == F_RDLCK) {
698 cFYI(1, "F_RDLCK"); 1010 cFYI(1, "F_RDLCK");
699 lockType |= LOCKING_ANDX_SHARED_LOCK; 1011 *type |= LOCKING_ANDX_SHARED_LOCK;
700 numLock = 1; 1012 *lock = 1;
701 } else if (pfLock->fl_type == F_EXLCK) { 1013 } else if (flock->fl_type == F_EXLCK) {
702 cFYI(1, "F_EXLCK"); 1014 cFYI(1, "F_EXLCK");
703 numLock = 1; 1015 *lock = 1;
704 } else if (pfLock->fl_type == F_SHLCK) { 1016 } else if (flock->fl_type == F_SHLCK) {
705 cFYI(1, "F_SHLCK"); 1017 cFYI(1, "F_SHLCK");
706 lockType |= LOCKING_ANDX_SHARED_LOCK; 1018 *type |= LOCKING_ANDX_SHARED_LOCK;
707 numLock = 1; 1019 *lock = 1;
708 } else 1020 } else
709 cFYI(1, "Unknown type of lock"); 1021 cFYI(1, "Unknown type of lock");
1022}
710 1023
711 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1024static int
712 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); 1025cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
713 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 1026 bool wait_flag, bool posix_lck, int xid)
1027{
1028 int rc = 0;
1029 __u64 length = 1 + flock->fl_end - flock->fl_start;
1030 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
1031 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1032 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1033 __u16 netfid = cfile->netfid;
714 1034
715 if ((tcon->ses->capabilities & CAP_UNIX) && 1035 if (posix_lck) {
716 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && 1036 int posix_lock_type;
717 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 1037
718 posix_locking = 1; 1038 rc = cifs_posix_lock_test(file, flock);
719 /* BB add code here to normalize offset and length to 1039 if (!rc)
720 account for negative length which we can not accept over the
721 wire */
722 if (IS_GETLK(cmd)) {
723 if (posix_locking) {
724 int posix_lock_type;
725 if (lockType & LOCKING_ANDX_SHARED_LOCK)
726 posix_lock_type = CIFS_RDLCK;
727 else
728 posix_lock_type = CIFS_WRLCK;
729 rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */,
730 length, pfLock, posix_lock_type,
731 wait_flag);
732 FreeXid(xid);
733 return rc; 1040 return rc;
734 }
735 1041
736 /* BB we could chain these into one lock request BB */ 1042 if (type & LOCKING_ANDX_SHARED_LOCK)
737 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 1043 posix_lock_type = CIFS_RDLCK;
738 0, 1, lockType, 0 /* wait flag */, 0); 1044 else
739 if (rc == 0) { 1045 posix_lock_type = CIFS_WRLCK;
740 rc = CIFSSMBLock(xid, tcon, netfid, length, 1046 rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
741 pfLock->fl_start, 1 /* numUnlock */ , 1047 1 /* get */, length, flock,
742 0 /* numLock */ , lockType, 1048 posix_lock_type, wait_flag);
743 0 /* wait flag */, 0); 1049 return rc;
744 pfLock->fl_type = F_UNLCK; 1050 }
745 if (rc != 0)
746 cERROR(1, "Error unlocking previously locked "
747 "range %d during test of lock", rc);
748 rc = 0;
749 1051
750 } else { 1052 rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid,
751 /* if rc == ERR_SHARING_VIOLATION ? */ 1053 flock);
752 rc = 0; 1054 if (!rc)
1055 return rc;
753 1056
754 if (lockType & LOCKING_ANDX_SHARED_LOCK) { 1057 /* BB we could chain these into one lock request BB */
755 pfLock->fl_type = F_WRLCK; 1058 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
756 } else { 1059 flock->fl_start, 0, 1, type, 0, 0);
757 rc = CIFSSMBLock(xid, tcon, netfid, length, 1060 if (rc == 0) {
758 pfLock->fl_start, 0, 1, 1061 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
759 lockType | LOCKING_ANDX_SHARED_LOCK, 1062 length, flock->fl_start, 1, 0,
760 0 /* wait flag */, 0); 1063 type, 0, 0);
761 if (rc == 0) { 1064 flock->fl_type = F_UNLCK;
762 rc = CIFSSMBLock(xid, tcon, netfid, 1065 if (rc != 0)
763 length, pfLock->fl_start, 1, 0, 1066 cERROR(1, "Error unlocking previously locked "
764 lockType | 1067 "range %d during test of lock", rc);
765 LOCKING_ANDX_SHARED_LOCK, 1068 rc = 0;
766 0 /* wait flag */, 0); 1069 return rc;
767 pfLock->fl_type = F_RDLCK; 1070 }
768 if (rc != 0)
769 cERROR(1, "Error unlocking "
770 "previously locked range %d "
771 "during test of lock", rc);
772 rc = 0;
773 } else {
774 pfLock->fl_type = F_WRLCK;
775 rc = 0;
776 }
777 }
778 }
779 1071
780 FreeXid(xid); 1072 if (type & LOCKING_ANDX_SHARED_LOCK) {
1073 flock->fl_type = F_WRLCK;
1074 rc = 0;
781 return rc; 1075 return rc;
782 } 1076 }
783 1077
784 if (!numLock && !numUnlock) { 1078 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
785 /* if no lock or unlock then nothing 1079 flock->fl_start, 0, 1,
786 to do since we do not know what it is */ 1080 type | LOCKING_ANDX_SHARED_LOCK, 0, 0);
787 FreeXid(xid); 1081 if (rc == 0) {
788 return -EOPNOTSUPP; 1082 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
1083 length, flock->fl_start, 1, 0,
1084 type | LOCKING_ANDX_SHARED_LOCK,
1085 0, 0);
1086 flock->fl_type = F_RDLCK;
1087 if (rc != 0)
1088 cERROR(1, "Error unlocking previously locked "
1089 "range %d during test of lock", rc);
1090 } else
1091 flock->fl_type = F_WRLCK;
1092
1093 rc = 0;
1094 return rc;
1095}
1096
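Since SMB has no native test-lock operation here, cifs_getlk probes by actually taking the lock: a LOCK followed immediately by an UNLOCK means the range was free (F_UNLCK); a failed exclusive probe is retried shared to tell read locks from write locks (a shared request skips the second probe and reports F_WRLCK directly). The probe protocol in outline, with try_lock/do_unlock standing in for the paired CIFSSMBLock calls:

/* Probe-protocol sketch; try_lock/do_unlock stand in for CIFSSMBLock. */
enum probe { PROBE_FREE, PROBE_RDLCK_HELD, PROBE_WRLCK_HELD };

static enum probe probe_range(int (*try_lock)(int shared),
                              void (*do_unlock)(int shared))
{
        if (try_lock(0) == 0) {          /* exclusive probe succeeded */
                do_unlock(0);
                return PROBE_FREE;       /* report F_UNLCK */
        }
        if (try_lock(1) == 0) {          /* shared probe succeeded */
                do_unlock(1);
                return PROBE_RDLCK_HELD; /* conflicting lock is a read lock */
        }
        return PROBE_WRLCK_HELD;         /* report F_WRLCK */
}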
1097static void
1098cifs_move_llist(struct list_head *source, struct list_head *dest)
1099{
1100 struct list_head *li, *tmp;
1101 list_for_each_safe(li, tmp, source)
1102 list_move(li, dest);
1103}
1104
1105static void
1106cifs_free_llist(struct list_head *llist)
1107{
1108 struct cifsLockInfo *li, *tmp;
1109 list_for_each_entry_safe(li, tmp, llist, llist) {
1110 cifs_del_lock_waiters(li);
1111 list_del(&li->llist);
1112 kfree(li);
789 } 1113 }
1114}
1115
1116static int
1117cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
1118{
1119 int rc = 0, stored_rc;
1120 int types[] = {LOCKING_ANDX_LARGE_FILES,
1121 LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
1122 unsigned int i;
1123 unsigned int max_num, num;
1124 LOCKING_ANDX_RANGE *buf, *cur;
1125 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1126 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1127 struct cifsLockInfo *li, *tmp;
1128 __u64 length = 1 + flock->fl_end - flock->fl_start;
1129 struct list_head tmp_llist;
1130
1131 INIT_LIST_HEAD(&tmp_llist);
1132
1133 max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
1134 sizeof(LOCKING_ANDX_RANGE);
1135 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
1136 if (!buf)
1137 return -ENOMEM;
1138
1139 mutex_lock(&cinode->lock_mutex);
1140 for (i = 0; i < 2; i++) {
1141 cur = buf;
1142 num = 0;
1143 list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
1144 if (flock->fl_start > li->offset ||
1145 (flock->fl_start + length) <
1146 (li->offset + li->length))
1147 continue;
1148 if (current->tgid != li->pid)
1149 continue;
1150 if (cfile->netfid != li->netfid)
1151 continue;
1152 if (types[i] != li->type)
1153 continue;
1154 if (!cinode->can_cache_brlcks) {
1155 cur->Pid = cpu_to_le16(li->pid);
1156 cur->LengthLow = cpu_to_le32((u32)li->length);
1157 cur->LengthHigh =
1158 cpu_to_le32((u32)(li->length>>32));
1159 cur->OffsetLow = cpu_to_le32((u32)li->offset);
1160 cur->OffsetHigh =
1161 cpu_to_le32((u32)(li->offset>>32));
1162 /*
1163 * We need to save a lock here to let us add
1164 * it again to the inode list if the unlock
1165 * range request fails on the server.
1166 */
1167 list_move(&li->llist, &tmp_llist);
1168 if (++num == max_num) {
1169 stored_rc = cifs_lockv(xid, tcon,
1170 cfile->netfid,
1171 li->type, num,
1172 0, buf);
1173 if (stored_rc) {
1174 /*
1175 * We failed on the unlock range
1176 * request - add all locks from
1177 * the tmp list to the head of
1178 * the inode list.
1179 */
1180 cifs_move_llist(&tmp_llist,
1181 &cinode->llist);
1182 rc = stored_rc;
1183 } else
1184 /*
1185 * The unlock range request
 1186 * succeeded - free the tmp list.
1187 */
1188 cifs_free_llist(&tmp_llist);
1189 cur = buf;
1190 num = 0;
1191 } else
1192 cur++;
1193 } else {
1194 /*
1195 * We can cache brlock requests - simply remove
1196 * a lock from the inode list.
1197 */
1198 list_del(&li->llist);
1199 cifs_del_lock_waiters(li);
1200 kfree(li);
1201 }
1202 }
1203 if (num) {
1204 stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
1205 types[i], num, 0, buf);
1206 if (stored_rc) {
1207 cifs_move_llist(&tmp_llist, &cinode->llist);
1208 rc = stored_rc;
1209 } else
1210 cifs_free_llist(&tmp_llist);
1211 }
1212 }
1213
1214 mutex_unlock(&cinode->lock_mutex);
1215 kfree(buf);
1216 return rc;
1217}
790 1218
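cifs_unlock_range stages matching locks on tmp_llist before each batch, so that a failed cifs_lockv can splice them back onto the inode list rather than lose them. The commit-or-rollback shape in miniature; send_unlock stands in for the server call:

#include <stdlib.h>

struct lnode { struct lnode *next; };

/* Move every node from src onto dst (mirrors cifs_move_llist). */
static void move_all(struct lnode **src, struct lnode **dst)
{
        while (*src) {
                struct lnode *n = *src;
                *src = n->next;
                n->next = *dst;
                *dst = n;
        }
}

/* Free every node (mirrors cifs_free_llist). */
static void free_all(struct lnode **head)
{
        while (*head) {
                struct lnode *n = *head;
                *head = n->next;
                free(n);
        }
}

/* Stage, send, then commit or roll back depending on the server's answer. */
static int unlock_batch(struct lnode **staged, struct lnode **inode_list,
                        int (*send_unlock)(void))
{
        int rc = send_unlock();
        if (rc)
                move_all(staged, inode_list);   /* failed: restore the locks */
        else
                free_all(staged);               /* succeeded: drop the copies */
        return rc;
}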
791 if (posix_locking) { 1219static int
1220cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
1221 bool wait_flag, bool posix_lck, int lock, int unlock, int xid)
1222{
1223 int rc = 0;
1224 __u64 length = 1 + flock->fl_end - flock->fl_start;
1225 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
1226 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1227 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
1228 __u16 netfid = cfile->netfid;
1229
1230 if (posix_lck) {
792 int posix_lock_type; 1231 int posix_lock_type;
793 if (lockType & LOCKING_ANDX_SHARED_LOCK) 1232
1233 rc = cifs_posix_lock_set(file, flock);
1234 if (!rc || rc < 0)
1235 return rc;
1236
1237 if (type & LOCKING_ANDX_SHARED_LOCK)
794 posix_lock_type = CIFS_RDLCK; 1238 posix_lock_type = CIFS_RDLCK;
795 else 1239 else
796 posix_lock_type = CIFS_WRLCK; 1240 posix_lock_type = CIFS_WRLCK;
797 1241
798 if (numUnlock == 1) 1242 if (unlock == 1)
799 posix_lock_type = CIFS_UNLCK; 1243 posix_lock_type = CIFS_UNLCK;
800 1244
801 rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, 1245 rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
802 length, pfLock, posix_lock_type, 1246 0 /* set */, length, flock,
803 wait_flag); 1247 posix_lock_type, wait_flag);
804 } else { 1248 goto out;
805 struct cifsFileInfo *fid = file->private_data; 1249 }
806
807 if (numLock) {
808 rc = CIFSSMBLock(xid, tcon, netfid, length,
809 pfLock->fl_start, 0, numLock, lockType,
810 wait_flag, 0);
811 1250
812 if (rc == 0) { 1251 if (lock) {
813 /* For Windows locks we must store them. */ 1252 rc = cifs_lock_add_if(cinode, flock->fl_start, length,
814 rc = store_file_lock(fid, length, 1253 type, netfid, wait_flag);
815 pfLock->fl_start, lockType); 1254 if (rc < 0)
816 } 1255 return rc;
817 } else if (numUnlock) { 1256 else if (!rc)
818 /* For each stored lock that this unlock overlaps 1257 goto out;
819 completely, unlock it. */
820 int stored_rc = 0;
821 struct cifsLockInfo *li, *tmp;
822 1258
823 rc = 0; 1259 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
824 mutex_lock(&fid->lock_mutex); 1260 flock->fl_start, 0, 1, type, wait_flag, 0);
825 list_for_each_entry_safe(li, tmp, &fid->llist, llist) { 1261 if (rc == 0) {
826 if (pfLock->fl_start <= li->offset && 1262 /* For Windows locks we must store them. */
827 (pfLock->fl_start + length) >= 1263 rc = cifs_lock_add(cinode, length, flock->fl_start,
828 (li->offset + li->length)) { 1264 type, netfid);
829 stored_rc = CIFSSMBLock(xid, tcon,
830 netfid, li->length,
831 li->offset, 1, 0,
832 li->type, false, 0);
833 if (stored_rc)
834 rc = stored_rc;
835 else {
836 list_del(&li->llist);
837 kfree(li);
838 }
839 }
840 }
841 mutex_unlock(&fid->lock_mutex);
842 } 1265 }
1266 } else if (unlock)
1267 rc = cifs_unlock_range(cfile, flock, xid);
1268
1269out:
1270 if (flock->fl_flags & FL_POSIX)
1271 posix_lock_file_wait(file, flock);
1272 return rc;
1273}
1274
1275int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
1276{
1277 int rc, xid;
1278 int lock = 0, unlock = 0;
1279 bool wait_flag = false;
1280 bool posix_lck = false;
1281 struct cifs_sb_info *cifs_sb;
1282 struct cifs_tcon *tcon;
1283 struct cifsInodeInfo *cinode;
1284 struct cifsFileInfo *cfile;
1285 __u16 netfid;
1286 __u8 type;
1287
1288 rc = -EACCES;
1289 xid = GetXid();
1290
1291 cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld "
1292 "end: %lld", cmd, flock->fl_flags, flock->fl_type,
1293 flock->fl_start, flock->fl_end);
1294
1295 cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag);
1296
1297 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1298 cfile = (struct cifsFileInfo *)file->private_data;
1299 tcon = tlink_tcon(cfile->tlink);
1300 netfid = cfile->netfid;
1301 cinode = CIFS_I(file->f_path.dentry->d_inode);
1302
1303 if ((tcon->ses->capabilities & CAP_UNIX) &&
1304 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
1305 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
1306 posix_lck = true;
1307 /*
1308 * BB add code here to normalize offset and length to account for
1309 * negative length which we can not accept over the wire.
1310 */
1311 if (IS_GETLK(cmd)) {
1312 rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid);
1313 FreeXid(xid);
1314 return rc;
843 } 1315 }
844 1316
845 if (pfLock->fl_flags & FL_POSIX) 1317 if (!lock && !unlock) {
846 posix_lock_file_wait(file, pfLock); 1318 /*
1319 * if no lock or unlock then nothing to do since we do not
1320 * know what it is
1321 */
1322 FreeXid(xid);
1323 return -EOPNOTSUPP;
1324 }
1325
1326 rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock,
1327 xid);
847 FreeXid(xid); 1328 FreeXid(xid);
848 return rc; 1329 return rc;
849} 1330}
@@ -1714,6 +2195,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
1714 struct smb_com_read_rsp *pSMBr; 2195 struct smb_com_read_rsp *pSMBr;
1715 struct cifs_io_parms io_parms; 2196 struct cifs_io_parms io_parms;
1716 char *read_data; 2197 char *read_data;
2198 unsigned int rsize;
1717 __u32 pid; 2199 __u32 pid;
1718 2200
1719 if (!nr_segs) 2201 if (!nr_segs)
@@ -1726,6 +2208,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
1726 xid = GetXid(); 2208 xid = GetXid();
1727 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2209 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1728 2210
2211 /* FIXME: set up handlers for larger reads and/or convert to async */
2212 rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
2213
1729 open_file = file->private_data; 2214 open_file = file->private_data;
1730 pTcon = tlink_tcon(open_file->tlink); 2215 pTcon = tlink_tcon(open_file->tlink);
1731 2216
@@ -1738,7 +2223,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
1738 cFYI(1, "attempting read on write only file instance"); 2223 cFYI(1, "attempting read on write only file instance");
1739 2224
1740 for (total_read = 0; total_read < len; total_read += bytes_read) { 2225 for (total_read = 0; total_read < len; total_read += bytes_read) {
1741 cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); 2226 cur_len = min_t(const size_t, len - total_read, rsize);
1742 rc = -EAGAIN; 2227 rc = -EAGAIN;
1743 read_data = NULL; 2228 read_data = NULL;
1744 2229
@@ -1830,6 +2315,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1830 unsigned int bytes_read = 0; 2315 unsigned int bytes_read = 0;
1831 unsigned int total_read; 2316 unsigned int total_read;
1832 unsigned int current_read_size; 2317 unsigned int current_read_size;
2318 unsigned int rsize;
1833 struct cifs_sb_info *cifs_sb; 2319 struct cifs_sb_info *cifs_sb;
1834 struct cifs_tcon *pTcon; 2320 struct cifs_tcon *pTcon;
1835 int xid; 2321 int xid;
@@ -1842,6 +2328,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1842 xid = GetXid(); 2328 xid = GetXid();
1843 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2329 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1844 2330
2331 /* FIXME: set up handlers for larger reads and/or convert to async */
2332 rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
2333
1845 if (file->private_data == NULL) { 2334 if (file->private_data == NULL) {
1846 rc = -EBADF; 2335 rc = -EBADF;
1847 FreeXid(xid); 2336 FreeXid(xid);
@@ -1861,14 +2350,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1861 for (total_read = 0, current_offset = read_data; 2350 for (total_read = 0, current_offset = read_data;
1862 read_size > total_read; 2351 read_size > total_read;
1863 total_read += bytes_read, current_offset += bytes_read) { 2352 total_read += bytes_read, current_offset += bytes_read) {
1864 current_read_size = min_t(const int, read_size - total_read, 2353 current_read_size = min_t(uint, read_size - total_read, rsize);
1865 cifs_sb->rsize); 2354
 1866 /* For Windows ME and 9x we do not want to request more 2355 /* For Windows ME and 9x we do not want to request more
 1867 than was negotiated since the server will refuse the read */ 2356 than was negotiated since the server will refuse the read */
1868 if ((pTcon->ses) && 2357 if ((pTcon->ses) &&
1869 !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { 2358 !(pTcon->ses->capabilities & CAP_LARGE_FILES)) {
1870 current_read_size = min_t(const int, current_read_size, 2359 current_read_size = min_t(uint, current_read_size,
1871 pTcon->ses->server->maxBuf - 128); 2360 CIFSMaxBufSize);
1872 } 2361 }
1873 rc = -EAGAIN; 2362 rc = -EAGAIN;
1874 while (rc == -EAGAIN) { 2363 while (rc == -EAGAIN) {
@@ -1957,82 +2446,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1957 return rc; 2446 return rc;
1958} 2447}
1959 2448
1960
1961static void cifs_copy_cache_pages(struct address_space *mapping,
1962 struct list_head *pages, int bytes_read, char *data)
1963{
1964 struct page *page;
1965 char *target;
1966
1967 while (bytes_read > 0) {
1968 if (list_empty(pages))
1969 break;
1970
1971 page = list_entry(pages->prev, struct page, lru);
1972 list_del(&page->lru);
1973
1974 if (add_to_page_cache_lru(page, mapping, page->index,
1975 GFP_KERNEL)) {
1976 page_cache_release(page);
1977 cFYI(1, "Add page cache failed");
1978 data += PAGE_CACHE_SIZE;
1979 bytes_read -= PAGE_CACHE_SIZE;
1980 continue;
1981 }
1982 page_cache_release(page);
1983
1984 target = kmap_atomic(page, KM_USER0);
1985
1986 if (PAGE_CACHE_SIZE > bytes_read) {
1987 memcpy(target, data, bytes_read);
1988 /* zero the tail end of this partial page */
1989 memset(target + bytes_read, 0,
1990 PAGE_CACHE_SIZE - bytes_read);
1991 bytes_read = 0;
1992 } else {
1993 memcpy(target, data, PAGE_CACHE_SIZE);
1994 bytes_read -= PAGE_CACHE_SIZE;
1995 }
1996 kunmap_atomic(target, KM_USER0);
1997
1998 flush_dcache_page(page);
1999 SetPageUptodate(page);
2000 unlock_page(page);
2001 data += PAGE_CACHE_SIZE;
2002
2003 /* add page to FS-Cache */
2004 cifs_readpage_to_fscache(mapping->host, page);
2005 }
2006 return;
2007}
2008
2009static int cifs_readpages(struct file *file, struct address_space *mapping, 2449static int cifs_readpages(struct file *file, struct address_space *mapping,
2010 struct list_head *page_list, unsigned num_pages) 2450 struct list_head *page_list, unsigned num_pages)
2011{ 2451{
2012 int rc = -EACCES; 2452 int rc;
2013 int xid; 2453 struct list_head tmplist;
2014 loff_t offset; 2454 struct cifsFileInfo *open_file = file->private_data;
2015 struct page *page; 2455 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2016 struct cifs_sb_info *cifs_sb; 2456 unsigned int rsize = cifs_sb->rsize;
2017 struct cifs_tcon *pTcon; 2457 pid_t pid;
2018 unsigned int bytes_read = 0;
2019 unsigned int read_size, i;
2020 char *smb_read_data = NULL;
2021 struct smb_com_read_rsp *pSMBr;
2022 struct cifsFileInfo *open_file;
2023 struct cifs_io_parms io_parms;
2024 int buf_type = CIFS_NO_BUFFER;
2025 __u32 pid;
2026 2458
2027 xid = GetXid(); 2459 /*
2028 if (file->private_data == NULL) { 2460 * Give up immediately if rsize is too small to read an entire page.
2029 rc = -EBADF; 2461 * The VFS will fall back to readpage. We should never reach this
2030 FreeXid(xid); 2462 * point however since we set ra_pages to 0 when the rsize is smaller
2031 return rc; 2463 * than a cache page.
2032 } 2464 */
2033 open_file = file->private_data; 2465 if (unlikely(rsize < PAGE_CACHE_SIZE))
2034 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2466 return 0;
2035 pTcon = tlink_tcon(open_file->tlink);
2036 2467
2037 /* 2468 /*
2038 * Reads as many pages as possible from fscache. Returns -ENOBUFS 2469 * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2041,125 +2472,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2041 rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, 2472 rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
2042 &num_pages); 2473 &num_pages);
2043 if (rc == 0) 2474 if (rc == 0)
2044 goto read_complete; 2475 return rc;
2045 2476
2046 cFYI(DBG2, "rpages: num pages %d", num_pages);
2047 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 2477 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
2048 pid = open_file->pid; 2478 pid = open_file->pid;
2049 else 2479 else
2050 pid = current->tgid; 2480 pid = current->tgid;
2051 2481
2052 for (i = 0; i < num_pages; ) { 2482 rc = 0;
2053 unsigned contig_pages; 2483 INIT_LIST_HEAD(&tmplist);
2054 struct page *tmp_page;
2055 unsigned long expected_index;
2056 2484
2057 if (list_empty(page_list)) 2485 cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file,
2058 break; 2486 mapping, num_pages);
2487
2488 /*
 2489 * Start with the page at the end of the list and move it to the private
2490 * list. Do the same with any following pages until we hit
2491 * the rsize limit, hit an index discontinuity, or run out of
2492 * pages. Issue the async read and then start the loop again
2493 * until the list is empty.
2494 *
2495 * Note that list order is important. The page_list is in
 2496 * the order of declining indexes. When we put the pages into
 2497 * rdata->pages, we want them in increasing order.
2498 */
2499 while (!list_empty(page_list)) {
2500 unsigned int bytes = PAGE_CACHE_SIZE;
2501 unsigned int expected_index;
2502 unsigned int nr_pages = 1;
2503 loff_t offset;
2504 struct page *page, *tpage;
2505 struct cifs_readdata *rdata;
2059 2506
2060 page = list_entry(page_list->prev, struct page, lru); 2507 page = list_entry(page_list->prev, struct page, lru);
2508
2509 /*
2510 * Lock the page and put it in the cache. Since no one else
2511 * should have access to this page, we're safe to simply set
2512 * PG_locked without checking it first.
2513 */
2514 __set_page_locked(page);
2515 rc = add_to_page_cache_locked(page, mapping,
2516 page->index, GFP_KERNEL);
2517
2518 /* give up if we can't stick it in the cache */
2519 if (rc) {
2520 __clear_page_locked(page);
2521 break;
2522 }
2523
2524 /* move first page to the tmplist */
2061 offset = (loff_t)page->index << PAGE_CACHE_SHIFT; 2525 offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
2526 list_move_tail(&page->lru, &tmplist);
2062 2527
2063 /* count adjacent pages that we will read into */ 2528 /* now try and add more pages onto the request */
2064 contig_pages = 0; 2529 expected_index = page->index + 1;
2065 expected_index = 2530 list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
2066 list_entry(page_list->prev, struct page, lru)->index; 2531 /* discontinuity ? */
2067 list_for_each_entry_reverse(tmp_page, page_list, lru) { 2532 if (page->index != expected_index)
2068 if (tmp_page->index == expected_index) {
2069 contig_pages++;
2070 expected_index++;
2071 } else
2072 break; 2533 break;
2534
2535 /* would this page push the read over the rsize? */
2536 if (bytes + PAGE_CACHE_SIZE > rsize)
2537 break;
2538
2539 __set_page_locked(page);
2540 if (add_to_page_cache_locked(page, mapping,
2541 page->index, GFP_KERNEL)) {
2542 __clear_page_locked(page);
2543 break;
2544 }
2545 list_move_tail(&page->lru, &tmplist);
2546 bytes += PAGE_CACHE_SIZE;
2547 expected_index++;
2548 nr_pages++;
2073 } 2549 }
2074 if (contig_pages + i > num_pages) 2550
2075 contig_pages = num_pages - i; 2551 rdata = cifs_readdata_alloc(nr_pages);
2076 2552 if (!rdata) {
2077 /* for reads over a certain size could initiate async 2553 /* best to give up if we're out of mem */
2078 read ahead */ 2554 list_for_each_entry_safe(page, tpage, &tmplist, lru) {
2079 2555 list_del(&page->lru);
2080 read_size = contig_pages * PAGE_CACHE_SIZE; 2556 lru_cache_add_file(page);
2081 /* Read size needs to be in multiples of one page */ 2557 unlock_page(page);
2082 read_size = min_t(const unsigned int, read_size, 2558 page_cache_release(page);
2083 cifs_sb->rsize & PAGE_CACHE_MASK); 2559 }
2084 cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", 2560 rc = -ENOMEM;
2085 read_size, contig_pages); 2561 break;
2086 rc = -EAGAIN; 2562 }
2087 while (rc == -EAGAIN) { 2563
2564 spin_lock(&cifs_file_list_lock);
2565 cifsFileInfo_get(open_file);
2566 spin_unlock(&cifs_file_list_lock);
2567 rdata->cfile = open_file;
2568 rdata->mapping = mapping;
2569 rdata->offset = offset;
2570 rdata->bytes = bytes;
2571 rdata->pid = pid;
2572 list_splice_init(&tmplist, &rdata->pages);
2573
2574 do {
2088 if (open_file->invalidHandle) { 2575 if (open_file->invalidHandle) {
2089 rc = cifs_reopen_file(open_file, true); 2576 rc = cifs_reopen_file(open_file, true);
2090 if (rc != 0) 2577 if (rc != 0)
2091 break; 2578 continue;
2092 } 2579 }
2093 io_parms.netfid = open_file->netfid; 2580 rc = cifs_async_readv(rdata);
2094 io_parms.pid = pid; 2581 } while (rc == -EAGAIN);
2095 io_parms.tcon = pTcon;
2096 io_parms.offset = offset;
2097 io_parms.length = read_size;
2098 rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
2099 &smb_read_data, &buf_type);
2100 /* BB more RC checks ? */
2101 if (rc == -EAGAIN) {
2102 if (smb_read_data) {
2103 if (buf_type == CIFS_SMALL_BUFFER)
2104 cifs_small_buf_release(smb_read_data);
2105 else if (buf_type == CIFS_LARGE_BUFFER)
2106 cifs_buf_release(smb_read_data);
2107 smb_read_data = NULL;
2108 }
2109 }
2110 }
2111 if ((rc < 0) || (smb_read_data == NULL)) {
2112 cFYI(1, "Read error in readpages: %d", rc);
2113 break;
2114 } else if (bytes_read > 0) {
2115 task_io_account_read(bytes_read);
2116 pSMBr = (struct smb_com_read_rsp *)smb_read_data;
2117 cifs_copy_cache_pages(mapping, page_list, bytes_read,
2118 smb_read_data + 4 /* RFC1001 hdr */ +
2119 le16_to_cpu(pSMBr->DataOffset));
2120
2121 i += bytes_read >> PAGE_CACHE_SHIFT;
2122 cifs_stats_bytes_read(pTcon, bytes_read);
2123 if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) {
2124 i++; /* account for partial page */
2125
2126 /* server copy of file can have smaller size
2127 than client */
2128 /* BB do we need to verify this common case ?
2129 this case is ok - if we are at server EOF
2130 we will hit it on next read */
2131 2582
2132 /* break; */ 2583 if (rc != 0) {
2584 list_for_each_entry_safe(page, tpage, &rdata->pages,
2585 lru) {
2586 list_del(&page->lru);
2587 lru_cache_add_file(page);
2588 unlock_page(page);
2589 page_cache_release(page);
2133 } 2590 }
2134 } else { 2591 cifs_readdata_free(rdata);
2135 cFYI(1, "No bytes read (%d) at offset %lld . "
2136 "Cleaning remaining pages from readahead list",
2137 bytes_read, offset);
2138 /* BB turn off caching and do new lookup on
2139 file size at server? */
2140 break; 2592 break;
2141 } 2593 }
2142 if (smb_read_data) {
2143 if (buf_type == CIFS_SMALL_BUFFER)
2144 cifs_small_buf_release(smb_read_data);
2145 else if (buf_type == CIFS_LARGE_BUFFER)
2146 cifs_buf_release(smb_read_data);
2147 smb_read_data = NULL;
2148 }
2149 bytes_read = 0;
2150 } 2594 }
2151 2595
2152/* need to free smb_read_data buf before exit */
2153 if (smb_read_data) {
2154 if (buf_type == CIFS_SMALL_BUFFER)
2155 cifs_small_buf_release(smb_read_data);
2156 else if (buf_type == CIFS_LARGE_BUFFER)
2157 cifs_buf_release(smb_read_data);
2158 smb_read_data = NULL;
2159 }
2160
2161read_complete:
2162 FreeXid(xid);
2163 return rc; 2596 return rc;
2164} 2597}
2165 2598
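The rewritten readpages turns the MM's page_list (descending indexes) into one async read per contiguous, rsize-bounded run of pages. The grouping rule in isolation -- stop a batch at the first index gap or when another page would overflow the byte budget; the 4 KiB page size and the names here are assumptions:

#define PAGE_SZ 4096u   /* assumed page size for the sketch */

/* Given page indexes sorted ascending, return how many of the first n
 * pages fit in one batch: contiguous indexes, total bytes <= budget. */
static unsigned int batch_len(const unsigned long *idx, unsigned int n,
                              unsigned int budget)
{
        unsigned int nr = 0, bytes = 0;

        while (nr < n && bytes + PAGE_SZ <= budget) {
                if (nr && idx[nr] != idx[nr - 1] + 1)
                        break;          /* index discontinuity ends the batch */
                bytes += PAGE_SZ;
                nr++;
        }
        return nr;
}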
@@ -2408,6 +2841,10 @@ void cifs_oplock_break(struct work_struct *work)
2408 cFYI(1, "Oplock flush inode %p rc %d", inode, rc); 2841 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
2409 } 2842 }
2410 2843
2844 rc = cifs_push_locks(cfile);
2845 if (rc)
2846 cERROR(1, "Push locks rc = %d", rc);
2847
2411 /* 2848 /*
2412 * releasing stale oplock after recent reconnect of smb session using 2849 * releasing stale oplock after recent reconnect of smb session using
2413 * a now incorrect file handle is not a data integrity issue but do 2850 * a now incorrect file handle is not a data integrity issue but do
@@ -2415,8 +2852,9 @@ void cifs_oplock_break(struct work_struct *work)
2415 * disconnected since oplock already released by the server 2852 * disconnected since oplock already released by the server
2416 */ 2853 */
2417 if (!cfile->oplock_break_cancelled) { 2854 if (!cfile->oplock_break_cancelled) {
2418 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, 2855 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid,
2419 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, 2856 current->tgid, 0, 0, 0, 0,
2857 LOCKING_ANDX_OPLOCK_RELEASE, false,
2420 cinode->clientCanCacheRead ? 1 : 0); 2858 cinode->clientCanCacheRead ? 1 : 0);
2421 cFYI(1, "Oplock release rc = %d", rc); 2859 cFYI(1, "Oplock release rc = %d", rc);
2422 } 2860 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a7b2dcd4a53e..2c50bd2f65d1 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp)
562 562
563 xid = GetXid(); 563 xid = GetXid();
564 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); 564 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
565 if (rc == -EOPNOTSUPP || rc == -EINVAL) { 565 switch (rc) {
566 case 0:
567 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
568 break;
569 case -EREMOTE:
570 cifs_create_dfs_fattr(&fattr, inode->i_sb);
571 rc = 0;
572 break;
573 case -EOPNOTSUPP:
574 case -EINVAL:
566 /* 575 /*
567 * FIXME: legacy server -- fall back to path-based call? 576 * FIXME: legacy server -- fall back to path-based call?
568 * for now, just skip revalidating and mark inode for 577 * for now, just skip revalidating and mark inode for
@@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp)
570 */ 579 */
571 rc = 0; 580 rc = 0;
572 CIFS_I(inode)->time = 0; 581 CIFS_I(inode)->time = 0;
582 default:
573 goto cgfi_exit; 583 goto cgfi_exit;
574 } else if (rc == -EREMOTE) { 584 }
575 cifs_create_dfs_fattr(&fattr, inode->i_sb);
576 rc = 0;
577 } else if (rc)
578 goto cgfi_exit;
579 585
580 /* 586 /*
581 * don't bother with SFU junk here -- just mark inode as needing 587 * don't bother with SFU junk here -- just mark inode as needing
582 * revalidation. 588 * revalidation.
583 */ 589 */
584 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
585 fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; 590 fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
586 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; 591 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
587 cifs_fattr_to_inode(inode, &fattr); 592 cifs_fattr_to_inode(inode, &fattr);
@@ -2096,6 +2101,8 @@ static int
2096cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) 2101cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2097{ 2102{
2098 int xid; 2103 int xid;
2104 uid_t uid = NO_CHANGE_32;
2105 gid_t gid = NO_CHANGE_32;
2099 struct inode *inode = direntry->d_inode; 2106 struct inode *inode = direntry->d_inode;
2100 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 2107 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2101 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 2108 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
@@ -2146,13 +2153,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2146 goto cifs_setattr_exit; 2153 goto cifs_setattr_exit;
2147 } 2154 }
2148 2155
2149 /* 2156 if (attrs->ia_valid & ATTR_UID)
2150 * Without unix extensions we can't send ownership changes to the 2157 uid = attrs->ia_uid;
2151 * server, so silently ignore them. This is consistent with how 2158
2152 * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With 2159 if (attrs->ia_valid & ATTR_GID)
2153 * CIFSACL support + proper Windows to Unix idmapping, we may be 2160 gid = attrs->ia_gid;
2154 * able to support this in the future. 2161
2155 */ 2162#ifdef CONFIG_CIFS_ACL
2163 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
2164 if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) {
2165 rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
2166 uid, gid);
2167 if (rc) {
2168 cFYI(1, "%s: Setting id failed with error: %d",
2169 __func__, rc);
2170 goto cifs_setattr_exit;
2171 }
2172 }
2173 } else
2174#endif /* CONFIG_CIFS_ACL */
2156 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) 2175 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
2157 attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); 2176 attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
2158 2177
@@ -2161,15 +2180,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2161 attrs->ia_valid &= ~ATTR_MODE; 2180 attrs->ia_valid &= ~ATTR_MODE;
2162 2181
2163 if (attrs->ia_valid & ATTR_MODE) { 2182 if (attrs->ia_valid & ATTR_MODE) {
2164 cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
2165 mode = attrs->ia_mode; 2183 mode = attrs->ia_mode;
2166 }
2167
2168 if (attrs->ia_valid & ATTR_MODE) {
2169 rc = 0; 2184 rc = 0;
2170#ifdef CONFIG_CIFS_ACL 2185#ifdef CONFIG_CIFS_ACL
2171 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 2186 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
2172 rc = mode_to_cifs_acl(inode, full_path, mode); 2187 rc = id_mode_to_cifs_acl(inode, full_path, mode,
2188 NO_CHANGE_32, NO_CHANGE_32);
2173 if (rc) { 2189 if (rc) {
2174 cFYI(1, "%s: Setting ACL failed with error: %d", 2190 cFYI(1, "%s: Setting ACL failed with error: %d",
2175 __func__, rc); 2191 __func__, rc);
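id_mode_to_cifs_acl takes mode, uid and gid in a single call and uses NO_CHANGE_64/NO_CHANGE_32 sentinels for the fields the caller leaves alone, which is why the setattr path above seeds uid and gid with NO_CHANGE_32 and overwrites only what ATTR_UID/ATTR_GID provide. The sentinel idiom in a hypothetical reduction (the sentinel value is an assumption):

#include <stdint.h>

#define NO_CHANGE_32 0xFFFFFFFFu        /* assumed sentinel value */

struct attr_req { int set_uid, set_gid; uint32_t uid, gid; };

/* Translate "which fields changed" flags into sentinel-carrying args. */
static void fill_ids(const struct attr_req *req, uint32_t *uid, uint32_t *gid)
{
        *uid = req->set_uid ? req->uid : NO_CHANGE_32;
        *gid = req->set_gid ? req->gid : NO_CHANGE_32;
        /* the receiver skips any field equal to NO_CHANGE_32 */
}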
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index db3f18cdf024..8693b5d0e180 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -183,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
 static int
 CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
 		    const char *fromName, const char *toName,
-		    const struct nls_table *nls_codepage, int remap)
+		    struct cifs_sb_info *cifs_sb)
 {
 	int rc;
 	int oplock = 0;
+	int remap;
+	int create_options = CREATE_NOT_DIR;
 	__u16 netfid = 0;
 	u8 *buf;
 	unsigned int bytes_written = 0;
 	struct cifs_io_parms io_parms;
+	struct nls_table *nls_codepage;
+
+	nls_codepage = cifs_sb->local_nls;
+	remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
 
 	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
 	if (!buf)
@@ -202,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
 		return rc;
 	}
 
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
 	rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
-			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
+			 create_options, &netfid, &oplock, NULL,
 			 nls_codepage, remap);
 	if (rc != 0) {
 		kfree(buf);
@@ -559,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 	/* BB what if DFS and this volume is on different share? BB */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
 		rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
-					 cifs_sb->local_nls,
-					 cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
+					 cifs_sb);
 	else if (pTcon->unix_ext)
 		rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
 					   cifs_sb->local_nls);
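The CIFSCreateMFSymLink() change folds the separate nls/remap arguments into the cifs_sb pointer and, more importantly, honors backup intent: when the mount's backupuid/backupgid matches the current credentials, the create is issued with CREATE_OPEN_BACKUP_INTENT so the server can bypass ordinary ACL checks for backup tools. The gate is the backup_cred() helper added in fs/cifs/misc.c below; the open-flag pattern is simply:

	int create_options = CREATE_NOT_DIR;

	if (backup_cred(cifs_sb))	/* mount option + credential check */
		create_options |= CREATE_OPEN_BACKUP_INTENT;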
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 7c1693392598..703ef5c6fdb1 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
 }
 
 int
-checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
+checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read)
 {
-	__u32 len = be32_to_cpu(smb->smb_buf_length);
+	__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
 	__u32 clc_len;  /* calculated length */
-	cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
+	cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x",
+		total_read, rfclen);
 
-	if (length < 2 + sizeof(struct smb_hdr)) {
-		if ((length >= sizeof(struct smb_hdr) - 1)
+	/* is this frame too small to even get to a BCC? */
+	if (total_read < 2 + sizeof(struct smb_hdr)) {
+		if ((total_read >= sizeof(struct smb_hdr) - 1)
 			&& (smb->Status.CifsError != 0)) {
+			/* it's an error return */
 			smb->WordCount = 0;
 			/* some error cases do not return wct and bcc */
 			return 0;
-		} else if ((length == sizeof(struct smb_hdr) + 1) &&
+		} else if ((total_read == sizeof(struct smb_hdr) + 1) &&
 				(smb->WordCount == 0)) {
 			char *tmp = (char *)smb;
 			/* Need to work around a bug in two servers here */
@@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 	} else {
 		cERROR(1, "Length less than smb header size");
 	}
-	return 1;
-	}
-	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, "smb length greater than MaxBufSize, mid=%d",
-			smb->Mid);
-		return 1;
+	return -EIO;
 	}
 
+	/* otherwise, there is enough to get to the BCC */
 	if (check_smb_hdr(smb, mid))
-		return 1;
+		return -EIO;
 	clc_len = smbCalcSize(smb);
 
-	if (4 + len != length) {
+	if (4 + rfclen != total_read) {
 		cERROR(1, "Length read does not match RFC1001 length %d",
-			len);
-		return 1;
+			rfclen);
+		return -EIO;
 	}
 
-	if (4 + len != clc_len) {
+	if (4 + rfclen != clc_len) {
 		/* check if bcc wrapped around for large read responses */
-		if ((len > 64 * 1024) && (len > clc_len)) {
+		if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
 			/* check if lengths match mod 64K */
-			if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
+			if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
 				return 0; /* bcc wrapped */
 		}
 		cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
-			clc_len, 4 + len, smb->Mid);
+			clc_len, 4 + rfclen, smb->Mid);
 
-		if (4 + len < clc_len) {
+		if (4 + rfclen < clc_len) {
 			cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
-				len, smb->Mid);
-			return 1;
-		} else if (len > clc_len + 512) {
+				rfclen, smb->Mid);
+			return -EIO;
+		} else if (rfclen > clc_len + 512) {
 			/*
 			 * Some servers (Windows XP in particular) send more
 			 * data than the lengths in the SMB packet would
@@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 			 * data to 512 bytes.
 			 */
 			cERROR(1, "RFC1001 size %u more than 512 bytes larger "
-				"than SMB for mid=%u", len, smb->Mid);
-			return 1;
+				"than SMB for mid=%u", rfclen, smb->Mid);
+			return -EIO;
 		}
 	}
 	return 0;
@@ -676,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 		cinode->clientCanCacheRead = false;
 	}
 }
+
+bool
+backup_cred(struct cifs_sb_info *cifs_sb)
+{
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
+		if (cifs_sb->mnt_backupuid == current_fsuid())
+			return true;
+	}
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
+		if (in_group_p(cifs_sb->mnt_backupgid))
+			return true;
+	}
+
+	return false;
+}
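checkSMB() now distinguishes three lengths and returns -EIO instead of 1 on failure, so callers can propagate a real errno: total_read is what actually came off the socket, rfclen is what the 4-byte RFC1001 header claims follows it, and clc_len is what the SMB's own WordCount/ByteCount imply (via smbCalcSize()). A hedged sketch of the invariants being enforced, written as a standalone checker (function name and shape are illustrative, not from the patch):

	/* Sketch: the frame is sane when the three lengths agree. */
	static int smb_lengths_ok(unsigned int total_read, u32 rfclen, u32 clc_len)
	{
		if (4 + rfclen != total_read)	/* frame not fully/exactly read */
			return -EIO;
		if (4 + rfclen == clc_len)	/* header and body agree */
			return 0;
		/* large reads may legally wrap the 16-bit byte count */
		if (rfclen > 64 * 1024 && rfclen > clc_len &&
		    ((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
			return 0;
		if (4 + rfclen < clc_len)	/* truncated: SMB bigger than frame */
			return -EIO;
		if (rfclen > clc_len + 512)	/* some servers pad, but cap it */
			return -EIO;
		return 0;			/* tolerate modest trailing padding */
	}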
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d3e619692ee0..c7d80e24f24e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
 					/* that we use in next few lines */
 	/* Note that header is initialized to zero in header_assemble */
 	pSMB->req.AndXCommand = 0xFF;
-	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
+	pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32,
+					CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
+					USHRT_MAX));
 	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
 	pSMB->req.VcNumber = get_next_vcnum(ses);
 
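This fix matters because MaxBufferSize in SESSION_SETUP_ANDX is a 16-bit little-endian wire field: the client should advertise its own receive buffer (CIFSMaxBufSize plus header space, minus the 4-byte RFC1001 length) rather than echoing back the server's maxBuf, and the value must be clamped so the cpu_to_le16() conversion cannot silently truncate it. The clamp idiom, as used in the hunk:

	/* never let a >64K value wrap when stored into a __le16 field */
	pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32,
				CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
				USHRT_MAX));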
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 42b9fff48751..ac1221d969d6 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -265,91 +265,6 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
 	return rc;
 }
 
-#if 0 /* currently unused */
-/* Does both the NT and LM owfs of a user's password */
-static void
-nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
-{
-	char passwd[514];
-
-	memset(passwd, '\0', 514);
-	if (strlen(pwd) < 513)
-		strcpy(passwd, pwd);
-	else
-		memcpy(passwd, pwd, 512);
-	/* Calculate the MD4 hash (NT compatible) of the password */
-	memset(nt_p16, '\0', 16);
-	E_md4hash(passwd, nt_p16);
-
-	/* Mangle the passwords into Lanman format */
-	passwd[14] = '\0';
-/*	strupper(passwd); */
-
-	/* Calculate the SMB (lanman) hash functions of the password */
-
-	memset(p16, '\0', 16);
-	E_P16((unsigned char *) passwd, (unsigned char *) p16);
-
-	/* clear out local copy of user's password (just being paranoid). */
-	memset(passwd, '\0', sizeof(passwd));
-}
-#endif
-
-/* Does the NTLMv2 owfs of a user's password */
-#if 0 /* function not needed yet - but will be soon */
-static void
-ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
-		const char *domain_n, unsigned char kr_buf[16],
-		const struct nls_table *nls_codepage)
-{
-	wchar_t *user_u;
-	wchar_t *dom_u;
-	int user_l, domain_l;
-	struct HMACMD5Context ctx;
-
-	/* might as well do one alloc to hold both (user_u and dom_u) */
-	user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL);
-	if (user_u == NULL)
-		return;
-	dom_u = user_u + 1024;
-
-	/* push_ucs2(NULL, user_u, user_n, (user_l+1)*2,
-		STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER);
-	push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2,
-		STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */
-
-	/* BB user and domain may need to be uppercased */
-	user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage);
-	domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage);
-
-	user_l++;		/* trailing null */
-	domain_l++;
-
-	hmac_md5_init_limK_to_64(owf, 16, &ctx);
-	hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx);
-	hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx);
-	hmac_md5_final(kr_buf, &ctx);
-
-	kfree(user_u);
-}
-#endif
-
-/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
-#if 0 /* currently unused */
-static void
-NTLMSSPOWFencrypt(unsigned char passwd[8],
-		unsigned char *ntlmchalresp, unsigned char p24[24])
-{
-	unsigned char p21[21];
-
-	memset(p21, '\0', 21);
-	memcpy(p21, passwd, 8);
-	memset(p21 + 8, 0xbd, 8);
-
-	E_P24(p21, ntlmchalresp, p24);
-}
-#endif
-
 /* Does the NT MD4 hash then des encryption. */
 int
 SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
@@ -369,39 +284,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
 	rc = E_P24(p21, c8, p24);
 	return rc;
 }
-
-
-/* Does the md5 encryption from the NT hash for NTLMv2. */
-/* These routines will be needed later */
-#if 0
-static void
-SMBOWFencrypt_ntv2(const unsigned char kr[16],
-		const struct data_blob *srv_chal,
-		const struct data_blob *cli_chal, unsigned char resp_buf[16])
-{
-	struct HMACMD5Context ctx;
-
-	hmac_md5_init_limK_to_64(kr, 16, &ctx);
-	hmac_md5_update(srv_chal->data, srv_chal->length, &ctx);
-	hmac_md5_update(cli_chal->data, cli_chal->length, &ctx);
-	hmac_md5_final(resp_buf, &ctx);
-}
-
-static void
-SMBsesskeygen_ntv2(const unsigned char kr[16],
-		const unsigned char *nt_resp, __u8 sess_key[16])
-{
-	struct HMACMD5Context ctx;
-
-	hmac_md5_init_limK_to_64(kr, 16, &ctx);
-	hmac_md5_update(nt_resp, 16, &ctx);
-	hmac_md5_final((unsigned char *) sess_key, &ctx);
-}
-
-static void
-SMBsesskeygen_ntv1(const unsigned char kr[16],
-		const unsigned char *nt_resp, __u8 sess_key[16])
-{
-	mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16);
-}
-#endif
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c1b9c4b10739..0cc9584f5889 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -26,6 +26,7 @@
 #include <linux/wait.h>
 #include <linux/net.h>
 #include <linux/delay.h>
+#include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <linux/mempool.h>
@@ -266,15 +267,11 @@ static int wait_for_free_request(struct TCP_Server_Info *server,
 	while (1) {
 		if (atomic_read(&server->inFlight) >= cifs_max_pending) {
 			spin_unlock(&GlobalMid_Lock);
-#ifdef CONFIG_CIFS_STATS2
-			atomic_inc(&server->num_waiters);
-#endif
+			cifs_num_waiters_inc(server);
 			wait_event(server->request_q,
 				   atomic_read(&server->inFlight)
 				     < cifs_max_pending);
-#ifdef CONFIG_CIFS_STATS2
-			atomic_dec(&server->num_waiters);
-#endif
+			cifs_num_waiters_dec(server);
 			spin_lock(&GlobalMid_Lock);
 		} else {
 			if (server->tcpStatus == CifsExiting) {
@@ -328,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
 {
 	int error;
 
-	error = wait_event_killable(server->response_q,
+	error = wait_event_freezekillable(server->response_q,
 				    midQ->midState != MID_REQUEST_SUBMITTED);
 	if (error < 0)
 		return -ERESTARTSYS;
@@ -343,8 +340,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
  */
 int
 cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
-		unsigned int nvec, mid_callback_t *callback, void *cbdata,
-		bool ignore_pend)
+		unsigned int nvec, mid_receive_t *receive,
+		mid_callback_t *callback, void *cbdata, bool ignore_pend)
 {
 	int rc;
 	struct mid_q_entry *mid;
@@ -378,18 +375,17 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
 		goto out_err;
 	}
 
+	mid->receive = receive;
 	mid->callback = callback;
 	mid->callback_data = cbdata;
 	mid->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
-	atomic_inc(&server->inSend);
-#endif
+
+	cifs_in_send_inc(server);
 	rc = smb_sendv(server, iov, nvec);
-#ifdef CONFIG_CIFS_STATS2
-	atomic_dec(&server->inSend);
-	mid->when_sent = jiffies;
-#endif
+	cifs_in_send_dec(server);
+	cifs_save_when_sent(mid);
 	mutex_unlock(&server->srv_mutex);
+
 	if (rc)
 		goto out_err;
 
@@ -502,13 +498,18 @@ int
 cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 		   bool log_error)
 {
-	dump_smb(mid->resp_buf,
-		 min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length)));
+	unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4;
+
+	dump_smb(mid->resp_buf, min_t(u32, 92, len));
 
 	/* convert the length into a more usable form */
 	if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		struct kvec iov;
+
+		iov.iov_base = mid->resp_buf;
+		iov.iov_len = len;
 		/* FIXME: add code to kill session */
-		if (cifs_verify_signature(mid->resp_buf, server,
+		if (cifs_verify_signature(&iov, 1, server,
 					  mid->sequence_number + 1) != 0)
 			cERROR(1, "Unexpected SMB signature");
 	}
@@ -575,14 +576,10 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 	}
 
 	midQ->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
-	atomic_inc(&ses->server->inSend);
-#endif
+	cifs_in_send_inc(ses->server);
 	rc = smb_sendv(ses->server, iov, n_vec);
-#ifdef CONFIG_CIFS_STATS2
-	atomic_dec(&ses->server->inSend);
-	midQ->when_sent = jiffies;
-#endif
+	cifs_in_send_dec(ses->server);
+	cifs_save_when_sent(midQ);
 
 	mutex_unlock(&ses->server->srv_mutex);
 
@@ -703,14 +700,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	}
 
 	midQ->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
-	atomic_inc(&ses->server->inSend);
-#endif
+
+	cifs_in_send_inc(ses->server);
 	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
-#ifdef CONFIG_CIFS_STATS2
-	atomic_dec(&ses->server->inSend);
-	midQ->when_sent = jiffies;
-#endif
+	cifs_in_send_dec(ses->server);
+	cifs_save_when_sent(midQ);
 	mutex_unlock(&ses->server->srv_mutex);
 
 	if (rc < 0)
@@ -843,14 +837,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 
 	midQ->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
-	atomic_inc(&ses->server->inSend);
-#endif
+	cifs_in_send_inc(ses->server);
 	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
-#ifdef CONFIG_CIFS_STATS2
-	atomic_dec(&ses->server->inSend);
-	midQ->when_sent = jiffies;
-#endif
+	cifs_in_send_dec(ses->server);
+	cifs_save_when_sent(midQ);
 	mutex_unlock(&ses->server->srv_mutex);
 
 	if (rc < 0) {
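The repeated #ifdef CONFIG_CIFS_STATS2 blocks in the send paths are replaced by cifs_in_send_inc()/cifs_in_send_dec(), cifs_save_when_sent() and cifs_num_waiters_inc()/_dec(). Their definitions are not part of this hunk; presumably they are static inlines (in cifsglob.h) that compile away when the stats option is off, roughly:

	#ifdef CONFIG_CIFS_STATS2
	static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
	{
		atomic_inc(&server->inSend);
	}
	static inline void cifs_save_when_sent(struct mid_q_entry *mid)
	{
		mid->when_sent = jiffies;	/* timestamp for the stats output */
	}
	#else
	static inline void cifs_in_send_inc(struct TCP_Server_Info *server) {}
	static inline void cifs_save_when_sent(struct mid_q_entry *mid) {}
	#endif

This keeps the fast path free of preprocessor clutter while still costing nothing when CONFIG_CIFS_STATS2 is not set.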
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 2a22fb2989e4..45f07c46f3ed 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -31,16 +32,8 @@
 #define MAX_EA_VALUE_SIZE 65535
 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
 #define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
-#define CIFS_XATTR_USER_PREFIX "user."
-#define CIFS_XATTR_SYSTEM_PREFIX "system."
-#define CIFS_XATTR_OS2_PREFIX "os2."
-#define CIFS_XATTR_SECURITY_PREFIX "security."
-#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
-#define XATTR_TRUSTED_PREFIX_LEN  8
-#define XATTR_SECURITY_PREFIX_LEN 9
-/* BB need to add server (Samba e.g) support for security and trusted prefix */
-
 
+/* BB need to add server (Samba e.g) support for security and trusted prefix */
 
 int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 {
@@ -76,8 +69,8 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 	}
 	if (ea_name == NULL) {
 		cFYI(1, "Null xattr names not supported");
-	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
-		&& (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
+	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+		&& (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) {
 		cFYI(1,
 		     "illegal xattr request %s (only user namespace supported)",
 		     ea_name);
@@ -88,7 +81,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 		goto remove_ea_exit;
 
-	ea_name += 5; /* skip past user. prefix */
+	ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
 	rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL,
 		(__u16)0, cifs_sb->local_nls,
 		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -149,21 +142,23 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 
 	if (ea_name == NULL) {
 		cFYI(1, "Null xattr names not supported");
-	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+		   == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
 			cFYI(1, "attempt to set cifs inode metadata");
 
-		ea_name += 5; /* skip past user. prefix */
+		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
 			(__u16)value_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	} else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+	} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
+		   == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 
-		ea_name += 4; /* skip past os2. prefix */
+		ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
 			(__u16)value_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -178,7 +173,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 #ifdef CONFIG_CIFS_ACL
 			memcpy(pacl, ea_value, value_size);
 			rc = set_cifs_acl(pacl, value_size,
-				direntry->d_inode, full_path);
+				direntry->d_inode, full_path, CIFS_ACL_DACL);
 			if (rc == 0) /* force revalidate of the inode */
 				CIFS_I(direntry->d_inode)->time = 0;
 			kfree(pacl);
@@ -269,7 +264,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	/* return alt name if available as pseudo attr */
 	if (ea_name == NULL) {
 		cFYI(1, "Null xattr names not supported");
-	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+		   == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
@@ -277,15 +273,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			cFYI(1, "attempt to query cifs inode metadata");
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
-		ea_name += 5; /* skip past user. prefix */
+		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
 		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 			buf_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	} else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+	} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
-		ea_name += 4; /* skip past os2. prefix */
+		ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
 		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 			buf_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -339,10 +335,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		cFYI(1, "Query CIFS ACL not supported yet");
 #endif /* CONFIG_CIFS_ACL */
 	} else if (strncmp(ea_name,
-		  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
+		  XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
 		cFYI(1, "Trusted xattr namespace not supported yet");
 	} else if (strncmp(ea_name,
-		  CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
+		  XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
 		cFYI(1, "Security xattr namespace not supported yet");
 	} else
 		cFYI(1,
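The CIFS_XATTR_* prefix macros removed above duplicated generic definitions that <linux/xattr.h> already provides, and the hand-counted lengths (5 for "user.", 4 for "os2.") invited off-by-one bugs. For reference, the generic header ties each prefix to its length along these lines (values assumed from the mainline header, not part of this patch):

	#define XATTR_USER_PREFIX	"user."
	#define XATTR_USER_PREFIX_LEN	(sizeof(XATTR_USER_PREFIX) - 1)

so the prefix string and its length can never drift apart.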
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index 44e17e9c21ae..cc0ea9fe5ecf 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -59,12 +59,11 @@ void coda_sysctl_clean(void);
 
 #define CODA_ALLOC(ptr, cast, size) do { \
     if (size < PAGE_SIZE) \
-        ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
+        ptr = kzalloc((unsigned long) size, GFP_KERNEL); \
     else \
-        ptr = (cast)vmalloc((unsigned long) size); \
+        ptr = (cast)vzalloc((unsigned long) size); \
     if (!ptr) \
         printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
-    else memset( ptr, 0, size ); \
 } while (0)
 
 
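The CODA_ALLOC() cleanup swaps the allocate-then-memset pair for the zeroing allocators, which is shorter and avoids the guarded memset entirely. The general equivalence (a sketch, not Coda-specific):

	/* before: two steps, and the memset must be guarded */
	ptr = kmalloc(size, GFP_KERNEL);
	if (ptr)
		memset(ptr, 0, size);

	/* after: one call returns already-zeroed memory */
	ptr = kzalloc(size, GFP_KERNEL);

vzalloc() plays the same role for vmalloc()-sized allocations.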
diff --git a/fs/compat.c b/fs/compat.c
index 0b48d018e38a..302e761bd0aa 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -37,7 +37,6 @@
 #include <linux/dirent.h>
 #include <linux/fsnotify.h>
 #include <linux/highuid.h>
-#include <linux/nfsd/syscall.h>
 #include <linux/personality.h>
 #include <linux/rwsem.h>
 #include <linux/tsacct_kern.h>
@@ -247,11 +246,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
 	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
 	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
 	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
-	    __put_user(0, &ubuf->f_spare[0]) ||
-	    __put_user(0, &ubuf->f_spare[1]) ||
-	    __put_user(0, &ubuf->f_spare[2]) ||
-	    __put_user(0, &ubuf->f_spare[3]) ||
-	    __put_user(0, &ubuf->f_spare[4]))
+	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
 		return -EFAULT;
 	return 0;
 }
@@ -1675,11 +1671,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
-long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2)
-{
-	return sys_ni_syscall();
-}
-
 #ifdef CONFIG_EPOLL
 
 #ifdef HAVE_SET_RESTORE_SIGMASK
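In put_compat_statfs(), the five __put_user(0, &ubuf->f_spare[N]) calls collapse into a single __clear_user() over the whole array, and the real f_flags value is now copied out to userspace instead of being dropped. The range-clear idiom, as used in the hunk:

	/* zero the userspace spare words in one shot; nonzero means a fault */
	if (__clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
		return -EFAULT;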
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8be086e9abe4..51352de88ef1 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1003,6 +1003,7 @@ COMPATIBLE_IOCTL(PPPIOCCONNECT)
 COMPATIBLE_IOCTL(PPPIOCDISCONN)
 COMPATIBLE_IOCTL(PPPIOCATTCHAN)
 COMPATIBLE_IOCTL(PPPIOCGCHAN)
+COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
 /* PPPOX */
 COMPATIBLE_IOCTL(PPPOEIOCSFWD)
 COMPATIBLE_IOCTL(PPPOEIOCDFWD)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c83f4768eeaa..ca418aaf6352 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -23,7 +23,8 @@
  *
  * configfs Copyright (C) 2005 Oracle.  All rights reserved.
  *
- * Please see Documentation/filesystems/configfs.txt for more information.
+ * Please see Documentation/filesystems/configfs/configfs.txt for more
+ * information.
  */
 
 #undef DEBUG
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 76dc4c3e5d51..50cee7f9110b 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -23,7 +23,7 @@
  *
  * configfs Copyright (C) 2005 Oracle.  All rights reserved.
  *
- * Please see the file Documentation/filesystems/configfs.txt for
+ * Please see the file Documentation/filesystems/configfs/configfs.txt for
  * critical information about using the config_item interface.
  */
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e7a7a2f07324..f3a257d7a985 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -1,5 +1,5 @@
 /*
- *  file.c - part of debugfs, a tiny little debug file system
+ *  inode.c - part of debugfs, a tiny little debug file system
  *
  *  Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
  *  Copyright (C) 2004 IBM Inc.
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 44a360ca8046..d740ab67ff6e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -39,7 +39,7 @@
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
- * the size of a structure on the stack.
+ * the size of a structure in the slab cache
  */
 #define DIO_PAGES	64
 
@@ -55,13 +55,10 @@
  * blocksize.
  */
 
-struct dio {
-	/* BIO submission state */
+/* dio_state only used in the submission path */
+
+struct dio_submit {
 	struct bio *bio;		/* bio under assembly */
-	struct inode *inode;
-	int rw;
-	loff_t i_size;			/* i_size when submitted */
-	int flags;			/* doesn't change */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -76,18 +73,17 @@ struct dio {
 	sector_t block_in_file;		/* Current offset into the underlying
 					   file in dio_block units. */
 	unsigned blocks_available;	/* At block_in_file.  changes */
+	int reap_counter;		/* rate limit reaping */
 	sector_t final_block_in_request;/* doesn't change */
 	unsigned first_block_in_page;	/* doesn't change, Used only once */
 	int boundary;			/* prev block is at a boundary */
-	int reap_counter;		/* rate limit reaping */
 	get_block_t *get_block;		/* block mapping function */
-	dio_iodone_t *end_io;		/* IO completion function */
 	dio_submit_t *submit_io;	/* IO submition function */
+
 	loff_t logical_offset_in_bio;	/* current first logical block in bio */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
 	sector_t next_block_for_io;	/* next block to be put under IO,
 					   in dio_blocks units */
-	struct buffer_head map_bh;	/* last get_block() result */
 
 	/*
 	 * Deferred addition of a page to the dio.  These variables are
@@ -100,18 +96,6 @@ struct dio {
 	sector_t cur_page_block;	/* Where it starts */
 	loff_t cur_page_fs_offset;	/* Offset in file */
 
-	/* BIO completion state */
-	spinlock_t bio_lock;		/* protects BIO fields below */
-	unsigned long refcount;		/* direct_io_worker() and bios */
-	struct bio *bio_list;		/* singly linked via bi_private */
-	struct task_struct *waiter;	/* waiting task (NULL if none) */
-
-	/* AIO related stuff */
-	struct kiocb *iocb;		/* kiocb */
-	int is_async;			/* is IO async ? */
-	int io_error;			/* IO error in completion path */
-	ssize_t result;			/* IO result */
-
 	/*
 	 * Page fetching state. These variables belong to dio_refill_pages().
 	 */
@@ -125,7 +109,30 @@ struct dio {
 	 */
 	unsigned head;			/* next page to process */
 	unsigned tail;			/* last valid page + 1 */
+};
+
+/* dio_state communicated between submission path and end_io */
+struct dio {
+	int flags;			/* doesn't change */
+	int rw;
+	struct inode *inode;
+	loff_t i_size;			/* i_size when submitted */
+	dio_iodone_t *end_io;		/* IO completion function */
+
+	void *private;			/* copy from map_bh.b_private */
+
+	/* BIO completion state */
+	spinlock_t bio_lock;		/* protects BIO fields below */
 	int page_errors;		/* errno from get_user_pages() */
+	int is_async;			/* is IO async ? */
+	int io_error;			/* IO error in completion path */
+	unsigned long refcount;		/* direct_io_worker() and bios */
+	struct bio *bio_list;		/* singly linked via bi_private */
+	struct task_struct *waiter;	/* waiting task (NULL if none) */
+
+	/* AIO related stuff */
+	struct kiocb *iocb;		/* kiocb */
+	ssize_t result;                 /* IO result */
 
 	/*
 	 * pages[] (and any fields placed after it) are not zeroed out at
@@ -133,7 +140,9 @@ struct dio {
 	 * wish that they not be zeroed.
 	 */
 	struct page *pages[DIO_PAGES];	/* page buffer */
-};
+} ____cacheline_aligned_in_smp;
+
+static struct kmem_cache *dio_cache __read_mostly;
 
 static void __inode_dio_wait(struct inode *inode)
 {
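The struct split above is the heart of this direct-io rework: everything touched only while building and submitting bios (the page window, block cursors, get_block state) moves into struct dio_submit, while the fields the completion path needs survive in a slimmer struct dio that is now slab-allocated from dio_cache and cacheline-aligned to avoid false sharing with the submitting CPU. A hedged sketch of how the two are presumably set up in the submission path (names from this patch; the actual do_blockdev_direct_IO() call site is not shown on this page):

	struct dio_submit sdio = {0};	/* short-lived: lives on the stack */
	struct dio *dio;

	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);	/* long-lived state */
	if (!dio)
		return -ENOMEM;
	memset(dio, 0, offsetof(struct dio, pages));	/* pages[] stays dirty */

	sdio.blkbits = blkbits;		/* submission-only bookkeeping */
	dio->inode = inode;		/* completion path still needs these */
	dio->rw = rw;

The memset-to-offsetof trick matches the comment kept in the struct: pages[] is deliberately left uninitialized because dio_refill_pages() overwrites it anyway.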
@@ -182,27 +191,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done);
 /*
  * How many pages are in the queue?
  */
-static inline unsigned dio_pages_present(struct dio *dio)
+static inline unsigned dio_pages_present(struct dio_submit *sdio)
 {
-	return dio->tail - dio->head;
+	return sdio->tail - sdio->head;
 }
 
 /*
  * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
  */
-static int dio_refill_pages(struct dio *dio)
+static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 	int nr_pages;
 
-	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+	nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
 	ret = get_user_pages_fast(
-		dio->curr_user_address,		/* Where from? */
+		sdio->curr_user_address,	/* Where from? */
 		nr_pages,			/* How many pages? */
 		dio->rw == READ,		/* Write to memory? */
 		&dio->pages[0]);		/* Put results here */
 
-	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
+	if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(0);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
@@ -213,17 +222,17 @@ static int dio_refill_pages(struct dio *dio)
 		dio->page_errors = ret;
 		page_cache_get(page);
 		dio->pages[0] = page;
-		dio->head = 0;
-		dio->tail = 1;
+		sdio->head = 0;
+		sdio->tail = 1;
 		ret = 0;
 		goto out;
 	}
 
 	if (ret >= 0) {
-		dio->curr_user_address += ret * PAGE_SIZE;
-		dio->curr_page += ret;
-		dio->head = 0;
-		dio->tail = ret;
+		sdio->curr_user_address += ret * PAGE_SIZE;
+		sdio->curr_page += ret;
+		sdio->head = 0;
+		sdio->tail = ret;
 		ret = 0;
 	}
 out:
@@ -236,17 +245,18 @@ out:
  * decent number of pages, less frequently.  To provide nicer use of the
  * L1 cache.
  */
-static struct page *dio_get_page(struct dio *dio)
+static inline struct page *dio_get_page(struct dio *dio,
+		struct dio_submit *sdio)
 {
-	if (dio_pages_present(dio) == 0) {
+	if (dio_pages_present(sdio) == 0) {
 		int ret;
 
-		ret = dio_refill_pages(dio);
+		ret = dio_refill_pages(dio, sdio);
 		if (ret)
 			return ERR_PTR(ret);
-		BUG_ON(dio_pages_present(dio) == 0);
+		BUG_ON(dio_pages_present(sdio) == 0);
 	}
-	return dio->pages[dio->head++];
+	return dio->pages[sdio->head++];
 }
 
 /**
@@ -292,7 +302,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
 
 	if (dio->end_io && dio->result) {
 		dio->end_io(dio->iocb, offset, transferred,
-			    dio->map_bh.b_private, ret, is_async);
+			    dio->private, ret, is_async);
 	} else {
 		if (is_async)
 			aio_complete(dio->iocb, ret, 0);
@@ -323,7 +333,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
 
 	if (remaining == 0) {
 		dio_complete(dio, dio->iocb->ki_pos, 0, true);
-		kfree(dio);
+		kmem_cache_free(dio_cache, dio);
 	}
 }
 
@@ -367,9 +377,10 @@ void dio_end_io(struct bio *bio, int error)
 }
 EXPORT_SYMBOL_GPL(dio_end_io);
 
-static void
-dio_bio_alloc(struct dio *dio, struct block_device *bdev,
-	      sector_t first_sector, int nr_vecs)
+static inline void
+dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
+	      struct block_device *bdev,
+	      sector_t first_sector, int nr_vecs)
 {
 	struct bio *bio;
 
@@ -386,8 +397,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 	else
 		bio->bi_end_io = dio_bio_end_io;
 
-	dio->bio = bio;
-	dio->logical_offset_in_bio = dio->cur_page_fs_offset;
+	sdio->bio = bio;
+	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
 }
 
 /*
@@ -397,9 +408,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
  *
  * bios hold a dio reference between submit_bio and ->end_io.
  */
-static void dio_bio_submit(struct dio *dio)
+static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 {
-	struct bio *bio = dio->bio;
+	struct bio *bio = sdio->bio;
 	unsigned long flags;
 
 	bio->bi_private = dio;
@@ -411,24 +422,24 @@ static void dio_bio_submit(struct dio *dio)
 	if (dio->is_async && dio->rw == READ)
 		bio_set_pages_dirty(bio);
 
-	if (dio->submit_io)
-		dio->submit_io(dio->rw, bio, dio->inode,
-			       dio->logical_offset_in_bio);
+	if (sdio->submit_io)
+		sdio->submit_io(dio->rw, bio, dio->inode,
+			       sdio->logical_offset_in_bio);
 	else
 		submit_bio(dio->rw, bio);
 
-	dio->bio = NULL;
-	dio->boundary = 0;
-	dio->logical_offset_in_bio = 0;
+	sdio->bio = NULL;
+	sdio->boundary = 0;
+	sdio->logical_offset_in_bio = 0;
 }
 
 /*
  * Release any resources in case of a failure
  */
-static void dio_cleanup(struct dio *dio)
+static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
-	while (dio_pages_present(dio))
-		page_cache_release(dio_get_page(dio));
+	while (dio_pages_present(sdio))
+		page_cache_release(dio_get_page(dio, sdio));
 }
 
 /*
@@ -518,11 +529,11 @@ static void dio_await_completion(struct dio *dio)
  *
  * This also helps to limit the peak amount of pinned userspace memory.
  */
-static int dio_bio_reap(struct dio *dio)
+static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret = 0;
 
-	if (dio->reap_counter++ >= 64) {
+	if (sdio->reap_counter++ >= 64) {
 		while (dio->bio_list) {
 			unsigned long flags;
 			struct bio *bio;
@@ -536,14 +547,14 @@ static int dio_bio_reap(struct dio *dio)
 			if (ret == 0)
 				ret = ret2;
 		}
-		dio->reap_counter = 0;
+		sdio->reap_counter = 0;
 	}
 	return ret;
 }
 
 /*
  * Call into the fs to map some more disk blocks.  We record the current number
- * of available blocks at dio->blocks_available.  These are in units of the
+ * of available blocks at sdio->blocks_available.  These are in units of the
  * fs blocksize, (1 << inode->i_blkbits).
  *
  * The fs is allowed to map lots of blocks at once.  If it wants to do that,
@@ -564,10 +575,10 @@ static int dio_bio_reap(struct dio *dio)
 * buffer_mapped().  However the direct-io code will only process holes one
 * block at a time - it will repeatedly call get_block() as it walks the hole.
 */
-static int get_more_blocks(struct dio *dio)
+static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
+			   struct buffer_head *map_bh)
 {
 	int ret;
-	struct buffer_head *map_bh = &dio->map_bh;
 	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	unsigned long dio_count;/* Number of dio_block-sized blocks */
@@ -580,11 +591,11 @@ static int get_more_blocks(struct dio *dio)
 	 */
 	ret = dio->page_errors;
 	if (ret == 0) {
-		BUG_ON(dio->block_in_file >= dio->final_block_in_request);
-		fs_startblk = dio->block_in_file >> dio->blkfactor;
-		dio_count = dio->final_block_in_request - dio->block_in_file;
-		fs_count = dio_count >> dio->blkfactor;
-		blkmask = (1 << dio->blkfactor) - 1;
+		BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
+		fs_startblk = sdio->block_in_file >> sdio->blkfactor;
+		dio_count = sdio->final_block_in_request - sdio->block_in_file;
+		fs_count = dio_count >> sdio->blkfactor;
+		blkmask = (1 << sdio->blkfactor) - 1;
 		if (dio_count & blkmask)
 			fs_count++;
 
@@ -604,13 +615,16 @@ static int get_more_blocks(struct dio *dio)
 		 */
 		create = dio->rw & WRITE;
 		if (dio->flags & DIO_SKIP_HOLES) {
-			if (dio->block_in_file < (i_size_read(dio->inode) >>
-							dio->blkbits))
+			if (sdio->block_in_file < (i_size_read(dio->inode) >>
+							sdio->blkbits))
 				create = 0;
 		}
 
-		ret = (*dio->get_block)(dio->inode, fs_startblk,
+		ret = (*sdio->get_block)(dio->inode, fs_startblk,
 						map_bh, create);
+
+		/* Store for completion */
+		dio->private = map_bh->b_private;
 	}
 	return ret;
 }
@@ -618,20 +632,21 @@ static int get_more_blocks(struct dio *dio)
 /*
  * There is no bio.  Make one now.
  */
-static int dio_new_bio(struct dio *dio, sector_t start_sector)
+static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
+		sector_t start_sector, struct buffer_head *map_bh)
 {
 	sector_t sector;
 	int ret, nr_pages;
 
-	ret = dio_bio_reap(dio);
+	ret = dio_bio_reap(dio, sdio);
 	if (ret)
 		goto out;
-	sector = start_sector << (dio->blkbits - 9);
-	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+	sector = start_sector << (sdio->blkbits - 9);
+	nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
 	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
-	dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
-	dio->boundary = 0;
+	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
+	sdio->boundary = 0;
 out:
 	return ret;
 }
@@ -643,21 +658,21 @@ out:
  *
  * Return zero on success.  Non-zero means the caller needs to start a new BIO.
  */
-static int dio_bio_add_page(struct dio *dio)
+static inline int dio_bio_add_page(struct dio_submit *sdio)
 {
 	int ret;
 
-	ret = bio_add_page(dio->bio, dio->cur_page,
-			dio->cur_page_len, dio->cur_page_offset);
-	if (ret == dio->cur_page_len) {
+	ret = bio_add_page(sdio->bio, sdio->cur_page,
+			sdio->cur_page_len, sdio->cur_page_offset);
+	if (ret == sdio->cur_page_len) {
 		/*
 		 * Decrement count only, if we are done with this page
 		 */
-		if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
-			dio->pages_in_io--;
-		page_cache_get(dio->cur_page);
-		dio->final_block_in_bio = dio->cur_page_block +
-			(dio->cur_page_len >> dio->blkbits);
+		if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
+			sdio->pages_in_io--;
+		page_cache_get(sdio->cur_page);
+		sdio->final_block_in_bio = sdio->cur_page_block +
+			(sdio->cur_page_len >> sdio->blkbits);
 		ret = 0;
 	} else {
 		ret = 1;
@@ -675,14 +690,15 @@ static int dio_bio_add_page(struct dio *dio)
  * The caller of this function is responsible for removing cur_page from the
  * dio, and for dropping the refcount which came from that presence.
  */
-static int dio_send_cur_page(struct dio *dio)
+static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
+		struct buffer_head *map_bh)
 {
 	int ret = 0;
 
-	if (dio->bio) {
-		loff_t cur_offset = dio->cur_page_fs_offset;
-		loff_t bio_next_offset = dio->logical_offset_in_bio +
-			dio->bio->bi_size;
+	if (sdio->bio) {
+		loff_t cur_offset = sdio->cur_page_fs_offset;
+		loff_t bio_next_offset = sdio->logical_offset_in_bio +
+			sdio->bio->bi_size;
 
 		/*
 		 * See whether this new request is contiguous with the old.
@@ -698,28 +714,28 @@ static int dio_send_cur_page(struct dio *dio)
 		 * be the next logical offset in the bio, submit the bio we
 		 * have.
 		 */
-		if (dio->final_block_in_bio != dio->cur_page_block ||
+		if (sdio->final_block_in_bio != sdio->cur_page_block ||
 		    cur_offset != bio_next_offset)
-			dio_bio_submit(dio);
+			dio_bio_submit(dio, sdio);
 		/*
 		 * Submit now if the underlying fs is about to perform a
 		 * metadata read
 		 */
-		else if (dio->boundary)
-			dio_bio_submit(dio);
+		else if (sdio->boundary)
+			dio_bio_submit(dio, sdio);
 	}
 
-	if (dio->bio == NULL) {
-		ret = dio_new_bio(dio, dio->cur_page_block);
+	if (sdio->bio == NULL) {
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret)
 			goto out;
 	}
 
-	if (dio_bio_add_page(dio) != 0) {
-		dio_bio_submit(dio);
-		ret = dio_new_bio(dio, dio->cur_page_block);
+	if (dio_bio_add_page(sdio) != 0) {
+		dio_bio_submit(dio, sdio);
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret == 0) {
-			ret = dio_bio_add_page(dio);
+			ret = dio_bio_add_page(sdio);
 			BUG_ON(ret != 0);
 		}
 	}
@@ -744,9 +760,10 @@ out:
  * If that doesn't work out then we put the old page into the bio and add this
  * page to the dio instead.
  */
-static int
-submit_page_section(struct dio *dio, struct page *page,
-		unsigned offset, unsigned len, sector_t blocknr)
+static inline int
+submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
+		unsigned offset, unsigned len, sector_t blocknr,
+		struct buffer_head *map_bh)
 {
 	int ret = 0;
 
@@ -760,20 +777,20 @@ submit_page_section(struct dio *dio, struct page *page,
 	/*
 	 * Can we just grow the current page's presence in the dio?
 	 */
-	if (	(dio->cur_page == page) &&
-		(dio->cur_page_offset + dio->cur_page_len == offset) &&
-		(dio->cur_page_block +
-			(dio->cur_page_len >> dio->blkbits) == blocknr)) {
-		dio->cur_page_len += len;
+	if (sdio->cur_page == page &&
+	    sdio->cur_page_offset + sdio->cur_page_len == offset &&
+	    sdio->cur_page_block +
+	    (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
+		sdio->cur_page_len += len;
 
 		/*
-		 * If dio->boundary then we want to schedule the IO now to
+		 * If sdio->boundary then we want to schedule the IO now to
 		 * avoid metadata seeks.
 		 */
-		if (dio->boundary) {
-			ret = dio_send_cur_page(dio);
-			page_cache_release(dio->cur_page);
-			dio->cur_page = NULL;
+		if (sdio->boundary) {
+			ret = dio_send_cur_page(dio, sdio, map_bh);
+			page_cache_release(sdio->cur_page);
+			sdio->cur_page = NULL;
 		}
 		goto out;
 	}
@@ -781,20 +798,20 @@ submit_page_section(struct dio *dio, struct page *page,
 	/*
 	 * If there's a deferred page already there then send it.
 	 */
-	if (dio->cur_page) {
-		ret = dio_send_cur_page(dio);
-		page_cache_release(dio->cur_page);
-		dio->cur_page = NULL;
+	if (sdio->cur_page) {
+		ret = dio_send_cur_page(dio, sdio, map_bh);
+		page_cache_release(sdio->cur_page);
+		sdio->cur_page = NULL;
 		if (ret)
 			goto out;
 	}
 
 	page_cache_get(page);		/* It is in dio */
-	dio->cur_page = page;
-	dio->cur_page_offset = offset;
-	dio->cur_page_len = len;
-	dio->cur_page_block = blocknr;
-	dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
+	sdio->cur_page = page;
+	sdio->cur_page_offset = offset;
+	sdio->cur_page_len = len;
+	sdio->cur_page_block = blocknr;
+	sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
 out:
 	return ret;
 }
@@ -804,16 +821,16 @@ out:
 * file blocks.  Only called for S_ISREG files - blockdevs do not set
 * buffer_new
 */
-static void clean_blockdev_aliases(struct dio *dio)
+static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
 {
 	unsigned i;
 	unsigned nblocks;
 
-	nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits;
+	nblocks = map_bh->b_size >> dio->inode->i_blkbits;
 
 	for (i = 0; i < nblocks; i++) {
-		unmap_underlying_metadata(dio->map_bh.b_bdev,
-					dio->map_bh.b_blocknr + i);
+		unmap_underlying_metadata(map_bh->b_bdev,
+					  map_bh->b_blocknr + i);
 	}
 }
 
@@ -826,19 +843,20 @@ static void clean_blockdev_aliases(struct dio *dio)
826 * `end' is zero if we're doing the start of the IO, 1 at the end of the 843 * `end' is zero if we're doing the start of the IO, 1 at the end of the
827 * IO. 844 * IO.
828 */ 845 */
829static void dio_zero_block(struct dio *dio, int end) 846static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
847 int end, struct buffer_head *map_bh)
830{ 848{
831 unsigned dio_blocks_per_fs_block; 849 unsigned dio_blocks_per_fs_block;
832 unsigned this_chunk_blocks; /* In dio_blocks */ 850 unsigned this_chunk_blocks; /* In dio_blocks */
833 unsigned this_chunk_bytes; 851 unsigned this_chunk_bytes;
834 struct page *page; 852 struct page *page;
835 853
836 dio->start_zero_done = 1; 854 sdio->start_zero_done = 1;
837 if (!dio->blkfactor || !buffer_new(&dio->map_bh)) 855 if (!sdio->blkfactor || !buffer_new(map_bh))
838 return; 856 return;
839 857
840 dio_blocks_per_fs_block = 1 << dio->blkfactor; 858 dio_blocks_per_fs_block = 1 << sdio->blkfactor;
841 this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); 859 this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
842 860
843 if (!this_chunk_blocks) 861 if (!this_chunk_blocks)
844 return; 862 return;
@@ -850,14 +868,14 @@ static void dio_zero_block(struct dio *dio, int end)
850 if (end) 868 if (end)
851 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; 869 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
852 870
853 this_chunk_bytes = this_chunk_blocks << dio->blkbits; 871 this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
854 872
855 page = ZERO_PAGE(0); 873 page = ZERO_PAGE(0);
856 if (submit_page_section(dio, page, 0, this_chunk_bytes, 874 if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
857 dio->next_block_for_io)) 875 sdio->next_block_for_io, map_bh))
858 return; 876 return;
859 877
860 dio->next_block_for_io += this_chunk_blocks; 878 sdio->next_block_for_io += this_chunk_blocks;
861} 879}
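
The masking above is plain modular arithmetic. A runnable sketch with example numbers (4K fs blocks over 512-byte dio blocks, so blkfactor = 3):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned blkbits = 9;          /* 512-byte dio blocks */
	unsigned blkfactor = 3;        /* fs block = 2^3 = 8 dio blocks */
	uint64_t block_in_file = 13;   /* I/O starts mid fs-block */

	unsigned per_fs_block = 1u << blkfactor;
	unsigned chunk = block_in_file & (per_fs_block - 1);

	/* end == 0: zero the 'chunk' dio blocks in front of the I/O;
	 * end == 1: zero the remaining blocks behind it instead. */
	printf("front zeroing: %u blocks (%u bytes)\n", chunk, chunk << blkbits);
	printf("tail zeroing:  %u blocks\n", per_fs_block - chunk);
	return 0;
}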
862 880
863/* 881/*
@@ -876,20 +894,20 @@ static void dio_zero_block(struct dio *dio, int end)
876 * it should set b_size to PAGE_SIZE or more inside get_block(). This gives 894 * it should set b_size to PAGE_SIZE or more inside get_block(). This gives
877 * fine alignment but still allows this function to work in PAGE_SIZE units. 895 * fine alignment but still allows this function to work in PAGE_SIZE units.
878 */ 896 */
879static int do_direct_IO(struct dio *dio) 897static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
898 struct buffer_head *map_bh)
880{ 899{
881 const unsigned blkbits = dio->blkbits; 900 const unsigned blkbits = sdio->blkbits;
882 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 901 const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
883 struct page *page; 902 struct page *page;
884 unsigned block_in_page; 903 unsigned block_in_page;
885 struct buffer_head *map_bh = &dio->map_bh;
886 int ret = 0; 904 int ret = 0;
887 905
888 /* The I/O can start at any block offset within the first page */ 906 /* The I/O can start at any block offset within the first page */
889 block_in_page = dio->first_block_in_page; 907 block_in_page = sdio->first_block_in_page;
890 908
891 while (dio->block_in_file < dio->final_block_in_request) { 909 while (sdio->block_in_file < sdio->final_block_in_request) {
892 page = dio_get_page(dio); 910 page = dio_get_page(dio, sdio);
893 if (IS_ERR(page)) { 911 if (IS_ERR(page)) {
894 ret = PTR_ERR(page); 912 ret = PTR_ERR(page);
895 goto out; 913 goto out;
@@ -901,14 +919,14 @@ static int do_direct_IO(struct dio *dio)
901 unsigned this_chunk_blocks; /* # of blocks */ 919 unsigned this_chunk_blocks; /* # of blocks */
902 unsigned u; 920 unsigned u;
903 921
904 if (dio->blocks_available == 0) { 922 if (sdio->blocks_available == 0) {
905 /* 923 /*
906 * Need to go and map some more disk 924 * Need to go and map some more disk
907 */ 925 */
908 unsigned long blkmask; 926 unsigned long blkmask;
909 unsigned long dio_remainder; 927 unsigned long dio_remainder;
910 928
911 ret = get_more_blocks(dio); 929 ret = get_more_blocks(dio, sdio, map_bh);
912 if (ret) { 930 if (ret) {
913 page_cache_release(page); 931 page_cache_release(page);
914 goto out; 932 goto out;
@@ -916,18 +934,18 @@ static int do_direct_IO(struct dio *dio)
916 if (!buffer_mapped(map_bh)) 934 if (!buffer_mapped(map_bh))
917 goto do_holes; 935 goto do_holes;
918 936
919 dio->blocks_available = 937 sdio->blocks_available =
920 map_bh->b_size >> dio->blkbits; 938 map_bh->b_size >> sdio->blkbits;
921 dio->next_block_for_io = 939 sdio->next_block_for_io =
922 map_bh->b_blocknr << dio->blkfactor; 940 map_bh->b_blocknr << sdio->blkfactor;
923 if (buffer_new(map_bh)) 941 if (buffer_new(map_bh))
924 clean_blockdev_aliases(dio); 942 clean_blockdev_aliases(dio, map_bh);
925 943
926 if (!dio->blkfactor) 944 if (!sdio->blkfactor)
927 goto do_holes; 945 goto do_holes;
928 946
929 blkmask = (1 << dio->blkfactor) - 1; 947 blkmask = (1 << sdio->blkfactor) - 1;
930 dio_remainder = (dio->block_in_file & blkmask); 948 dio_remainder = (sdio->block_in_file & blkmask);
931 949
932 /* 950 /*
933 * If we are at the start of IO and that IO 951 * If we are at the start of IO and that IO
@@ -941,8 +959,8 @@ static int do_direct_IO(struct dio *dio)
941 * on-disk 959 * on-disk
942 */ 960 */
943 if (!buffer_new(map_bh)) 961 if (!buffer_new(map_bh))
944 dio->next_block_for_io += dio_remainder; 962 sdio->next_block_for_io += dio_remainder;
945 dio->blocks_available -= dio_remainder; 963 sdio->blocks_available -= dio_remainder;
946 } 964 }
947do_holes: 965do_holes:
948 /* Handle holes */ 966 /* Handle holes */
@@ -961,7 +979,7 @@ do_holes:
961 */ 979 */
962 i_size_aligned = ALIGN(i_size_read(dio->inode), 980 i_size_aligned = ALIGN(i_size_read(dio->inode),
963 1 << blkbits); 981 1 << blkbits);
964 if (dio->block_in_file >= 982 if (sdio->block_in_file >=
965 i_size_aligned >> blkbits) { 983 i_size_aligned >> blkbits) {
966 /* We hit eof */ 984 /* We hit eof */
967 page_cache_release(page); 985 page_cache_release(page);
@@ -969,7 +987,7 @@ do_holes:
969 } 987 }
970 zero_user(page, block_in_page << blkbits, 988 zero_user(page, block_in_page << blkbits,
971 1 << blkbits); 989 1 << blkbits);
972 dio->block_in_file++; 990 sdio->block_in_file++;
973 block_in_page++; 991 block_in_page++;
974 goto next_block; 992 goto next_block;
975 } 993 }
@@ -979,38 +997,41 @@ do_holes:
979 * is finer than the underlying fs, go check to see if 997 * is finer than the underlying fs, go check to see if
980 * we must zero out the start of this block. 998 * we must zero out the start of this block.
981 */ 999 */
982 if (unlikely(dio->blkfactor && !dio->start_zero_done)) 1000 if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
983 dio_zero_block(dio, 0); 1001 dio_zero_block(dio, sdio, 0, map_bh);
984 1002
985 /* 1003 /*
986 * Work out, in this_chunk_blocks, how much disk we 1004 * Work out, in this_chunk_blocks, how much disk we
987 * can add to this page 1005 * can add to this page
988 */ 1006 */
989 this_chunk_blocks = dio->blocks_available; 1007 this_chunk_blocks = sdio->blocks_available;
990 u = (PAGE_SIZE - offset_in_page) >> blkbits; 1008 u = (PAGE_SIZE - offset_in_page) >> blkbits;
991 if (this_chunk_blocks > u) 1009 if (this_chunk_blocks > u)
992 this_chunk_blocks = u; 1010 this_chunk_blocks = u;
993 u = dio->final_block_in_request - dio->block_in_file; 1011 u = sdio->final_block_in_request - sdio->block_in_file;
994 if (this_chunk_blocks > u) 1012 if (this_chunk_blocks > u)
995 this_chunk_blocks = u; 1013 this_chunk_blocks = u;
996 this_chunk_bytes = this_chunk_blocks << blkbits; 1014 this_chunk_bytes = this_chunk_blocks << blkbits;
997 BUG_ON(this_chunk_bytes == 0); 1015 BUG_ON(this_chunk_bytes == 0);
998 1016
999 dio->boundary = buffer_boundary(map_bh); 1017 sdio->boundary = buffer_boundary(map_bh);
1000 ret = submit_page_section(dio, page, offset_in_page, 1018 ret = submit_page_section(dio, sdio, page,
1001 this_chunk_bytes, dio->next_block_for_io); 1019 offset_in_page,
1020 this_chunk_bytes,
1021 sdio->next_block_for_io,
1022 map_bh);
1002 if (ret) { 1023 if (ret) {
1003 page_cache_release(page); 1024 page_cache_release(page);
1004 goto out; 1025 goto out;
1005 } 1026 }
1006 dio->next_block_for_io += this_chunk_blocks; 1027 sdio->next_block_for_io += this_chunk_blocks;
1007 1028
1008 dio->block_in_file += this_chunk_blocks; 1029 sdio->block_in_file += this_chunk_blocks;
1009 block_in_page += this_chunk_blocks; 1030 block_in_page += this_chunk_blocks;
1010 dio->blocks_available -= this_chunk_blocks; 1031 sdio->blocks_available -= this_chunk_blocks;
1011next_block: 1032next_block:
1012 BUG_ON(dio->block_in_file > dio->final_block_in_request); 1033 BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
1013 if (dio->block_in_file == dio->final_block_in_request) 1034 if (sdio->block_in_file == sdio->final_block_in_request)
1014 break; 1035 break;
1015 } 1036 }
1016 1037
@@ -1022,135 +1043,10 @@ out:
1022 return ret; 1043 return ret;
1023} 1044}
1024 1045
1025static ssize_t 1046static inline int drop_refcount(struct dio *dio)
1026direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1027 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
1028 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
1029 dio_submit_t submit_io, struct dio *dio)
1030{ 1047{
1031 unsigned long user_addr; 1048 int ret2;
1032 unsigned long flags; 1049 unsigned long flags;
1033 int seg;
1034 ssize_t ret = 0;
1035 ssize_t ret2;
1036 size_t bytes;
1037
1038 dio->inode = inode;
1039 dio->rw = rw;
1040 dio->blkbits = blkbits;
1041 dio->blkfactor = inode->i_blkbits - blkbits;
1042 dio->block_in_file = offset >> blkbits;
1043
1044 dio->get_block = get_block;
1045 dio->end_io = end_io;
1046 dio->submit_io = submit_io;
1047 dio->final_block_in_bio = -1;
1048 dio->next_block_for_io = -1;
1049
1050 dio->iocb = iocb;
1051 dio->i_size = i_size_read(inode);
1052
1053 spin_lock_init(&dio->bio_lock);
1054 dio->refcount = 1;
1055
1056 /*
1057 * In case of non-aligned buffers, we may need 2 more
1058 * pages since we need to zero out first and last block.
1059 */
1060 if (unlikely(dio->blkfactor))
1061 dio->pages_in_io = 2;
1062
1063 for (seg = 0; seg < nr_segs; seg++) {
1064 user_addr = (unsigned long)iov[seg].iov_base;
1065 dio->pages_in_io +=
1066 ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
1067 - user_addr/PAGE_SIZE);
1068 }
1069
1070 for (seg = 0; seg < nr_segs; seg++) {
1071 user_addr = (unsigned long)iov[seg].iov_base;
1072 dio->size += bytes = iov[seg].iov_len;
1073
1074 /* Index into the first page of the first block */
1075 dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
1076 dio->final_block_in_request = dio->block_in_file +
1077 (bytes >> blkbits);
1078 /* Page fetching state */
1079 dio->head = 0;
1080 dio->tail = 0;
1081 dio->curr_page = 0;
1082
1083 dio->total_pages = 0;
1084 if (user_addr & (PAGE_SIZE-1)) {
1085 dio->total_pages++;
1086 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
1087 }
1088 dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1089 dio->curr_user_address = user_addr;
1090
1091 ret = do_direct_IO(dio);
1092
1093 dio->result += iov[seg].iov_len -
1094 ((dio->final_block_in_request - dio->block_in_file) <<
1095 blkbits);
1096
1097 if (ret) {
1098 dio_cleanup(dio);
1099 break;
1100 }
1101 } /* end iovec loop */
1102
1103 if (ret == -ENOTBLK) {
1104 /*
1105 * The remaining part of the request will be
1106 * handled by buffered I/O when we return
1107 */
1108 ret = 0;
1109 }
1110 /*
1111 * There may be some unwritten disk at the end of a part-written
1112 * fs-block-sized block. Go zero that now.
1113 */
1114 dio_zero_block(dio, 1);
1115
1116 if (dio->cur_page) {
1117 ret2 = dio_send_cur_page(dio);
1118 if (ret == 0)
1119 ret = ret2;
1120 page_cache_release(dio->cur_page);
1121 dio->cur_page = NULL;
1122 }
1123 if (dio->bio)
1124 dio_bio_submit(dio);
1125
1126 /*
1127 * It is possible that we return a short IO due to end of file.
1128 * In that case, we need to release all the pages we got hold of.
1129 */
1130 dio_cleanup(dio);
1131
1132 /*
1133 * All block lookups have been performed. For READ requests
1134 * we can let i_mutex go now that it's achieved its purpose
1135 * of protecting us from looking up uninitialized blocks.
1136 */
1137 if (rw == READ && (dio->flags & DIO_LOCKING))
1138 mutex_unlock(&dio->inode->i_mutex);
1139
1140 /*
1141 * The only time we want to leave bios in flight is when a successful
1142 * partial aio read or full aio write has been set up. In that case
1143 * bio completion will call aio_complete. The only time it's safe to
1144 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
1145 * This had *better* be the only place that raises -EIOCBQUEUED.
1146 */
1147 BUG_ON(ret == -EIOCBQUEUED);
1148 if (dio->is_async && ret == 0 && dio->result &&
1149 ((rw & READ) || (dio->result == dio->size)))
1150 ret = -EIOCBQUEUED;
1151
1152 if (ret != -EIOCBQUEUED)
1153 dio_await_completion(dio);
1154 1050
1155 /* 1051 /*
1156 * Sync will always be dropping the final ref and completing the 1052 * Sync will always be dropping the final ref and completing the
@@ -1166,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1166 spin_lock_irqsave(&dio->bio_lock, flags); 1062 spin_lock_irqsave(&dio->bio_lock, flags);
1167 ret2 = --dio->refcount; 1063 ret2 = --dio->refcount;
1168 spin_unlock_irqrestore(&dio->bio_lock, flags); 1064 spin_unlock_irqrestore(&dio->bio_lock, flags);
1169 1065 return ret2;
1170 if (ret2 == 0) {
1171 ret = dio_complete(dio, offset, ret, false);
1172 kfree(dio);
1173 } else
1174 BUG_ON(ret != -EIOCBQUEUED);
1175
1176 return ret;
1177} 1066}
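
drop_refcount() exists so the sync path at the bottom of __blockdev_direct_IO() and the async bio-completion path share one locked decrement. The shape of the pattern, as a user-space sketch with a pthread mutex standing in for bio_lock:

#include <pthread.h>

struct dio_like {
	pthread_mutex_t lock;
	int refcount;
};

/* Returns the remaining reference count; whoever sees 0 owns teardown. */
static int drop_ref(struct dio_like *d)
{
	int remaining;

	pthread_mutex_lock(&d->lock);
	remaining = --d->refcount;
	pthread_mutex_unlock(&d->lock);
	return remaining;
}

/* usage: if (drop_ref(d) == 0) complete_and_free(d); */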
1178 1067
1179/* 1068/*
@@ -1195,6 +1084,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1195 * expected that filesystems provide exclusion between new direct I/O 1084 * expected that filesystems provide exclusion between new direct I/O
1196 * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, 1085 * and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
1197 * but other filesystems need to take care of this on their own. 1086 * but other filesystems need to take care of this on their own.
1087 *
1088 * NOTE: if you pass "sdio" to anything by pointer make sure that function
1089 * is always inlined. Otherwise gcc is unable to split the structure into
1090 * individual fields and will generate much worse code. This is important
1091 * for the whole file.
1198 */ 1092 */
1199ssize_t 1093ssize_t
1200__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1094__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
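
The NOTE above refers to GCC's scalar replacement of aggregates: a local struct can be decomposed into register-resident scalars only while its address never escapes to an out-of-line callee. A toy illustration of the rule (not kernel code):

struct acc { unsigned long a, b; };

/* Inlined callee: &s never escapes, so the compiler is free to keep
 * s.a and s.b in registers across the whole loop. */
static inline void step(struct acc *s, unsigned long v)
{
	s->a += v;
	s->b ^= v;
}

void leak(struct acc *s); /* out-of-line: calling this pins s in memory */

unsigned long run(const unsigned long *v, int n)
{
	struct acc s = { 0, 0 };

	for (int i = 0; i < n; i++)
		step(&s, v[i]);   /* fine: inlined, no escape */
	/* leak(&s); would force every field access through the stack */
	return s.a + s.b;
}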
@@ -1211,6 +1105,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1211 ssize_t retval = -EINVAL; 1105 ssize_t retval = -EINVAL;
1212 loff_t end = offset; 1106 loff_t end = offset;
1213 struct dio *dio; 1107 struct dio *dio;
1108 struct dio_submit sdio = { 0, };
1109 unsigned long user_addr;
1110 size_t bytes;
1111 struct buffer_head map_bh = { 0, };
1214 1112
1215 if (rw & WRITE) 1113 if (rw & WRITE)
1216 rw = WRITE_ODIRECT; 1114 rw = WRITE_ODIRECT;
@@ -1244,7 +1142,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1244 if (rw == READ && end == offset) 1142 if (rw == READ && end == offset)
1245 return 0; 1143 return 0;
1246 1144
1247 dio = kmalloc(sizeof(*dio), GFP_KERNEL); 1145 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
1248 retval = -ENOMEM; 1146 retval = -ENOMEM;
1249 if (!dio) 1147 if (!dio)
1250 goto out; 1148 goto out;
@@ -1268,7 +1166,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1268 end - 1); 1166 end - 1);
1269 if (retval) { 1167 if (retval) {
1270 mutex_unlock(&inode->i_mutex); 1168 mutex_unlock(&inode->i_mutex);
1271 kfree(dio); 1169 kmem_cache_free(dio_cache, dio);
1272 goto out; 1170 goto out;
1273 } 1171 }
1274 } 1172 }
@@ -1288,11 +1186,141 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1288 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && 1186 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
1289 (end > i_size_read(inode))); 1187 (end > i_size_read(inode)));
1290 1188
1291 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1189 retval = 0;
1292 nr_segs, blkbits, get_block, end_io, 1190
1293 submit_io, dio); 1191 dio->inode = inode;
1192 dio->rw = rw;
1193 sdio.blkbits = blkbits;
1194 sdio.blkfactor = inode->i_blkbits - blkbits;
1195 sdio.block_in_file = offset >> blkbits;
1196
1197 sdio.get_block = get_block;
1198 dio->end_io = end_io;
1199 sdio.submit_io = submit_io;
1200 sdio.final_block_in_bio = -1;
1201 sdio.next_block_for_io = -1;
1202
1203 dio->iocb = iocb;
1204 dio->i_size = i_size_read(inode);
1205
1206 spin_lock_init(&dio->bio_lock);
1207 dio->refcount = 1;
1208
1209 /*
1210 * In case of non-aligned buffers, we may need 2 more
1211 * pages since we need to zero out first and last block.
1212 */
1213 if (unlikely(sdio.blkfactor))
1214 sdio.pages_in_io = 2;
1215
1216 for (seg = 0; seg < nr_segs; seg++) {
1217 user_addr = (unsigned long)iov[seg].iov_base;
1218 sdio.pages_in_io +=
1219 ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
1220 PAGE_SIZE - user_addr / PAGE_SIZE);
1221 }
1222
1223 for (seg = 0; seg < nr_segs; seg++) {
1224 user_addr = (unsigned long)iov[seg].iov_base;
1225 sdio.size += bytes = iov[seg].iov_len;
1226
1227 /* Index into the first page of the first block */
1228 sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
1229 sdio.final_block_in_request = sdio.block_in_file +
1230 (bytes >> blkbits);
1231 /* Page fetching state */
1232 sdio.head = 0;
1233 sdio.tail = 0;
1234 sdio.curr_page = 0;
1235
1236 sdio.total_pages = 0;
1237 if (user_addr & (PAGE_SIZE-1)) {
1238 sdio.total_pages++;
1239 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
1240 }
1241 sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1242 sdio.curr_user_address = user_addr;
1243
1244 retval = do_direct_IO(dio, &sdio, &map_bh);
1245
1246 dio->result += iov[seg].iov_len -
1247 ((sdio.final_block_in_request - sdio.block_in_file) <<
1248 blkbits);
1249
1250 if (retval) {
1251 dio_cleanup(dio, &sdio);
1252 break;
1253 }
1254 } /* end iovec loop */
1255
1256 if (retval == -ENOTBLK) {
1257 /*
1258 * The remaining part of the request will be
1259 * handled by buffered I/O when we return
1260 */
1261 retval = 0;
1262 }
1263 /*
1264 * There may be some unwritten disk at the end of a part-written
1265 * fs-block-sized block. Go zero that now.
1266 */
1267 dio_zero_block(dio, &sdio, 1, &map_bh);
1268
1269 if (sdio.cur_page) {
1270 ssize_t ret2;
1271
1272 ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
1273 if (retval == 0)
1274 retval = ret2;
1275 page_cache_release(sdio.cur_page);
1276 sdio.cur_page = NULL;
1277 }
1278 if (sdio.bio)
1279 dio_bio_submit(dio, &sdio);
1280
1281 /*
1282 * It is possible that we return a short IO due to end of file.
1283 * In that case, we need to release all the pages we got hold of.
1284 */
1285 dio_cleanup(dio, &sdio);
1286
1287 /*
1288 * All block lookups have been performed. For READ requests
1289 * we can let i_mutex go now that it's achieved its purpose
1290 * of protecting us from looking up uninitialized blocks.
1291 */
1292 if (rw == READ && (dio->flags & DIO_LOCKING))
1293 mutex_unlock(&dio->inode->i_mutex);
1294
1295 /*
1296 * The only time we want to leave bios in flight is when a successful
1297 * partial aio read or full aio write has been set up. In that case
1298 * bio completion will call aio_complete. The only time it's safe to
1299 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
1300 * This had *better* be the only place that raises -EIOCBQUEUED.
1301 */
1302 BUG_ON(retval == -EIOCBQUEUED);
1303 if (dio->is_async && retval == 0 && dio->result &&
1304 ((rw & READ) || (dio->result == sdio.size)))
1305 retval = -EIOCBQUEUED;
1306
1307 if (retval != -EIOCBQUEUED)
1308 dio_await_completion(dio);
1309
1310 if (drop_refcount(dio) == 0) {
1311 retval = dio_complete(dio, offset, retval, false);
1312 kmem_cache_free(dio_cache, dio);
1313 } else
1314 BUG_ON(retval != -EIOCBQUEUED);
1294 1315
1295out: 1316out:
1296 return retval; 1317 return retval;
1297} 1318}
1298EXPORT_SYMBOL(__blockdev_direct_IO); 1319EXPORT_SYMBOL(__blockdev_direct_IO);
1320
1321static __init int dio_init(void)
1322{
1323 dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
1324 return 0;
1325}
1326module_init(dio_init)
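
The kmalloc()-to-slab conversion above is the usual pattern for objects allocated on every I/O. A minimal kernel-style sketch of the same shape ('foo' is a hypothetical example, not part of this patch):

#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int state;   /* some hot, frequently allocated object */
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	/* KMEM_CACHE() derives name, size and alignment from the type;
	 * SLAB_PANIC panics at boot rather than running without the cache. */
	foo_cache = KMEM_CACHE(foo, SLAB_PANIC);
	return 0;
}
module_init(foo_init);

/* per I/O:
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);
 */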
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 1cd6d9d3e29a..cc16562654de 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,6 @@
1config ECRYPT_FS 1config ECRYPT_FS
2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)" 2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
3 depends on EXPERIMENTAL && KEYS && CRYPTO 3 depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
4 select CRYPTO_ECB 4 select CRYPTO_ECB
5 select CRYPTO_CBC 5 select CRYPTO_CBC
6 select CRYPTO_MD5 6 select CRYPTO_MD5
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 08a2b52bf565..ac1ad48c2376 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1973,7 +1973,7 @@ pki_encrypt_session_key(struct key *auth_tok_key,
1973{ 1973{
1974 struct ecryptfs_msg_ctx *msg_ctx = NULL; 1974 struct ecryptfs_msg_ctx *msg_ctx = NULL;
1975 char *payload = NULL; 1975 char *payload = NULL;
1976 size_t payload_len; 1976 size_t payload_len = 0;
1977 struct ecryptfs_message *msg; 1977 struct ecryptfs_message *msg;
1978 int rc; 1978 int rc;
1979 1979
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9f1bb747d77d..b4a6befb1216 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -175,6 +175,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
175 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 175 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
176 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 176 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
177 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, 177 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
178 ecryptfs_opt_check_dev_ruid,
178 ecryptfs_opt_err }; 179 ecryptfs_opt_err };
179 180
180static const match_table_t tokens = { 181static const match_table_t tokens = {
@@ -191,6 +192,7 @@ static const match_table_t tokens = {
191 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 192 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
192 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, 193 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
193 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, 194 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
195 {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"},
194 {ecryptfs_opt_err, NULL} 196 {ecryptfs_opt_err, NULL}
195}; 197};
196 198
@@ -236,6 +238,7 @@ static void ecryptfs_init_mount_crypt_stat(
236 * ecryptfs_parse_options 238 * ecryptfs_parse_options
237 * @sb: The ecryptfs super block 239 * @sb: The ecryptfs super block
238 * @options: The options passed to the kernel 240 * @options: The options passed to the kernel
241 * @check_ruid: set to 1 if device uid should be checked against the ruid
239 * 242 *
240 * Parse mount options: 243 * Parse mount options:
241 * debug=N - ecryptfs_verbosity level for debug output 244 * debug=N - ecryptfs_verbosity level for debug output
@@ -251,7 +254,8 @@ static void ecryptfs_init_mount_crypt_stat(
251 * 254 *
252 * Returns zero on success; non-zero on error 255 * Returns zero on success; non-zero on error
253 */ 256 */
254static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) 257static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
258 uid_t *check_ruid)
255{ 259{
256 char *p; 260 char *p;
257 int rc = 0; 261 int rc = 0;
@@ -276,6 +280,8 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
276 char *cipher_key_bytes_src; 280 char *cipher_key_bytes_src;
277 char *fn_cipher_key_bytes_src; 281 char *fn_cipher_key_bytes_src;
278 282
283 *check_ruid = 0;
284
279 if (!options) { 285 if (!options) {
280 rc = -EINVAL; 286 rc = -EINVAL;
281 goto out; 287 goto out;
@@ -380,6 +386,9 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
380 mount_crypt_stat->flags |= 386 mount_crypt_stat->flags |=
381 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; 387 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
382 break; 388 break;
389 case ecryptfs_opt_check_dev_ruid:
390 *check_ruid = 1;
391 break;
383 case ecryptfs_opt_err: 392 case ecryptfs_opt_err:
384 default: 393 default:
385 printk(KERN_WARNING 394 printk(KERN_WARNING
@@ -475,6 +484,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
475 const char *err = "Getting sb failed"; 484 const char *err = "Getting sb failed";
476 struct inode *inode; 485 struct inode *inode;
477 struct path path; 486 struct path path;
487 uid_t check_ruid;
478 int rc; 488 int rc;
479 489
480 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); 490 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -483,7 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
483 goto out; 493 goto out;
484 } 494 }
485 495
486 rc = ecryptfs_parse_options(sbi, raw_data); 496 rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid);
487 if (rc) { 497 if (rc) {
488 err = "Error parsing options"; 498 err = "Error parsing options";
489 goto out; 499 goto out;
@@ -521,6 +531,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
521 "known incompatibilities\n"); 531 "known incompatibilities\n");
522 goto out_free; 532 goto out_free;
523 } 533 }
534
535 if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) {
536 rc = -EPERM;
537 printk(KERN_ERR "Mount of device (uid: %d) not owned by "
538 "requested user (uid: %d)\n",
539 path.dentry->d_inode->i_uid, current_uid());
540 goto out_free;
541 }
542
524 ecryptfs_set_superblock_lower(s, path.dentry->d_sb); 543 ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
525 s->s_maxbytes = path.dentry->d_sb->s_maxbytes; 544 s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
526 s->s_blocksize = path.dentry->d_sb->s_blocksize; 545 s->s_blocksize = path.dentry->d_sb->s_blocksize;
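
The new ecryptfs_check_dev_ruid option boils down to one ownership comparison at mount time. A user-space analogue of the check, assuming the lower path is stat(2)-visible:

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

/* Returns 0 if 'path' is owned by the caller's real uid, -1 otherwise. */
static int check_dev_ruid(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0)
		return -1;
	if (st.st_uid != getuid()) {
		fprintf(stderr, "device (uid: %d) not owned by uid %d\n",
			(int)st.st_uid, (int)getuid());
		return -1;
	}
	return 0;
}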
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 85d430963116..3745f7c2b9c2 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -39,15 +39,16 @@
39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, 39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
40 loff_t offset, size_t size) 40 loff_t offset, size_t size)
41{ 41{
42 struct ecryptfs_inode_info *inode_info; 42 struct file *lower_file;
43 mm_segment_t fs_save; 43 mm_segment_t fs_save;
44 ssize_t rc; 44 ssize_t rc;
45 45
46 inode_info = ecryptfs_inode_to_private(ecryptfs_inode); 46 lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
47 BUG_ON(!inode_info->lower_file); 47 if (!lower_file)
48 return -EIO;
48 fs_save = get_fs(); 49 fs_save = get_fs();
49 set_fs(get_ds()); 50 set_fs(get_ds());
50 rc = vfs_write(inode_info->lower_file, data, size, &offset); 51 rc = vfs_write(lower_file, data, size, &offset);
51 set_fs(fs_save); 52 set_fs(fs_save);
52 mark_inode_dirty_sync(ecryptfs_inode); 53 mark_inode_dirty_sync(ecryptfs_inode);
53 return rc; 54 return rc;
@@ -225,15 +226,16 @@ out:
225int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 226int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
226 struct inode *ecryptfs_inode) 227 struct inode *ecryptfs_inode)
227{ 228{
228 struct ecryptfs_inode_info *inode_info = 229 struct file *lower_file;
229 ecryptfs_inode_to_private(ecryptfs_inode);
230 mm_segment_t fs_save; 230 mm_segment_t fs_save;
231 ssize_t rc; 231 ssize_t rc;
232 232
233 BUG_ON(!inode_info->lower_file); 233 lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
234 if (!lower_file)
235 return -EIO;
234 fs_save = get_fs(); 236 fs_save = get_fs();
235 set_fs(get_ds()); 237 set_fs(get_ds());
236 rc = vfs_read(inode_info->lower_file, data, size, &offset); 238 rc = vfs_read(lower_file, data, size, &offset);
237 set_fs(fs_save); 239 set_fs(fs_save);
238 return rc; 240 return rc;
239} 241}
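
Both hunks above trade BUG_ON(!lower_file) for an -EIO return, so a race with teardown degrades to an I/O error instead of an oops. Reduced to a sketch (types stubbed out, not the real ecryptfs signatures):

#include <errno.h>
#include <stddef.h>

struct file;   /* opaque here */

static long write_lower(struct file *lower_file, const char *data, size_t size)
{
	if (!lower_file)
		return -EIO;   /* recoverable; a BUG_ON() here would not be */
	/* ... forward the write to the lower filesystem ... */
	return (long)size;
}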
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index fe047d966dc5..9026fc91fe3b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -700,7 +700,7 @@ static const struct file_operations eventpoll_fops = {
700 .llseek = noop_llseek, 700 .llseek = noop_llseek,
701}; 701};
702 702
703/* Fast test to see if the file is an evenpoll file */ 703/* Fast test to see if the file is an eventpoll file */
704static inline int is_file_epoll(struct file *f) 704static inline int is_file_epoll(struct file *f)
705{ 705{
706 return f->f_op == &eventpoll_fops; 706 return f->f_op == &eventpoll_fops;
diff --git a/fs/exec.c b/fs/exec.c
index da80612a35f4..25dcbe5fc356 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1459,6 +1459,23 @@ static int do_execve_common(const char *filename,
1459 struct files_struct *displaced; 1459 struct files_struct *displaced;
1460 bool clear_in_exec; 1460 bool clear_in_exec;
1461 int retval; 1461 int retval;
1462 const struct cred *cred = current_cred();
1463
1464 /*
1465 * We move the actual failure in case of RLIMIT_NPROC excess from
1466 * set*uid() to execve() because too many poorly written programs
1467 * don't check setuid() return code. Here we additionally recheck
1468 * whether NPROC limit is still exceeded.
1469 */
1470 if ((current->flags & PF_NPROC_EXCEEDED) &&
1471 atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
1472 retval = -EAGAIN;
1473 goto out_ret;
1474 }
1475
1476 /* We're below the limit (still or again), so we don't want to make
1477 * further execve() calls fail. */
1478 current->flags &= ~PF_NPROC_EXCEEDED;
1462 1479
1463 retval = unshare_files(&displaced); 1480 retval = unshare_files(&displaced);
1464 if (retval) 1481 if (retval)
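
The execve() hunk is deferred enforcement: set*uid() merely flags the RLIMIT_NPROC excess, and the actual failure is delivered here, where callers reliably check return codes. The control flow, reduced to a sketch (the two helpers are assumptions, not real kernel API):

#include <errno.h>
#include <stdbool.h>

static bool nproc_exceeded;        /* set on the set*uid() path */

long nprocs_of_user(void);         /* assumed helper */
long nproc_limit(void);            /* assumed helper */

/* Called at the top of execve(): fail only if the limit is *still*
 * exceeded; otherwise clear the flag so later execs are unaffected. */
static int recheck_nproc(void)
{
	if (nproc_exceeded && nprocs_of_user() > nproc_limit())
		return -EAGAIN;
	nproc_exceeded = false;
	return 0;
}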
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c44..352ba149d23e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
13# 13#
14 14
15# ore module library 15# ore module library
16obj-$(CONFIG_ORE) += ore.o 16libore-y := ore.o ore_raid.o
17obj-$(CONFIG_ORE) += libore.o
17 18
18exofs-y := inode.o file.o symlink.o namei.o dir.o super.o 19exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
19obj-$(CONFIG_EXOFS_FS) += exofs.o 20obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 70bae4149291..fa9a286c8771 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,10 +1,17 @@
1# Note: ORE needs to "select ASYNC_XOR". So as not to force multiple selects
2# for every ORE user we do it like this. Any user should add itself here
3# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
4# selected here, and we default to "ON". So in effect it is like being
5# selected by any of the users.
1config ORE 6config ORE
2 tristate 7 tristate
8 depends on EXOFS_FS
9 select ASYNC_XOR
10 default SCSI_OSD_ULD
3 11
4config EXOFS_FS 12config EXOFS_FS
5 tristate "exofs: OSD based file system support" 13 tristate "exofs: OSD based file system support"
6 depends on SCSI_OSD_ULD 14 depends on SCSI_OSD_ULD
7 select ORE
8 help 15 help
9 EXOFS is a file system that uses an OSD storage device, 16 EXOFS is a file system that uses an OSD storage device,
10 as its backing storage. 17 as its backing storage.
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index f4e442ec7445..51f4b4c40f09 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,10 @@
53/* u64 has problems with printk this will cast it to unsigned long long */ 53/* u64 has problems with printk this will cast it to unsigned long long */
54#define _LLU(x) (unsigned long long)(x) 54#define _LLU(x) (unsigned long long)(x)
55 55
56struct exofs_dev {
57 struct ore_dev ored;
58 unsigned did;
59};
56/* 60/*
57 * our extension to the in-memory superblock 61 * our extension to the in-memory superblock
58 */ 62 */
@@ -66,13 +70,9 @@ struct exofs_sb_info {
66 u32 s_next_generation; /* next gen # to use */ 70 u32 s_next_generation; /* next gen # to use */
67 atomic_t s_curr_pending; /* number of pending commands */ 71 atomic_t s_curr_pending; /* number of pending commands */
68 72
69 struct pnfs_osd_data_map data_map; /* Default raid to use
70 * FIXME: Needed ?
71 */
72 struct ore_layout layout; /* Default files layout */ 73 struct ore_layout layout; /* Default files layout */
73 struct ore_comp one_comp; /* id & cred of partition id=0*/ 74 struct ore_comp one_comp; /* id & cred of partition id=0*/
74 struct ore_components comps; /* comps for the partition */ 75 struct ore_components oc; /* comps for the partition */
75 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
76}; 76};
77 77
78/* 78/*
@@ -86,7 +86,7 @@ struct exofs_i_info {
86 uint32_t i_dir_start_lookup; /* which page to start lookup */ 86 uint32_t i_dir_start_lookup; /* which page to start lookup */
87 uint64_t i_commit_size; /* the object's written length */ 87 uint64_t i_commit_size; /* the object's written length */
88 struct ore_comp one_comp; /* same component for all devices */ 88 struct ore_comp one_comp; /* same component for all devices */
89 struct ore_components comps; /* inode view of the device table */ 89 struct ore_components oc; /* inode view of the device table */
90}; 90};
91 91
92static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) 92static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations;
207 * bigger and that the device table repeats twice. 207 * bigger and that the device table repeats twice.
208 * See: exofs_read_lookup_dev_table() 208 * See: exofs_read_lookup_dev_table()
209 */ 209 */
210static inline void exofs_init_comps(struct ore_components *comps, 210static inline void exofs_init_comps(struct ore_components *oc,
211 struct ore_comp *one_comp, 211 struct ore_comp *one_comp,
212 struct exofs_sb_info *sbi, osd_id oid) 212 struct exofs_sb_info *sbi, osd_id oid)
213{ 213{
@@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps,
217 one_comp->obj.id = oid; 217 one_comp->obj.id = oid;
218 exofs_make_credential(one_comp->cred, &one_comp->obj); 218 exofs_make_credential(one_comp->cred, &one_comp->obj);
219 219
220 comps->numdevs = sbi->comps.numdevs; 220 oc->first_dev = 0;
221 comps->single_comp = EC_SINGLE_COMP; 221 oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
222 comps->comps = one_comp; 222 sbi->layout.group_count;
223 oc->single_comp = EC_SINGLE_COMP;
224 oc->comps = one_comp;
223 225
224 /* Round robin device view of the table */ 226 /* Round robin device view of the table */
225 first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; 227 first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
226 comps->ods = sbi->comps.ods + first_dev; 228 oc->ods = &sbi->oc.ods[first_dev];
227} 229}
228 230
229#endif 231#endif
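
The round-robin view at the end of exofs_init_comps() spreads inodes across the device table. A runnable sketch with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned numdevs = 6, mirrors_p1 = 2;   /* example table */
	unsigned long long oid;

	for (oid = 0x10000; oid < 0x10006; oid++) {
		unsigned dev_mod = (unsigned)(oid % numdevs);
		unsigned first_dev = (dev_mod * mirrors_p1) % numdevs;

		printf("object 0x%llx starts at device %u\n", oid, first_dev);
	}
	return 0;
}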
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f39a38fc2349..3e5f3a6be90a 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,11 +37,7 @@
37 37
38#define EXOFS_DBGMSG2(M...) do {} while (0) 38#define EXOFS_DBGMSG2(M...) do {} while (0)
39 39
40enum { BIO_MAX_PAGES_KMALLOC = 40enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
41 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
42 MAX_PAGES_KMALLOC =
43 PAGE_SIZE / sizeof(struct page *),
44};
45 41
46unsigned exofs_max_io_pages(struct ore_layout *layout, 42unsigned exofs_max_io_pages(struct ore_layout *layout,
47 unsigned expected_pages) 43 unsigned expected_pages)
@@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout,
49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); 45 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
50 46
51 /* TODO: easily support bio chaining */ 47 /* TODO: easily support bio chaining */
52 pages = min_t(unsigned, pages, 48 pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
53 layout->group_width * BIO_MAX_PAGES_KMALLOC);
54 return pages; 49 return pages;
55} 50}
56 51
@@ -68,6 +63,7 @@ struct page_collect {
68 bool read_4_write; /* This means two things: that the read is sync 63 bool read_4_write; /* This means two things: that the read is sync
69 * And the pages should not be unlocked. 64 * And the pages should not be unlocked.
70 */ 65 */
66 struct page *that_locked_page;
71}; 67};
72 68
73static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 69static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
86 pcol->length = 0; 82 pcol->length = 0;
87 pcol->pg_first = -1; 83 pcol->pg_first = -1;
88 pcol->read_4_write = false; 84 pcol->read_4_write = false;
85 pcol->that_locked_page = NULL;
89} 86}
90 87
91static void _pcol_reset(struct page_collect *pcol) 88static void _pcol_reset(struct page_collect *pcol)
@@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol)
98 pcol->length = 0; 95 pcol->length = 0;
99 pcol->pg_first = -1; 96 pcol->pg_first = -1;
100 pcol->ios = NULL; 97 pcol->ios = NULL;
98 pcol->that_locked_page = NULL;
101 99
102 /* this is probably the end of the loop but in writes 100 /* this is probably the end of the loop but in writes
103 * it might not end here. don't be left with nothing 101 * it might not end here. don't be left with nothing
@@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page,
149 return 0; 147 return 0;
150} 148}
151 149
150enum {PAGE_WAS_NOT_IN_IO = 17};
152static int update_read_page(struct page *page, int ret) 151static int update_read_page(struct page *page, int ret)
153{ 152{
154 if (ret == 0) { 153 switch (ret) {
154 case 0:
155 /* Everything is OK */ 155 /* Everything is OK */
156 SetPageUptodate(page); 156 SetPageUptodate(page);
157 if (PageError(page)) 157 if (PageError(page))
158 ClearPageError(page); 158 ClearPageError(page);
159 } else if (ret == -EFAULT) { 159 break;
160 case -EFAULT:
160 /* In this case we were trying to read something that wasn't on 161 /* In this case we were trying to read something that wasn't on
161 * disk yet - return a page full of zeroes. This should be OK, 162 * disk yet - return a page full of zeroes. This should be OK,
162 * because the object should be empty (if there was a write 163 * because the object should be empty (if there was a write
@@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret)
167 SetPageUptodate(page); 168 SetPageUptodate(page);
168 if (PageError(page)) 169 if (PageError(page))
169 ClearPageError(page); 170 ClearPageError(page);
170 ret = 0; /* recovered error */
171 EXOFS_DBGMSG("recovered read error\n"); 171 EXOFS_DBGMSG("recovered read error\n");
172 } else /* Error */ 172 /* fall through */
173 case PAGE_WAS_NOT_IN_IO:
174 ret = 0; /* recovered error */
175 break;
176 default:
173 SetPageError(page); 177 SetPageError(page);
174 178 }
175 return ret; 179 return ret;
176} 180}
177 181
178static void update_write_page(struct page *page, int ret) 182static void update_write_page(struct page *page, int ret)
179{ 183{
184 if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
185 return; /* don't pass start don't collect $200 */
186
180 if (ret) { 187 if (ret) {
181 mapping_set_error(page->mapping, ret); 188 mapping_set_error(page->mapping, ret);
182 SetPageError(page); 189 SetPageError(page);
@@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret)
190static int __readpages_done(struct page_collect *pcol) 197static int __readpages_done(struct page_collect *pcol)
191{ 198{
192 int i; 199 int i;
193 u64 resid;
194 u64 good_bytes; 200 u64 good_bytes;
195 u64 length = 0; 201 u64 length = 0;
196 int ret = ore_check_io(pcol->ios, &resid); 202 int ret = ore_check_io(pcol->ios, NULL);
197 203
198 if (likely(!ret)) 204 if (likely(!ret)) {
199 good_bytes = pcol->length; 205 good_bytes = pcol->length;
200 else 206 ret = PAGE_WAS_NOT_IN_IO;
201 good_bytes = pcol->length - resid; 207 } else {
208 good_bytes = 0;
209 }
202 210
203 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" 211 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
204 " length=0x%lx nr_pages=%u\n", 212 " length=0x%lx nr_pages=%u\n",
@@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
259 } 267 }
260} 268}
261 269
270static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
271 struct page_collect *pcol_src, struct page_collect *pcol)
272{
273 /* length was wrong or offset was not page aligned */
274 BUG_ON(pcol_src->nr_pages < ios->nr_pages);
275
276 if (pcol_src->nr_pages > ios->nr_pages) {
277 struct page **src_page;
278 unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
279 unsigned long len_less = pcol_src->length - ios->length;
280 unsigned i;
281 int ret;
282
283 /* This IO was trimmed */
284 pcol_src->nr_pages = ios->nr_pages;
285 pcol_src->length = ios->length;
286
287 /* Left over pages are passed to the next io */
288 pcol->expected_pages += pages_less;
289 pcol->nr_pages = pages_less;
290 pcol->length = len_less;
291 src_page = pcol_src->pages + pcol_src->nr_pages;
292 pcol->pg_first = (*src_page)->index;
293
294 ret = pcol_try_alloc(pcol);
295 if (unlikely(ret))
296 return ret;
297
298 for (i = 0; i < pages_less; ++i)
299 pcol->pages[i] = *src_page++;
300
301 EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
302 "pages_less=0x%x expected_pages=0x%x "
303 "next_offset=0x%llx next_len=0x%lx\n",
304 pcol_src->nr_pages, pages_less, pcol->expected_pages,
305 pcol->pg_first * PAGE_SIZE, pcol->length);
306 }
307 return 0;
308}
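
_maybe_not_all_in_one_io() just hands the trimmed tail to the next collect. The array surgery, as a runnable sketch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	int pages[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };   /* submitted pages */
	unsigned submitted_nr = 8, io_nr = 5;        /* io was trimmed to 5 */
	unsigned pages_less = submitted_nr - io_nr;
	int next_io[8];

	/* the first io keeps pages[0..io_nr); the leftovers seed the next */
	memcpy(next_io, pages + io_nr, pages_less * sizeof(*pages));
	printf("first io: %u pages, next io carries %u pages starting at %d\n",
	       io_nr, pages_less, next_io[0]);
	return 0;
}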
309
262static int read_exec(struct page_collect *pcol) 310static int read_exec(struct page_collect *pcol)
263{ 311{
264 struct exofs_i_info *oi = exofs_i(pcol->inode); 312 struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol)
270 return 0; 318 return 0;
271 319
272 if (!pcol->ios) { 320 if (!pcol->ios) {
273 int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, 321 int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
274 pcol->pg_first << PAGE_CACHE_SHIFT, 322 pcol->pg_first << PAGE_CACHE_SHIFT,
275 pcol->length, &pcol->ios); 323 pcol->length, &pcol->ios);
276 324
@@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol)
280 328
281 ios = pcol->ios; 329 ios = pcol->ios;
282 ios->pages = pcol->pages; 330 ios->pages = pcol->pages;
283 ios->nr_pages = pcol->nr_pages;
284 331
285 if (pcol->read_4_write) { 332 if (pcol->read_4_write) {
286 ore_read(pcol->ios); 333 ore_read(pcol->ios);
@@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol)
296 *pcol_copy = *pcol; 343 *pcol_copy = *pcol;
297 ios->done = readpages_done; 344 ios->done = readpages_done;
298 ios->private = pcol_copy; 345 ios->private = pcol_copy;
346
347 /* pages ownership was passed to pcol_copy */
348 _pcol_reset(pcol);
349
350 ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
351 if (unlikely(ret))
352 goto err;
353
354 EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
355 pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
356
299 ret = ore_read(ios); 357 ret = ore_read(ios);
300 if (unlikely(ret)) 358 if (unlikely(ret))
301 goto err; 359 goto err;
302 360
303 atomic_inc(&pcol->sbi->s_curr_pending); 361 atomic_inc(&pcol->sbi->s_curr_pending);
304 362
305 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
306 oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
307
308 /* pages ownership was passed to pcol_copy */
309 _pcol_reset(pcol);
310 return 0; 363 return 0;
311 364
312err: 365err:
@@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page)
341 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, 394 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
342 page->index); 395 page->index);
343 396
397 pcol->that_locked_page = page;
398
344 if (page->index < end_index) 399 if (page->index < end_index)
345 len = PAGE_CACHE_SIZE; 400 len = PAGE_CACHE_SIZE;
346 else if (page->index == end_index) 401 else if (page->index == end_index)
@@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
429 return ret; 484 return ret;
430 } 485 }
431 486
487 ret = read_exec(&pcol);
488 if (unlikely(ret))
489 return ret;
490
432 return read_exec(&pcol); 491 return read_exec(&pcol);
433} 492}
434 493
@@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p)
462{ 521{
463 struct page_collect *pcol = p; 522 struct page_collect *pcol = p;
464 int i; 523 int i;
465 u64 resid;
466 u64 good_bytes; 524 u64 good_bytes;
467 u64 length = 0; 525 u64 length = 0;
468 int ret = ore_check_io(ios, &resid); 526 int ret = ore_check_io(ios, NULL);
469 527
470 atomic_dec(&pcol->sbi->s_curr_pending); 528 atomic_dec(&pcol->sbi->s_curr_pending);
471 529
472 if (likely(!ret)) 530 if (likely(!ret)) {
473 good_bytes = pcol->length; 531 good_bytes = pcol->length;
474 else 532 ret = PAGE_WAS_NOT_IN_IO;
475 good_bytes = pcol->length - resid; 533 } else {
534 good_bytes = 0;
535 }
476 536
477 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" 537 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
478 " length=0x%lx nr_pages=%u\n", 538 " length=0x%lx nr_pages=%u\n",
@@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p)
505 EXOFS_DBGMSG2("writepages_done END\n"); 565 EXOFS_DBGMSG2("writepages_done END\n");
506} 566}
507 567
568static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
569{
570 struct page_collect *pcol = priv;
571 pgoff_t index = offset / PAGE_SIZE;
572
573 if (!pcol->that_locked_page ||
574 (pcol->that_locked_page->index != index)) {
575 struct page *page = find_get_page(pcol->inode->i_mapping, index);
576
577 if (!page) {
578 page = find_or_create_page(pcol->inode->i_mapping,
579 index, GFP_NOFS);
580 if (unlikely(!page)) {
581 EXOFS_DBGMSG("grab_cache_page Failed "
582 "index=0x%llx\n", _LLU(index));
583 return NULL;
584 }
585 unlock_page(page);
586 }
587 if (PageDirty(page) || PageWriteback(page))
588 *uptodate = true;
589 else
590 *uptodate = PageUptodate(page);
591 EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
592 return page;
593 } else {
594 EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
595 pcol->that_locked_page->index);
596 *uptodate = true;
597 return pcol->that_locked_page;
598 }
599}
600
601static void __r4w_put_page(void *priv, struct page *page)
602{
603 struct page_collect *pcol = priv;
604
605 if (pcol->that_locked_page != page) {
606 EXOFS_DBGMSG("index=0x%lx\n", page->index);
607 page_cache_release(page);
608 return;
609 }
610 EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
611}
612
613static const struct _ore_r4w_op _r4w_op = {
614 .get_page = &__r4w_get_page,
615 .put_page = &__r4w_put_page,
616};
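
The _r4w_op table is a classic callback pair: the RAID layer asks the page owner for any page it must read before computing parity, and returns it when done. The shape of the interface, sketched with placeholder types (not the real ore API):

#include <stdbool.h>

struct page;   /* placeholder */

struct r4w_ops {
	/* hand back the page at 'offset', telling the caller whether its
	 * contents are already valid (then no read is needed) */
	struct page *(*get_page)(void *priv, unsigned long long offset,
				 bool *uptodate);
	/* release a page obtained from get_page() */
	void (*put_page)(void *priv, struct page *page);
};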
617
508static int write_exec(struct page_collect *pcol) 618static int write_exec(struct page_collect *pcol)
509{ 619{
510 struct exofs_i_info *oi = exofs_i(pcol->inode); 620 struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol)
516 return 0; 626 return 0;
517 627
518 BUG_ON(pcol->ios); 628 BUG_ON(pcol->ios);
519 ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, 629 ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
520 pcol->pg_first << PAGE_CACHE_SHIFT, 630 pcol->pg_first << PAGE_CACHE_SHIFT,
521 pcol->length, &pcol->ios); 631 pcol->length, &pcol->ios);
522
523 if (unlikely(ret)) 632 if (unlikely(ret))
524 goto err; 633 goto err;
525 634
@@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol)
534 643
535 ios = pcol->ios; 644 ios = pcol->ios;
536 ios->pages = pcol_copy->pages; 645 ios->pages = pcol_copy->pages;
537 ios->nr_pages = pcol_copy->nr_pages;
538 ios->done = writepages_done; 646 ios->done = writepages_done;
647 ios->r4w = &_r4w_op;
539 ios->private = pcol_copy; 648 ios->private = pcol_copy;
540 649
650 /* pages ownership was passed to pcol_copy */
651 _pcol_reset(pcol);
652
653 ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
654 if (unlikely(ret))
655 goto err;
656
657 EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
658 pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
659
541 ret = ore_write(ios); 660 ret = ore_write(ios);
542 if (unlikely(ret)) { 661 if (unlikely(ret)) {
543 EXOFS_ERR("write_exec: ore_write() Failed\n"); 662 EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol)
545 } 664 }
546 665
547 atomic_inc(&pcol->sbi->s_curr_pending); 666 atomic_inc(&pcol->sbi->s_curr_pending);
548 EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
549 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
550 pcol->length);
551 /* pages ownership was passed to pcol_copy */
552 _pcol_reset(pcol);
553 return 0; 667 return 0;
554 668
555err: 669err:
@@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping,
689 _pcol_init(&pcol, expected_pages, mapping->host); 803 _pcol_init(&pcol, expected_pages, mapping->host);
690 804
691 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); 805 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
692 if (ret) { 806 if (unlikely(ret)) {
693 EXOFS_ERR("write_cache_pages => %d\n", ret); 807 EXOFS_ERR("write_cache_pages => %d\n", ret);
694 return ret; 808 return ret;
695 } 809 }
696 810
697 return write_exec(&pcol); 811 ret = write_exec(&pcol);
812 if (unlikely(ret))
813 return ret;
814
815 if (wbc->sync_mode == WB_SYNC_ALL) {
816 return write_exec(&pcol); /* pump the last remainder */
817 } else if (pcol.nr_pages) {
818 /* not SYNC: let the remainder join the next writeout */
819 unsigned i;
820
821 for (i = 0; i < pcol.nr_pages; i++) {
822 struct page *page = pcol.pages[i];
823
824 end_page_writeback(page);
825 set_page_dirty(page);
826 unlock_page(page);
827 }
828 }
829 return 0;
698} 830}
699 831
832/*
700static int exofs_writepage(struct page *page, struct writeback_control *wbc) 833static int exofs_writepage(struct page *page, struct writeback_control *wbc)
701{ 834{
702 struct page_collect pcol; 835 struct page_collect pcol;
@@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
712 845
713 return write_exec(&pcol); 846 return write_exec(&pcol);
714} 847}
715 848*/
716/* i_mutex held using inode->i_size directly */ 849/* i_mutex held using inode->i_size directly */
717static void _write_failed(struct inode *inode, loff_t to) 850static void _write_failed(struct inode *inode, loff_t to)
718{ 851{
@@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset)
818const struct address_space_operations exofs_aops = { 951const struct address_space_operations exofs_aops = {
819 .readpage = exofs_readpage, 952 .readpage = exofs_readpage,
820 .readpages = exofs_readpages, 953 .readpages = exofs_readpages,
821 .writepage = exofs_writepage, 954 .writepage = NULL,
822 .writepages = exofs_writepages, 955 .writepages = exofs_writepages,
823 .write_begin = exofs_write_begin_export, 956 .write_begin = exofs_write_begin_export,
824 .write_end = exofs_write_end, 957 .write_end = exofs_write_end,
@@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
860 993
861 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 994 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
862 995
863 ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); 996 ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
864 if (likely(!ret)) 997 if (likely(!ret))
865 truncate_setsize(inode, newsize); 998 truncate_setsize(inode, newsize);
866 999
@@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
927 struct exofs_on_disk_inode_layout *layout; 1060 struct exofs_on_disk_inode_layout *layout;
928 int ret; 1061 int ret;
929 1062
930 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); 1063 ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
931 if (unlikely(ret)) { 1064 if (unlikely(ret)) {
932 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); 1065 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
933 return ret; 1066 return ret;
934 } 1067 }
935 1068
936 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); 1069 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
937 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); 1070 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
938 1071
939 ios->in_attr = attrs; 1072 ios->in_attr = attrs;
940 ios->in_attr_len = ARRAY_SIZE(attrs); 1073 ios->in_attr_len = ARRAY_SIZE(attrs);
@@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1018 return inode; 1151 return inode;
1019 oi = exofs_i(inode); 1152 oi = exofs_i(inode);
1020 __oi_init(oi); 1153 __oi_init(oi);
1021 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, 1154 exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
1022 exofs_oi_objno(oi)); 1155 exofs_oi_objno(oi));
1023 1156
1024 /* read the inode from the osd */ 1157 /* read the inode from the osd */
@@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1172 spin_unlock(&sbi->s_next_gen_lock); 1305 spin_unlock(&sbi->s_next_gen_lock);
1173 insert_inode_hash(inode); 1306 insert_inode_hash(inode);
1174 1307
1175 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, 1308 exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
1176 exofs_oi_objno(oi)); 1309 exofs_oi_objno(oi));
1177 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ 1310 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1178 1311
1179 mark_inode_dirty(inode); 1312 mark_inode_dirty(inode);
1180 1313
1181 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); 1314 ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
1182 if (unlikely(ret)) { 1315 if (unlikely(ret)) {
1183 EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); 1316 EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
1184 return ERR_PTR(ret); 1317 return ERR_PTR(ret);
@@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1267 } else 1400 } else
1268 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1401 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1269 1402
1270 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); 1403 ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
1271 if (unlikely(ret)) { 1404 if (unlikely(ret)) {
1272 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); 1405 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
1273 goto free_args; 1406 goto free_args;
@@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode)
1350 /* ignore the error, attempt a remove anyway */ 1483 /* ignore the error, attempt a remove anyway */
1351 1484
1352 /* Now Remove the OSD objects */ 1485 /* Now Remove the OSD objects */
1353 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); 1486 ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
1354 if (unlikely(ret)) { 1487 if (unlikely(ret)) {
1355 EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); 1488 EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
1356 return; 1489 return;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 25305af88198..fcfa86ae6faf 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -24,76 +24,287 @@
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <asm/div64.h> 26#include <asm/div64.h>
27#include <linux/lcm.h>
27 28
28#include <scsi/osd_ore.h> 29#include "ore_raid.h"
29 30
30#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) 31MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
32MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
33MODULE_LICENSE("GPL");
34
35/* ore_verify_layout does a couple of things:
36 * 1. Given a minimum number of needed parameters, fixes up the rest of the
37 * members to be operational for the ore. The needed parameters are those
38 * that are defined by the pnfs-objects layout STD.
39 * 2. Check to see if the current ore code actually supports these parameters;
40 * for example, stripe_unit must be a multiple of the system PAGE_SIZE,
41 * etc.
42 * 3. Cache some heavily used calculations that will be needed by users.
43 */
44
45enum { BIO_MAX_PAGES_KMALLOC =
46 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
31 47
32#ifdef CONFIG_EXOFS_DEBUG 48int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
33#define ORE_DBGMSG(fmt, a...) \ 49{
34 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) 50 u64 stripe_length;
35#else 51
36#define ORE_DBGMSG(fmt, a...) \ 52 switch (layout->raid_algorithm) {
37 do { if (0) printk(fmt, ##a); } while (0) 53 case PNFS_OSD_RAID_0:
38#endif 54 layout->parity = 0;
55 break;
56 case PNFS_OSD_RAID_5:
57 layout->parity = 1;
58 break;
59 case PNFS_OSD_RAID_PQ:
60 case PNFS_OSD_RAID_4:
61 default:
62 ORE_ERR("Only RAID_0/5 for now\n");
63 return -EINVAL;
64 }
65 if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
66 ORE_ERR("Stripe Unit(0x%llx)"
67 " must be Multples of PAGE_SIZE(0x%lx)\n",
68 _LLU(layout->stripe_unit), PAGE_SIZE);
69 return -EINVAL;
70 }
71 if (layout->group_width) {
72 if (!layout->group_depth) {
73 ORE_ERR("group_depth == 0 && group_width != 0\n");
74 return -EINVAL;
75 }
76 if (total_comps < (layout->group_width * layout->mirrors_p1)) {
77 ORE_ERR("Data Map wrong, "
78 "numdevs=%d < group_width=%d * mirrors=%d\n",
79 total_comps, layout->group_width,
80 layout->mirrors_p1);
81 return -EINVAL;
82 }
83 layout->group_count = total_comps / layout->mirrors_p1 /
84 layout->group_width;
85 } else {
86 if (layout->group_depth) {
87 printk(KERN_NOTICE "Warning: group_depth ignored "
88 "group_width == 0 && group_depth == %lld\n",
89 _LLU(layout->group_depth));
90 }
91 layout->group_width = total_comps / layout->mirrors_p1;
92 layout->group_depth = -1;
93 layout->group_count = 1;
94 }
39 95
40/* u64 has problems with printk this will cast it to unsigned long long */ 96 stripe_length = (u64)layout->group_width * layout->stripe_unit;
41#define _LLU(x) (unsigned long long)(x) 97 if (stripe_length >= (1ULL << 32)) {
98 ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
99 _LLU(stripe_length));
100 return -EINVAL;
101 }
42 102
43#define ORE_DBGMSG2(M...) do {} while (0) 103 layout->max_io_length =
44/* #define ORE_DBGMSG2 ORE_DBGMSG */ 104 (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
105 layout->group_width;
106 if (layout->parity) {
107 unsigned stripe_length =
108 (layout->group_width - layout->parity) *
109 layout->stripe_unit;
45 110
46MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); 111 layout->max_io_length /= stripe_length;
47MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); 112 layout->max_io_length *= stripe_length;
48MODULE_LICENSE("GPL"); 113 }
114 return 0;
115}
116EXPORT_SYMBOL(ore_verify_layout);
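For intuition, here is a rough standalone sketch (plain userspace C, not part of the patch; the sample layout numbers and the BIO_MAX stand-in are assumptions) of the max_io_length rounding that ore_verify_layout performs for parity layouts:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed sample layout: RAID5, group_width 5 = 4 data + 1 parity */
		unsigned long long stripe_unit = 64 << 10;	/* multiple of PAGE_SIZE */
		unsigned group_width = 5, parity = 1;
		/* stand-in for BIO_MAX_PAGES_KMALLOC * PAGE_SIZE */
		unsigned long long bio_max_bytes = 256ULL * 4096;

		unsigned long long max_io =
			(bio_max_bytes - stripe_unit) * group_width;

		if (parity) {
			/* Round down to whole data stripes so a RAID IO can
			 * always end on a stripe boundary.
			 */
			unsigned long long stripe_length =
				(group_width - parity) * stripe_unit;

			max_io = max_io / stripe_length * stripe_length;
		}
		printf("max_io_length = %llu\n", max_io);
		return 0;
	}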
49 117
50static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) 118static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
51{ 119{
52 return ios->comps->comps[index & ios->comps->single_comp].cred; 120 return ios->oc->comps[index & ios->oc->single_comp].cred;
53} 121}
54 122
55static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) 123static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
56{ 124{
57 return &ios->comps->comps[index & ios->comps->single_comp].obj; 125 return &ios->oc->comps[index & ios->oc->single_comp].obj;
58} 126}
59 127
60static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) 128static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
61{ 129{
62 return ios->comps->ods[index]; 130 ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
131 ios->oc->first_dev, ios->oc->numdevs, index,
132 ios->oc->ods);
133
134 return ore_comp_dev(ios->oc, index);
63} 135}
64 136
65int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, 137int _ore_get_io_state(struct ore_layout *layout,
138 struct ore_components *oc, unsigned numdevs,
139 unsigned sgs_per_dev, unsigned num_par_pages,
140 struct ore_io_state **pios)
141{
142 struct ore_io_state *ios;
143 struct page **pages;
144 struct osd_sg_entry *sgilist;
145 struct __alloc_all_io_state {
146 struct ore_io_state ios;
147 struct ore_per_dev_state per_dev[numdevs];
148 union {
149 struct osd_sg_entry sglist[sgs_per_dev * numdevs];
150 struct page *pages[num_par_pages];
151 };
152 } *_aios;
153
154 if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
155 _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
156 if (unlikely(!_aios)) {
157 ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
158 sizeof(*_aios));
159 *pios = NULL;
160 return -ENOMEM;
161 }
162 pages = num_par_pages ? _aios->pages : NULL;
163 sgilist = sgs_per_dev ? _aios->sglist : NULL;
164 ios = &_aios->ios;
165 } else {
166 struct __alloc_small_io_state {
167 struct ore_io_state ios;
168 struct ore_per_dev_state per_dev[numdevs];
169 } *_aio_small;
170 union __extra_part {
171 struct osd_sg_entry sglist[sgs_per_dev * numdevs];
172 struct page *pages[num_par_pages];
173 } *extra_part;
174
175 _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
176 if (unlikely(!_aio_small)) {
177 ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
178 sizeof(*_aio_small));
179 *pios = NULL;
180 return -ENOMEM;
181 }
182 extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
183 if (unlikely(!extra_part)) {
184 ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
185 sizeof(*extra_part));
186 kfree(_aio_small);
187 *pios = NULL;
188 return -ENOMEM;
189 }
190
191 pages = num_par_pages ? extra_part->pages : NULL;
192 sgilist = sgs_per_dev ? extra_part->sglist : NULL;
193 /* In this case the per_dev[0].sgilist holds the pointer to
194 * be freed
195 */
196 ios = &_aio_small->ios;
197 ios->extra_part_alloc = true;
198 }
199
200 if (pages) {
201 ios->parity_pages = pages;
202 ios->max_par_pages = num_par_pages;
203 }
204 if (sgilist) {
205 unsigned d;
206
207 for (d = 0; d < numdevs; ++d) {
208 ios->per_dev[d].sglist = sgilist;
209 sgilist += sgs_per_dev;
210 }
211 ios->sgs_per_dev = sgs_per_dev;
212 }
213
214 ios->layout = layout;
215 ios->oc = oc;
216 *pios = ios;
217 return 0;
218}
219
220/* Allocate an io_state for only a single group of devices
221 *
 222 * If a user needs to call ore_read/write() this version must be used
 223 * because it allocates extra stuff for striping and raid.
 224 * The ore might decide to IO less than @length bytes due to alignments
 225 * and constraints, as follows:
 226 * - The IO cannot cross a group boundary.
 227 * - In raid5/6 the end of the IO must align at the end of a stripe, e.g.
 228 * (@offset + @length) % stripe_size == 0, or the complete range is within
 229 * a single stripe.
 230 * - Memory conditions may only permit a shorter IO. (A user can use
 231 * @length=~0 and check the returned ios->length for max_io_size.)
 232 *
 233 * The caller must check the returned ios->length (and/or ios->nr_pages) and
 234 * re-issue the pages that fall outside of ios->length.
235 */
236int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
66 bool is_reading, u64 offset, u64 length, 237 bool is_reading, u64 offset, u64 length,
67 struct ore_io_state **pios) 238 struct ore_io_state **pios)
68{ 239{
69 struct ore_io_state *ios; 240 struct ore_io_state *ios;
241 unsigned numdevs = layout->group_width * layout->mirrors_p1;
242 unsigned sgs_per_dev = 0, max_par_pages = 0;
243 int ret;
70 244
71 /*TODO: Maybe use kmem_cach per sbi of size 245 if (layout->parity && length) {
72 * exofs_io_state_size(layout->s_numdevs) 246 unsigned data_devs = layout->group_width - layout->parity;
73 */ 247 unsigned stripe_size = layout->stripe_unit * data_devs;
74 ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); 248 unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
75 if (unlikely(!ios)) { 249 u32 remainder;
76 ORE_DBGMSG("Failed kzalloc bytes=%d\n", 250 u64 num_stripes;
77 ore_io_state_size(comps->numdevs)); 251 u64 num_raid_units;
78 *pios = NULL; 252
79 return -ENOMEM; 253 num_stripes = div_u64_rem(length, stripe_size, &remainder);
254 if (remainder)
255 ++num_stripes;
256
257 num_raid_units = num_stripes * layout->parity;
258
259 if (is_reading) {
260 /* For reads add per_dev sglist array */
261 /* TODO: Raid 6 we need twice more. Actually:
262 * num_stripes / LCMdP(W,P);
263 * if (W%P != 0) num_stripes *= parity;
264 */
265
266 /* first/last seg is split */
267 num_raid_units += layout->group_width;
268 sgs_per_dev = div_u64(num_raid_units, data_devs);
269 } else {
270 /* For Writes add parity pages array. */
271 max_par_pages = num_raid_units * pages_in_unit *
272 sizeof(struct page *);
273 }
80 } 274 }
81 275
82 ios->layout = layout; 276 ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
83 ios->comps = comps; 277 pios);
84 ios->offset = offset; 278 if (unlikely(ret))
85 ios->length = length; 279 return ret;
280
281 ios = *pios;
86 ios->reading = is_reading; 282 ios->reading = is_reading;
283 ios->offset = offset;
284
285 if (length) {
286 ore_calc_stripe_info(layout, offset, length, &ios->si);
287 ios->length = ios->si.length;
288 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
289 if (layout->parity)
290 _ore_post_alloc_raid_stuff(ios);
291 }
87 292
88 *pios = ios;
89 return 0; 293 return 0;
90} 294}
91EXPORT_SYMBOL(ore_get_rw_state); 295EXPORT_SYMBOL(ore_get_rw_state);
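A hedged usage sketch of the contract described above (illustrative only; page attachment and most error paths are elided, and this is not code from the patch):

	/* Issue a possibly-shortened read and loop on the remainder. */
	static int read_all(struct ore_layout *layout, struct ore_components *oc,
			    u64 offset, u64 length)
	{
		while (length) {
			struct ore_io_state *ios;
			int ret = ore_get_rw_state(layout, oc, true /* read */,
						   offset, length, &ios);

			if (unlikely(ret))
				return ret;
			/* ... set ios->pages / ios->nr_pages here ... */
			ret = ore_read(ios);
			/* ios->length may be less than requested: the IO
			 * cannot cross a group boundary and must end on a
			 * stripe.
			 */
			offset += ios->length;
			length -= ios->length;
			ore_put_io_state(ios);
			if (unlikely(ret))
				return ret;
		}
		return 0;
	}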
92 296
93int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, 297/* Allocate an io_state for all the devices in the comps array
94 struct ore_io_state **ios) 298 *
299 * This version of io_state allocation is used mostly by create/remove
 300 * and trunc where we currently need all the devices. The only wasteful
301 * bit is the read/write_attributes with no IO. Those sites should
302 * be converted to use ore_get_rw_state() with length=0
303 */
304int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
305 struct ore_io_state **pios)
95{ 306{
96 return ore_get_rw_state(layout, comps, true, 0, 0, ios); 307 return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
97} 308}
98EXPORT_SYMBOL(ore_get_io_state); 309EXPORT_SYMBOL(ore_get_io_state);
99 310
@@ -111,6 +322,7 @@ void ore_put_io_state(struct ore_io_state *ios)
111 bio_put(per_dev->bio); 322 bio_put(per_dev->bio);
112 } 323 }
113 324
325 _ore_free_raid_stuff(ios);
114 kfree(ios); 326 kfree(ios);
115 } 327 }
116} 328}
@@ -138,7 +350,7 @@ static void _done_io(struct osd_request *or, void *p)
138 kref_put(&ios->kref, _last_io); 350 kref_put(&ios->kref, _last_io);
139} 351}
140 352
141static int ore_io_execute(struct ore_io_state *ios) 353int ore_io_execute(struct ore_io_state *ios)
142{ 354{
143 DECLARE_COMPLETION_ONSTACK(wait); 355 DECLARE_COMPLETION_ONSTACK(wait);
144 bool sync = (ios->done == NULL); 356 bool sync = (ios->done == NULL);
@@ -198,7 +410,7 @@ static void _clear_bio(struct bio *bio)
198 } 410 }
199} 411}
200 412
201int ore_check_io(struct ore_io_state *ios, u64 *resid) 413int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
202{ 414{
203 enum osd_err_priority acumulated_osd_err = 0; 415 enum osd_err_priority acumulated_osd_err = 0;
204 int acumulated_lin_err = 0; 416 int acumulated_lin_err = 0;
@@ -206,7 +418,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
206 418
207 for (i = 0; i < ios->numdevs; i++) { 419 for (i = 0; i < ios->numdevs; i++) {
208 struct osd_sense_info osi; 420 struct osd_sense_info osi;
209 struct osd_request *or = ios->per_dev[i].or; 421 struct ore_per_dev_state *per_dev = &ios->per_dev[i];
422 struct osd_request *or = per_dev->or;
210 int ret; 423 int ret;
211 424
212 if (unlikely(!or)) 425 if (unlikely(!or))
@@ -218,29 +431,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
218 431
219 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 432 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
220 /* start read offset passed endof file */ 433 /* start read offset passed endof file */
221 _clear_bio(ios->per_dev[i].bio); 434 _clear_bio(per_dev->bio);
222 ORE_DBGMSG("start read offset passed end of file " 435 ORE_DBGMSG("start read offset passed end of file "
223 "offset=0x%llx, length=0x%llx\n", 436 "offset=0x%llx, length=0x%llx\n",
224 _LLU(ios->per_dev[i].offset), 437 _LLU(per_dev->offset),
225 _LLU(ios->per_dev[i].length)); 438 _LLU(per_dev->length));
226 439
227 continue; /* we recovered */ 440 continue; /* we recovered */
228 } 441 }
229 442
443 if (on_dev_error) {
444 u64 residual = ios->reading ?
445 or->in.residual : or->out.residual;
446 u64 offset = (ios->offset + ios->length) - residual;
447 struct ore_dev *od = ios->oc->ods[
448 per_dev->dev - ios->oc->first_dev];
449
450 on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
451 offset, residual);
452 }
230 if (osi.osd_err_pri >= acumulated_osd_err) { 453 if (osi.osd_err_pri >= acumulated_osd_err) {
231 acumulated_osd_err = osi.osd_err_pri; 454 acumulated_osd_err = osi.osd_err_pri;
232 acumulated_lin_err = ret; 455 acumulated_lin_err = ret;
233 } 456 }
234 } 457 }
235 458
236 /* TODO: raid specific residual calculations */
237 if (resid) {
238 if (likely(!acumulated_lin_err))
239 *resid = 0;
240 else
241 *resid = ios->length;
242 }
243
244 return acumulated_lin_err; 459 return acumulated_lin_err;
245} 460}
246EXPORT_SYMBOL(ore_check_io); 461EXPORT_SYMBOL(ore_check_io);
@@ -248,61 +463,65 @@ EXPORT_SYMBOL(ore_check_io);
248/* 463/*
249 * L - logical offset into the file 464 * L - logical offset into the file
250 * 465 *
251 * U - The number of bytes in a stripe within a group 466 * D - number of Data devices
467 * D = group_width - parity
252 * 468 *
253 * U = stripe_unit * group_width 469 * U - The number of bytes in a stripe within a group
470 * U = stripe_unit * D
254 * 471 *
255 * T - The number of bytes striped within a group of component objects 472 * T - The number of bytes striped within a group of component objects
256 * (before advancing to the next group) 473 * (before advancing to the next group)
257 * 474 * T = U * group_depth
258 * T = stripe_unit * group_width * group_depth
259 * 475 *
260 * S - The number of bytes striped across all component objects 476 * S - The number of bytes striped across all component objects
261 * before the pattern repeats 477 * before the pattern repeats
478 * S = T * group_count
262 * 479 *
263 * S = stripe_unit * group_width * group_depth * group_count 480 * M - The "major" (i.e., across all components) cycle number
264 *
265 * M - The "major" (i.e., across all components) stripe number
266 *
267 * M = L / S 481 * M = L / S
268 * 482 *
269 * G - Counts the groups from the beginning of the major stripe 483 * G - Counts the groups from the beginning of the major cycle
270 *
271 * G = (L - (M * S)) / T [or (L % S) / T] 484 * G = (L - (M * S)) / T [or (L % S) / T]
272 * 485 *
273 * H - The byte offset within the group 486 * H - The byte offset within the group
274 *
275 * H = (L - (M * S)) % T [or (L % S) % T] 487 * H = (L - (M * S)) % T [or (L % S) % T]
276 * 488 *
277 * N - The "minor" (i.e., across the group) stripe number 489 * N - The "minor" (i.e., across the group) stripe number
278 *
279 * N = H / U 490 * N = H / U
280 * 491 *
 281 * C - The component index corresponding to L 492 * C - The component index corresponding to L
282 * 493 *
283 * C = (H - (N * U)) / stripe_unit + G * group_width 494 * C = (H - (N * U)) / stripe_unit + G * D
284 * [or (L % U) / stripe_unit + G * group_width] 495 * [or (L % U) / stripe_unit + G * D]
285 * 496 *
 286 * O - The component offset corresponding to L 497 * O - The component offset corresponding to L
287 *
288 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit 498 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
499 *
500 * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
501 * divide by parity
502 * LCMdP = lcm(group_width, parity) / parity
503 *
504 * R - The parity Rotation stripe
505 * (Note parity cycle always starts at a group's boundary)
506 * R = N % LCMdP
507 *
508 * I = the first parity device index
509 * I = (group_width + group_width - R*parity - parity) % group_width
510 *
511 * Craid - The component index Rotated
512 * Craid = (group_width + C - R*parity) % group_width
513 * (We add the group_width to avoid negative numbers modulo math)
289 */ 514 */
290struct _striping_info { 515void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
291 u64 obj_offset; 516 u64 length, struct ore_striping_info *si)
292 u64 group_length;
293 u64 M; /* for truncate */
294 unsigned dev;
295 unsigned unit_off;
296};
297
298static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
299 struct _striping_info *si)
300{ 517{
301 u32 stripe_unit = layout->stripe_unit; 518 u32 stripe_unit = layout->stripe_unit;
302 u32 group_width = layout->group_width; 519 u32 group_width = layout->group_width;
303 u64 group_depth = layout->group_depth; 520 u64 group_depth = layout->group_depth;
521 u32 parity = layout->parity;
304 522
305 u32 U = stripe_unit * group_width; 523 u32 D = group_width - parity;
524 u32 U = D * stripe_unit;
306 u64 T = U * group_depth; 525 u64 T = U * group_depth;
307 u64 S = T * layout->group_count; 526 u64 S = T * layout->group_count;
308 u64 M = div64_u64(file_offset, S); 527 u64 M = div64_u64(file_offset, S);
@@ -318,39 +537,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
318 u32 N = div_u64(H, U); 537 u32 N = div_u64(H, U);
319 538
320 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 539 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
321 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; 540 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
322 si->dev *= layout->mirrors_p1;
323 541
324 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 542 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
325 543
326 si->obj_offset = si->unit_off + (N * stripe_unit) + 544 si->obj_offset = si->unit_off + (N * stripe_unit) +
327 (M * group_depth * stripe_unit); 545 (M * group_depth * stripe_unit);
328 546
329 si->group_length = T - H; 547 if (parity) {
548 u32 LCMdP = lcm(group_width, parity) / parity;
549 /* R = N % LCMdP; */
550 u32 RxP = (N % LCMdP) * parity;
551 u32 first_dev = C - C % group_width;
552
553 si->par_dev = (group_width + group_width - parity - RxP) %
554 group_width + first_dev;
555 si->dev = (group_width + C - RxP) % group_width + first_dev;
556 si->bytes_in_stripe = U;
557 si->first_stripe_start = M * S + G * T + N * U;
558 } else {
 559 /* Make the math correct; see _prepare_one_group */
560 si->par_dev = group_width;
561 si->dev = C;
562 }
563
564 si->dev *= layout->mirrors_p1;
565 si->par_dev *= layout->mirrors_p1;
566 si->offset = file_offset;
567 si->length = T - H;
568 if (si->length > length)
569 si->length = length;
330 si->M = M; 570 si->M = M;
331} 571}
572EXPORT_SYMBOL(ore_calc_stripe_info);
332 573
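A worked numeric example of the mapping above may help (standalone userspace C; the layout numbers are invented for illustration, RAID0 so D == group_width):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long su = 64 << 10;	/* stripe_unit: 64K   */
		unsigned long long D = 4;		/* data devices       */
		unsigned long long gd = 2, gc = 2;	/* depth, group count */
		unsigned long long U = D * su;		/* 256K */
		unsigned long long T = U * gd;		/* 512K */
		unsigned long long S = T * gc;		/* 1M   */
		unsigned long long L = 700 << 10;	/* file offset: 700K  */

		unsigned long long M = L / S;			 /* 0    */
		unsigned long long G = (L % S) / T;		 /* 1    */
		unsigned long long H = (L % S) % T;		 /* 188K */
		unsigned long long N = H / U;			 /* 0    */
		unsigned long long C = (H - N * U) / su + G * D; /* 2+4=6 */
		unsigned long long O = L % su + N * su + M * gd * su;

		printf("M=%llu G=%llu H=%llu N=%llu C=%llu O=%llu\n",
		       M, G, H, N, C, O);	/* O = 60K = 61440 */
		return 0;
	}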
333static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, 574int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
334 unsigned pgbase, struct ore_per_dev_state *per_dev, 575 unsigned pgbase, struct page **pages,
335 int cur_len) 576 struct ore_per_dev_state *per_dev, int cur_len)
336{ 577{
337 unsigned pg = *cur_pg; 578 unsigned pg = *cur_pg;
338 struct request_queue *q = 579 struct request_queue *q =
339 osd_request_queue(_ios_od(ios, per_dev->dev)); 580 osd_request_queue(_ios_od(ios, per_dev->dev));
340 581 unsigned len = cur_len;
341 per_dev->length += cur_len; 582 int ret;
342 583
343 if (per_dev->bio == NULL) { 584 if (per_dev->bio == NULL) {
344 unsigned pages_in_stripe = ios->layout->group_width * 585 unsigned pages_in_stripe = ios->layout->group_width *
345 (ios->layout->stripe_unit / PAGE_SIZE); 586 (ios->layout->stripe_unit / PAGE_SIZE);
346 unsigned bio_size = (ios->nr_pages + pages_in_stripe) / 587 unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
347 ios->layout->group_width; 588 (ios->layout->group_width -
589 ios->layout->parity);
590 unsigned bio_size = (nr_pages + pages_in_stripe) /
591 ios->layout->group_width;
348 592
349 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 593 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
350 if (unlikely(!per_dev->bio)) { 594 if (unlikely(!per_dev->bio)) {
351 ORE_DBGMSG("Failed to allocate BIO size=%u\n", 595 ORE_DBGMSG("Failed to allocate BIO size=%u\n",
352 bio_size); 596 bio_size);
353 return -ENOMEM; 597 ret = -ENOMEM;
598 goto out;
354 } 599 }
355 } 600 }
356 601
@@ -358,64 +603,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
358 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); 603 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
359 unsigned added_len; 604 unsigned added_len;
360 605
361 BUG_ON(ios->nr_pages <= pg);
362 cur_len -= pglen; 606 cur_len -= pglen;
363 607
364 added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], 608 added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
365 pglen, pgbase); 609 pglen, pgbase);
366 if (unlikely(pglen != added_len)) 610 if (unlikely(pglen != added_len)) {
367 return -ENOMEM; 611 ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
612 per_dev->bio->bi_vcnt);
613 ret = -ENOMEM;
614 goto out;
615 }
616 _add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
617
368 pgbase = 0; 618 pgbase = 0;
369 ++pg; 619 ++pg;
370 } 620 }
371 BUG_ON(cur_len); 621 BUG_ON(cur_len);
372 622
623 per_dev->length += len;
373 *cur_pg = pg; 624 *cur_pg = pg;
374 return 0; 625 ret = 0;
626out: /* we fail the complete unit on an error eg don't advance
627 * per_dev->length and cur_pg. This means that we might have a bigger
628 * bio than the CDB requested length (per_dev->length). That's fine
629 * only the oposite is fatal.
630 */
631 return ret;
375} 632}
376 633
377static int _prepare_one_group(struct ore_io_state *ios, u64 length, 634static int _prepare_for_striping(struct ore_io_state *ios)
378 struct _striping_info *si)
379{ 635{
636 struct ore_striping_info *si = &ios->si;
380 unsigned stripe_unit = ios->layout->stripe_unit; 637 unsigned stripe_unit = ios->layout->stripe_unit;
381 unsigned mirrors_p1 = ios->layout->mirrors_p1; 638 unsigned mirrors_p1 = ios->layout->mirrors_p1;
382 unsigned devs_in_group = ios->layout->group_width * mirrors_p1; 639 unsigned group_width = ios->layout->group_width;
640 unsigned devs_in_group = group_width * mirrors_p1;
383 unsigned dev = si->dev; 641 unsigned dev = si->dev;
384 unsigned first_dev = dev - (dev % devs_in_group); 642 unsigned first_dev = dev - (dev % devs_in_group);
385 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; 643 unsigned dev_order;
386 unsigned cur_pg = ios->pages_consumed; 644 unsigned cur_pg = ios->pages_consumed;
645 u64 length = ios->length;
387 int ret = 0; 646 int ret = 0;
388 647
648 if (!ios->pages) {
649 ios->numdevs = ios->layout->mirrors_p1;
650 return 0;
651 }
652
653 BUG_ON(length > si->length);
654
655 dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
656 si->cur_comp = dev_order;
657 si->cur_pg = si->unit_off / PAGE_SIZE;
658
389 while (length) { 659 while (length) {
390 struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; 660 unsigned comp = dev - first_dev;
661 struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
391 unsigned cur_len, page_off = 0; 662 unsigned cur_len, page_off = 0;
392 663
393 if (!per_dev->length) { 664 if (!per_dev->length) {
394 per_dev->dev = dev; 665 per_dev->dev = dev;
395 if (dev < si->dev) { 666 if (dev == si->dev) {
396 per_dev->offset = si->obj_offset + stripe_unit - 667 WARN_ON(dev == si->par_dev);
397 si->unit_off;
398 cur_len = stripe_unit;
399 } else if (dev == si->dev) {
400 per_dev->offset = si->obj_offset; 668 per_dev->offset = si->obj_offset;
401 cur_len = stripe_unit - si->unit_off; 669 cur_len = stripe_unit - si->unit_off;
402 page_off = si->unit_off & ~PAGE_MASK; 670 page_off = si->unit_off & ~PAGE_MASK;
403 BUG_ON(page_off && (page_off != ios->pgbase)); 671 BUG_ON(page_off && (page_off != ios->pgbase));
404 } else { /* dev > si->dev */ 672 } else {
405 per_dev->offset = si->obj_offset - si->unit_off; 673 if (si->cur_comp > dev_order)
674 per_dev->offset =
675 si->obj_offset - si->unit_off;
676 else /* si->cur_comp < dev_order */
677 per_dev->offset =
678 si->obj_offset + stripe_unit -
679 si->unit_off;
406 cur_len = stripe_unit; 680 cur_len = stripe_unit;
407 } 681 }
408
409 if (max_comp < dev)
410 max_comp = dev;
411 } else { 682 } else {
412 cur_len = stripe_unit; 683 cur_len = stripe_unit;
413 } 684 }
414 if (cur_len >= length) 685 if (cur_len >= length)
415 cur_len = length; 686 cur_len = length;
416 687
417 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, 688 ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
418 cur_len); 689 per_dev, cur_len);
419 if (unlikely(ret)) 690 if (unlikely(ret))
420 goto out; 691 goto out;
421 692
@@ -423,60 +694,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
423 dev = (dev % devs_in_group) + first_dev; 694 dev = (dev % devs_in_group) + first_dev;
424 695
425 length -= cur_len; 696 length -= cur_len;
426 }
427out:
428 ios->numdevs = max_comp + mirrors_p1;
429 ios->pages_consumed = cur_pg;
430 return ret;
431}
432
433static int _prepare_for_striping(struct ore_io_state *ios)
434{
435 u64 length = ios->length;
436 u64 offset = ios->offset;
437 struct _striping_info si;
438 int ret = 0;
439 697
440 if (!ios->pages) { 698 si->cur_comp = (si->cur_comp + 1) % group_width;
441 if (ios->kern_buff) { 699 if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
442 struct ore_per_dev_state *per_dev = &ios->per_dev[0]; 700 if (!length && ios->sp2d) {
701 /* If we are writing and this is the very last
702 * stripe. then operate on parity dev.
703 */
704 dev = si->par_dev;
705 }
706 if (ios->sp2d)
 707 /* In writes cur_len just indicates whether this
 708 * is the last one. See _ore_add_parity_unit.
709 */
710 cur_len = length;
711 per_dev = &ios->per_dev[dev - first_dev];
712 if (!per_dev->length) {
713 /* Only/always the parity unit of the first
714 * stripe will be empty. So this is a chance to
715 * initialize the per_dev info.
716 */
717 per_dev->dev = dev;
718 per_dev->offset = si->obj_offset - si->unit_off;
719 }
443 720
444 _calc_stripe_info(ios->layout, ios->offset, &si); 721 ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
445 per_dev->offset = si.obj_offset; 722 if (unlikely(ret))
446 per_dev->dev = si.dev; 723 goto out;
447 724
 448 /* no cross device without page array */ 725 /* Rotate next par_dev backwards with wrapping */
449 BUG_ON((ios->layout->group_width > 1) && 726 si->par_dev = (devs_in_group + si->par_dev -
450 (si.unit_off + ios->length > 727 ios->layout->parity * mirrors_p1) %
451 ios->layout->stripe_unit)); 728 devs_in_group + first_dev;
729 /* Next stripe, start fresh */
730 si->cur_comp = 0;
731 si->cur_pg = 0;
452 } 732 }
453 ios->numdevs = ios->layout->mirrors_p1;
454 return 0;
455 }
456
457 while (length) {
458 _calc_stripe_info(ios->layout, offset, &si);
459
460 if (length < si.group_length)
461 si.group_length = length;
462
463 ret = _prepare_one_group(ios, si.group_length, &si);
464 if (unlikely(ret))
465 goto out;
466
467 offset += si.group_length;
468 length -= si.group_length;
469 } 733 }
470
471out: 734out:
472 return ret; 735 ios->numdevs = devs_in_group;
736 ios->pages_consumed = cur_pg;
737 if (unlikely(ret)) {
738 if (length == ios->length)
739 return ret;
740 else
741 ios->length -= length;
742 }
743 return 0;
473} 744}
474 745
475int ore_create(struct ore_io_state *ios) 746int ore_create(struct ore_io_state *ios)
476{ 747{
477 int i, ret; 748 int i, ret;
478 749
479 for (i = 0; i < ios->comps->numdevs; i++) { 750 for (i = 0; i < ios->oc->numdevs; i++) {
480 struct osd_request *or; 751 struct osd_request *or;
481 752
482 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); 753 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -501,7 +772,7 @@ int ore_remove(struct ore_io_state *ios)
501{ 772{
502 int i, ret; 773 int i, ret;
503 774
504 for (i = 0; i < ios->comps->numdevs; i++) { 775 for (i = 0; i < ios->oc->numdevs; i++) {
505 struct osd_request *or; 776 struct osd_request *or;
506 777
507 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); 778 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -543,7 +814,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
543 goto out; 814 goto out;
544 } 815 }
545 per_dev->or = or; 816 per_dev->or = or;
546 per_dev->offset = master_dev->offset;
547 817
548 if (ios->pages) { 818 if (ios->pages) {
549 struct bio *bio; 819 struct bio *bio;
@@ -562,6 +832,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
562 __bio_clone(bio, master_dev->bio); 832 __bio_clone(bio, master_dev->bio);
563 bio->bi_bdev = NULL; 833 bio->bi_bdev = NULL;
564 bio->bi_next = NULL; 834 bio->bi_next = NULL;
835 per_dev->offset = master_dev->offset;
565 per_dev->length = master_dev->length; 836 per_dev->length = master_dev->length;
566 per_dev->bio = bio; 837 per_dev->bio = bio;
567 per_dev->dev = dev; 838 per_dev->dev = dev;
@@ -579,7 +850,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
579 _LLU(per_dev->offset), 850 _LLU(per_dev->offset),
580 _LLU(per_dev->length), dev); 851 _LLU(per_dev->length), dev);
581 } else if (ios->kern_buff) { 852 } else if (ios->kern_buff) {
582 ret = osd_req_write_kern(or, _ios_obj(ios, dev), 853 per_dev->offset = ios->si.obj_offset;
854 per_dev->dev = ios->si.dev + dev;
855
856 /* no cross device without page array */
857 BUG_ON((ios->layout->group_width > 1) &&
858 (ios->si.unit_off + ios->length >
859 ios->layout->stripe_unit));
860
861 ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
583 per_dev->offset, 862 per_dev->offset,
584 ios->kern_buff, ios->length); 863 ios->kern_buff, ios->length);
585 if (unlikely(ret)) 864 if (unlikely(ret))
@@ -588,7 +867,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
588 "length=0x%llx dev=%d\n", 867 "length=0x%llx dev=%d\n",
589 _LLU(_ios_obj(ios, dev)->id), 868 _LLU(_ios_obj(ios, dev)->id),
590 _LLU(per_dev->offset), 869 _LLU(per_dev->offset),
591 _LLU(ios->length), dev); 870 _LLU(ios->length), per_dev->dev);
592 } else { 871 } else {
593 osd_req_set_attributes(or, _ios_obj(ios, dev)); 872 osd_req_set_attributes(or, _ios_obj(ios, dev));
594 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", 873 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
@@ -614,6 +893,14 @@ int ore_write(struct ore_io_state *ios)
614 int i; 893 int i;
615 int ret; 894 int ret;
616 895
896 if (unlikely(ios->sp2d && !ios->r4w)) {
897 /* A library is attempting a RAID-write without providing
898 * a pages lock interface.
899 */
900 WARN_ON_ONCE(1);
901 return -ENOTSUPP;
902 }
903
617 ret = _prepare_for_striping(ios); 904 ret = _prepare_for_striping(ios);
618 if (unlikely(ret)) 905 if (unlikely(ret))
619 return ret; 906 return ret;
@@ -629,7 +916,7 @@ int ore_write(struct ore_io_state *ios)
629} 916}
630EXPORT_SYMBOL(ore_write); 917EXPORT_SYMBOL(ore_write);
631 918
632static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) 919int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
633{ 920{
634 struct osd_request *or; 921 struct osd_request *or;
635 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 922 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
@@ -648,22 +935,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
648 per_dev->or = or; 935 per_dev->or = or;
649 936
650 if (ios->pages) { 937 if (ios->pages) {
651 osd_req_read(or, obj, per_dev->offset, 938 if (per_dev->cur_sg) {
652 per_dev->bio, per_dev->length); 939 /* finalize the last sg_entry */
940 _ore_add_sg_seg(per_dev, 0, false);
941 if (unlikely(!per_dev->cur_sg))
 942 return 0; /* Skip parity-only device */
943
944 osd_req_read_sg(or, obj, per_dev->bio,
945 per_dev->sglist, per_dev->cur_sg);
946 } else {
947 /* The no raid case */
948 osd_req_read(or, obj, per_dev->offset,
949 per_dev->bio, per_dev->length);
950 }
951
653 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 952 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
654 " dev=%d\n", _LLU(obj->id), 953 " dev=%d sg_len=%d\n", _LLU(obj->id),
655 _LLU(per_dev->offset), _LLU(per_dev->length), 954 _LLU(per_dev->offset), _LLU(per_dev->length),
656 first_dev); 955 first_dev, per_dev->cur_sg);
657 } else if (ios->kern_buff) {
658 int ret = osd_req_read_kern(or, obj, per_dev->offset,
659 ios->kern_buff, ios->length);
660 ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
661 "length=0x%llx dev=%d ret=>%d\n",
662 _LLU(obj->id), _LLU(per_dev->offset),
663 _LLU(ios->length), first_dev, ret);
664 if (unlikely(ret))
665 return ret;
666 } else { 956 } else {
957 BUG_ON(ios->kern_buff);
958
667 osd_req_get_attributes(or, obj); 959 osd_req_get_attributes(or, obj);
668 ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", 960 ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
669 _LLU(obj->id), 961 _LLU(obj->id),
@@ -688,7 +980,7 @@ int ore_read(struct ore_io_state *ios)
688 return ret; 980 return ret;
689 981
690 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 982 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
691 ret = _read_mirror(ios, i); 983 ret = _ore_read_mirror(ios, i);
692 if (unlikely(ret)) 984 if (unlikely(ret))
693 return ret; 985 return ret;
694 } 986 }
@@ -744,31 +1036,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
744} 1036}
745 1037
746struct _trunc_info { 1038struct _trunc_info {
747 struct _striping_info si; 1039 struct ore_striping_info si;
748 u64 prev_group_obj_off; 1040 u64 prev_group_obj_off;
749 u64 next_group_obj_off; 1041 u64 next_group_obj_off;
750 1042
751 unsigned first_group_dev; 1043 unsigned first_group_dev;
752 unsigned nex_group_dev; 1044 unsigned nex_group_dev;
753 unsigned max_devs;
754}; 1045};
755 1046
756void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, 1047static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
757 struct _trunc_info *ti) 1048 struct _trunc_info *ti)
758{ 1049{
759 unsigned stripe_unit = layout->stripe_unit; 1050 unsigned stripe_unit = layout->stripe_unit;
760 1051
761 _calc_stripe_info(layout, file_offset, &ti->si); 1052 ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
762 1053
763 ti->prev_group_obj_off = ti->si.M * stripe_unit; 1054 ti->prev_group_obj_off = ti->si.M * stripe_unit;
764 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; 1055 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
765 1056
766 ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); 1057 ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
767 ti->nex_group_dev = ti->first_group_dev + layout->group_width; 1058 ti->nex_group_dev = ti->first_group_dev + layout->group_width;
768 ti->max_devs = layout->group_width * layout->group_count;
769} 1059}
770 1060
771int ore_truncate(struct ore_layout *layout, struct ore_components *comps, 1061int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
772 u64 size) 1062 u64 size)
773{ 1063{
774 struct ore_io_state *ios; 1064 struct ore_io_state *ios;
@@ -779,22 +1069,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
779 struct _trunc_info ti; 1069 struct _trunc_info ti;
780 int i, ret; 1070 int i, ret;
781 1071
782 ret = ore_get_io_state(layout, comps, &ios); 1072 ret = ore_get_io_state(layout, oc, &ios);
783 if (unlikely(ret)) 1073 if (unlikely(ret))
784 return ret; 1074 return ret;
785 1075
786 _calc_trunk_info(ios->layout, size, &ti); 1076 _calc_trunk_info(ios->layout, size, &ti);
787 1077
788 size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), 1078 size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
789 GFP_KERNEL); 1079 GFP_KERNEL);
790 if (unlikely(!size_attrs)) { 1080 if (unlikely(!size_attrs)) {
791 ret = -ENOMEM; 1081 ret = -ENOMEM;
792 goto out; 1082 goto out;
793 } 1083 }
794 1084
795 ios->numdevs = ios->comps->numdevs; 1085 ios->numdevs = ios->oc->numdevs;
796 1086
797 for (i = 0; i < ti.max_devs; ++i) { 1087 for (i = 0; i < ios->numdevs; ++i) {
798 struct exofs_trunc_attr *size_attr = &size_attrs[i]; 1088 struct exofs_trunc_attr *size_attr = &size_attrs[i];
799 u64 obj_size; 1089 u64 obj_size;
800 1090
@@ -815,7 +1105,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
815 size_attr->attr.val_ptr = &size_attr->newsize; 1105 size_attr->attr.val_ptr = &size_attr->newsize;
816 1106
817 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", 1107 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
818 _LLU(comps->comps->obj.id), _LLU(obj_size), i); 1108 _LLU(oc->comps->obj.id), _LLU(obj_size), i);
819 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, 1109 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
820 &size_attr->attr); 1110 &size_attr->attr);
821 if (unlikely(ret)) 1111 if (unlikely(ret))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 000000000000..29c47e5c4a86
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,660 @@
1/*
2 * Copyright (C) 2011
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of the objects raid engine (ore).
6 *
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
14 */
15
16#include <linux/gfp.h>
17#include <linux/async_tx.h>
18
19#include "ore_raid.h"
20
21#undef ORE_DBGMSG2
22#define ORE_DBGMSG2 ORE_DBGMSG
23
24struct page *_raid_page_alloc(void)
25{
26 return alloc_page(GFP_KERNEL);
27}
28
29void _raid_page_free(struct page *p)
30{
31 __free_page(p);
32}
33
 34/* This struct is forward declared in ore_io_state, but is private to here.
35 * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
36 *
37 * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
38 * Ascending page index access is sp2d(p-minor, c-major). But storage is
 39 * sp2d[p-minor][c-major], so it can be properly presented to the async-xor
40 * API.
41 */
42struct __stripe_pages_2d {
43 /* Cache some hot path repeated calculations */
44 unsigned parity;
45 unsigned data_devs;
46 unsigned pages_in_unit;
47
 48 bool needed;
49
50 /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
51 struct __1_page_stripe {
52 bool alloc;
53 unsigned write_count;
54 struct async_submit_ctl submit;
55 struct dma_async_tx_descriptor *tx;
56
57 /* The size of this array is data_devs + parity */
58 struct page **pages;
59 struct page **scribble;
60 /* bool array, size of this array is data_devs */
61 char *page_is_read;
62 } _1p_stripes[];
63};
64
 65/* This can get bigger than a page. So support multiple page allocations.
 66 * _sp2d_free should be called even if _sp2d_alloc fails (by returning
 67 * non-zero).
68 */
69static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
70 unsigned parity, struct __stripe_pages_2d **psp2d)
71{
72 struct __stripe_pages_2d *sp2d;
73 unsigned data_devs = group_width - parity;
74 struct _alloc_all_bytes {
75 struct __alloc_stripe_pages_2d {
76 struct __stripe_pages_2d sp2d;
77 struct __1_page_stripe _1p_stripes[pages_in_unit];
78 } __asp2d;
79 struct __alloc_1p_arrays {
80 struct page *pages[group_width];
81 struct page *scribble[group_width];
82 char page_is_read[data_devs];
83 } __a1pa[pages_in_unit];
84 } *_aab;
85 struct __alloc_1p_arrays *__a1pa;
86 struct __alloc_1p_arrays *__a1pa_end;
87 const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
88 unsigned num_a1pa, alloc_size, i;
89
90 /* FIXME: check these numbers in ore_verify_layout */
91 BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
92 BUG_ON(sizeof__a1pa > PAGE_SIZE);
93
94 if (sizeof(*_aab) > PAGE_SIZE) {
95 num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
96 alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
97 } else {
98 num_a1pa = pages_in_unit;
99 alloc_size = sizeof(*_aab);
100 }
101
102 _aab = kzalloc(alloc_size, GFP_KERNEL);
103 if (unlikely(!_aab)) {
104 ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
105 return -ENOMEM;
106 }
107
108 sp2d = &_aab->__asp2d.sp2d;
109 *psp2d = sp2d; /* From here Just call _sp2d_free */
110
111 __a1pa = _aab->__a1pa;
112 __a1pa_end = __a1pa + num_a1pa;
113
114 for (i = 0; i < pages_in_unit; ++i) {
115 if (unlikely(__a1pa >= __a1pa_end)) {
116 num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
117 pages_in_unit - i);
118
119 __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL);
120 if (unlikely(!__a1pa)) {
121 ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
122 num_a1pa);
123 return -ENOMEM;
124 }
125 __a1pa_end = __a1pa + num_a1pa;
126 /* First *pages is marked for kfree of the buffer */
127 sp2d->_1p_stripes[i].alloc = true;
128 }
129
130 sp2d->_1p_stripes[i].pages = __a1pa->pages;
 131 sp2d->_1p_stripes[i].scribble = __a1pa->scribble;
132 sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
133 ++__a1pa;
134 }
135
136 sp2d->parity = parity;
137 sp2d->data_devs = data_devs;
138 sp2d->pages_in_unit = pages_in_unit;
139 return 0;
140}
141
142static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
143 const struct _ore_r4w_op *r4w, void *priv)
144{
145 unsigned data_devs = sp2d->data_devs;
146 unsigned group_width = data_devs + sp2d->parity;
147 unsigned p;
148
149 if (!sp2d->needed)
150 return;
151
152 for (p = 0; p < sp2d->pages_in_unit; p++) {
153 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
154
155 if (_1ps->write_count < group_width) {
156 unsigned c;
157
158 for (c = 0; c < data_devs; c++)
159 if (_1ps->page_is_read[c]) {
160 struct page *page = _1ps->pages[c];
161
162 r4w->put_page(priv, page);
163 _1ps->page_is_read[c] = false;
164 }
165 }
166
167 memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
168 _1ps->write_count = 0;
169 _1ps->tx = NULL;
170 }
171
172 sp2d->needed = false;
173}
174
175static void _sp2d_free(struct __stripe_pages_2d *sp2d)
176{
177 unsigned i;
178
179 if (!sp2d)
180 return;
181
182 for (i = 0; i < sp2d->pages_in_unit; ++i) {
183 if (sp2d->_1p_stripes[i].alloc)
184 kfree(sp2d->_1p_stripes[i].pages);
185 }
186
187 kfree(sp2d);
188}
189
190static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
191{
192 unsigned p;
193
194 for (p = 0; p < sp2d->pages_in_unit; p++) {
195 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
196
197 if (_1ps->write_count)
198 return p;
199 }
200
201 return ~0;
202}
203
204static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
205{
 206 int p; /* signed: the loop below must be able to count down past 0 */
207
208 for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
209 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
210
211 if (_1ps->write_count)
212 return p;
213 }
214
215 return ~0;
216}
217
218static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
219{
220 unsigned p;
221 for (p = 0; p < sp2d->pages_in_unit; p++) {
222 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
223
224 if (!_1ps->write_count)
225 continue;
226
227 init_async_submit(&_1ps->submit,
228 ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
229 NULL,
230 NULL, NULL,
231 (addr_conv_t *)_1ps->scribble);
232
233 /* TODO: raid6 */
234 _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
235 0, sp2d->data_devs, PAGE_SIZE,
236 &_1ps->submit);
237 }
238
239 for (p = 0; p < sp2d->pages_in_unit; p++) {
240 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
241 /* NOTE: We wait for HW synchronously (I don't have such HW
 242 * to test with.) Is parallelism needed with today's
 243 * multi-core CPUs?
244 */
245 async_tx_issue_pending(_1ps->tx);
246 }
247}
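For intuition, a minimal plain-C sketch of the parity computed here (not the async_tx path; PAGE_SIZE is a local stand-in):

	#include <stdio.h>
	#include <string.h>

	#define PAGE_SIZE 4096

	/* parity[] becomes the byte-wise XOR of data_devs source pages,
	 * which is what async_xor produces per horizontal page-stripe.
	 */
	static void xor_parity(unsigned char *parity,
			       unsigned char *data[], unsigned data_devs)
	{
		unsigned d, i;

		memset(parity, 0, PAGE_SIZE);
		for (d = 0; d < data_devs; d++)
			for (i = 0; i < PAGE_SIZE; i++)
				parity[i] ^= data[d][i];
	}

	int main(void)
	{
		static unsigned char d0[PAGE_SIZE], d1[PAGE_SIZE], p[PAGE_SIZE];
		unsigned char *data[] = { d0, d1 };

		d0[0] = 0xa5;
		d1[0] = 0x5a;
		xor_parity(p, data, 2);
		printf("p[0] = 0x%02x\n", p[0]);	/* 0xff */
		return 0;
	}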
248
249void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
250 struct ore_striping_info *si, struct page *page)
251{
252 struct __1_page_stripe *_1ps;
253
254 sp2d->needed = true;
255
256 _1ps = &sp2d->_1p_stripes[si->cur_pg];
257 _1ps->pages[si->cur_comp] = page;
258 ++_1ps->write_count;
259
260 si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
261 /* si->cur_comp is advanced outside at main loop */
262}
263
264void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
265 bool not_last)
266{
267 struct osd_sg_entry *sge;
268
269 ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
270 "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
271 per_dev->dev, cur_len, not_last, per_dev->cur_sg,
272 _LLU(per_dev->offset), per_dev->length,
273 per_dev->last_sgs_total);
274
275 if (!per_dev->cur_sg) {
276 sge = per_dev->sglist;
277
278 /* First time we prepare two entries */
279 if (per_dev->length) {
280 ++per_dev->cur_sg;
281 sge->offset = per_dev->offset;
282 sge->len = per_dev->length;
283 } else {
284 /* Here the parity is the first unit of this object.
285 * This happens every time we reach a parity device on
286 * the same stripe as the per_dev->offset. We need to
287 * just skip this unit.
288 */
289 per_dev->offset += cur_len;
290 return;
291 }
292 } else {
293 /* finalize the last one */
294 sge = &per_dev->sglist[per_dev->cur_sg - 1];
295 sge->len = per_dev->length - per_dev->last_sgs_total;
296 }
297
298 if (not_last) {
299 /* Partly prepare the next one */
300 struct osd_sg_entry *next_sge = sge + 1;
301
302 ++per_dev->cur_sg;
303 next_sge->offset = sge->offset + sge->len + cur_len;
 304 /* Save cur len so we know how much was added next time */
305 per_dev->last_sgs_total = per_dev->length;
306 next_sge->len = 0;
307 } else if (!sge->len) {
308 /* Optimize for when the last unit is a parity */
309 --per_dev->cur_sg;
310 }
311}
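To visualize the result, a small standalone sketch (userspace C with a stand-in sg_entry type; the offsets are invented) of the sg list this builds when one 64K parity unit is skipped between two data units on a read:

	#include <stdio.h>

	struct sg_entry {
		unsigned long long offset;
		unsigned len;
	};

	int main(void)
	{
		/* entry 0 covers [0, 64K); the parity unit at [64K, 128K)
		 * is skipped, so entry 1 starts after a 64K gap.
		 */
		struct sg_entry sgl[] = {
			{ .offset = 0,         .len = 64 << 10 },
			{ .offset = 128 << 10, .len = 64 << 10 },
		};
		unsigned i;

		for (i = 0; i < 2; i++)
			printf("sge[%u] offset=0x%llx len=0x%x\n",
			       i, sgl[i].offset, sgl[i].len);
		return 0;
	}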
312
313static int _alloc_read_4_write(struct ore_io_state *ios)
314{
315 struct ore_layout *layout = ios->layout;
316 int ret;
 317 /* We want to only read those pages not in cache, so the worst
 318 * case is a stripe populated with every other page.
319 */
320 unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
321
322 ret = _ore_get_io_state(layout, ios->oc,
323 layout->group_width * layout->mirrors_p1,
324 sgs_per_dev, 0, &ios->ios_read_4_write);
325 return ret;
326}
327
328/* @si contains info of the to-be-inserted page. Update of @si should be
 329 * maintained by the caller. Specifically si->dev, si->obj_offset, ...
330 */
331static int _add_to_read_4_write(struct ore_io_state *ios,
332 struct ore_striping_info *si, struct page *page)
333{
334 struct request_queue *q;
335 struct ore_per_dev_state *per_dev;
336 struct ore_io_state *read_ios;
337 unsigned first_dev = si->dev - (si->dev %
338 (ios->layout->group_width * ios->layout->mirrors_p1));
339 unsigned comp = si->dev - first_dev;
340 unsigned added_len;
341
342 if (!ios->ios_read_4_write) {
343 int ret = _alloc_read_4_write(ios);
344
345 if (unlikely(ret))
346 return ret;
347 }
348
349 read_ios = ios->ios_read_4_write;
350 read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
351
352 per_dev = &read_ios->per_dev[comp];
353 if (!per_dev->length) {
354 per_dev->bio = bio_kmalloc(GFP_KERNEL,
355 ios->sp2d->pages_in_unit);
356 if (unlikely(!per_dev->bio)) {
357 ORE_DBGMSG("Failed to allocate BIO size=%u\n",
358 ios->sp2d->pages_in_unit);
359 return -ENOMEM;
360 }
361 per_dev->offset = si->obj_offset;
362 per_dev->dev = si->dev;
363 } else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
364 u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
365
366 _ore_add_sg_seg(per_dev, gap, true);
367 }
368 q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
369 added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0);
370 if (unlikely(added_len != PAGE_SIZE)) {
371 ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
372 per_dev->bio->bi_vcnt);
373 return -ENOMEM;
374 }
375
376 per_dev->length += PAGE_SIZE;
377 return 0;
378}
379
380static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
381{
382 struct bio_vec *bv;
383 unsigned i, d;
384
385 /* loop on all devices all pages */
386 for (d = 0; d < ios->numdevs; d++) {
387 struct bio *bio = ios->per_dev[d].bio;
388
389 if (!bio)
390 continue;
391
392 __bio_for_each_segment(bv, bio, i, 0) {
393 struct page *page = bv->bv_page;
394
395 SetPageUptodate(page);
396 if (PageError(page))
397 ClearPageError(page);
398 }
399 }
400}
401
402/* read_4_write is hacked to read the start of the first stripe and/or
 403 * the end of the last stripe, if needed, with an sg-gap at each device/page.
 404 * It is assumed to be called after the to_be_written pages of the first
 405 * stripe have populated ios->sp2d[][].
 406 *
 407 * NOTE: We call ios->r4w->get_page for all pages needed for parity
 408 * calculations. These pages are held at sp2d[p].pages[c] but with
 409 * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are released
 410 * via ios->r4w->put_page(). get_page might signal that the page is already
 411 * @uptodate=true, so we don't need to read it, only unlock it, after IO.
412 *
 413 * TODO: The read_4_write should calc a need_to_read_pages_count; if bigger
 414 * than the to-be-written count, we should consider the xor-in-place mode.
 415 * need_to_read_pages_count is the actual number of pages not present in
 416 * cache. Maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
 417 * approximation? In this mode the read pages are put in the empty places of
 418 * ios->sp2d[p][*], xor is calculated the same way. These pages are
 419 * allocated/freed and don't go through the cache.
420 */
421static int _read_4_write(struct ore_io_state *ios)
422{
423 struct ore_io_state *ios_read;
424 struct ore_striping_info read_si;
425 struct __stripe_pages_2d *sp2d = ios->sp2d;
426 u64 offset = ios->si.first_stripe_start;
427 u64 last_stripe_end;
428 unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
429 unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
430 int ret;
431
 432 if (offset == ios->offset) /* Go to start, collect $200 */
433 goto read_last_stripe;
434
435 min_p = _sp2d_min_pg(sp2d);
436 max_p = _sp2d_max_pg(sp2d);
437
438 for (c = 0; ; c++) {
439 ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
440 read_si.obj_offset += min_p * PAGE_SIZE;
441 offset += min_p * PAGE_SIZE;
442 for (p = min_p; p <= max_p; p++) {
443 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
444 struct page **pp = &_1ps->pages[c];
445 bool uptodate;
446
447 if (*pp)
448 /* to-be-written pages start here */
449 goto read_last_stripe;
450
451 *pp = ios->r4w->get_page(ios->private, offset,
452 &uptodate);
453 if (unlikely(!*pp))
454 return -ENOMEM;
455
456 if (!uptodate)
457 _add_to_read_4_write(ios, &read_si, *pp);
458
459 /* Mark read-pages to be cache_released */
460 _1ps->page_is_read[c] = true;
461 read_si.obj_offset += PAGE_SIZE;
462 offset += PAGE_SIZE;
463 }
464 offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
465 }
466
467read_last_stripe:
468 offset = ios->offset + (ios->length + PAGE_SIZE - 1) /
469 PAGE_SIZE * PAGE_SIZE;
470 last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
471 * bytes_in_stripe;
472 if (offset == last_stripe_end) /* Optimize for the aligned case */
473 goto read_it;
474
475 ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
476 p = read_si.unit_off / PAGE_SIZE;
477 c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
478 ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
479
480 BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
481 /* unaligned IO must be within a single stripe */
482
483 if (min_p == sp2d->pages_in_unit) {
484 /* Didn't do it yet */
485 min_p = _sp2d_min_pg(sp2d);
486 max_p = _sp2d_max_pg(sp2d);
487 }
488
489 while (offset < last_stripe_end) {
490 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
491
492 if ((min_p <= p) && (p <= max_p)) {
493 struct page *page;
494 bool uptodate;
495
496 BUG_ON(_1ps->pages[c]);
497 page = ios->r4w->get_page(ios->private, offset,
498 &uptodate);
499 if (unlikely(!page))
500 return -ENOMEM;
501
502 _1ps->pages[c] = page;
503 /* Mark read-pages to be cache_released */
504 _1ps->page_is_read[c] = true;
505 if (!uptodate)
506 _add_to_read_4_write(ios, &read_si, page);
507 }
508
509 offset += PAGE_SIZE;
510 if (p == (sp2d->pages_in_unit - 1)) {
511 ++c;
512 p = 0;
513 ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
514 } else {
515 read_si.obj_offset += PAGE_SIZE;
516 ++p;
517 }
518 }
519
520read_it:
521 ios_read = ios->ios_read_4_write;
522 if (!ios_read)
523 return 0;
524
 525 /* FIXME: Ugly to signal _ore_read_mirror that we have bio(s). Change
526 * to check for per_dev->bio
527 */
528 ios_read->pages = ios->pages;
529
530 /* Now read these devices */
531 for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
532 ret = _ore_read_mirror(ios_read, i);
533 if (unlikely(ret))
534 return ret;
535 }
536
 537 ret = ore_io_execute(ios_read); /* Synchronous execution */
538 if (unlikely(ret)) {
539 ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
540 return ret;
541 }
542
543 _mark_read4write_pages_uptodate(ios_read, ret);
544 return 0;
545}
546
 547/* In writes @cur_len means length left, i.e. cur_len==0 is the last parity U */
548int _ore_add_parity_unit(struct ore_io_state *ios,
549 struct ore_striping_info *si,
550 struct ore_per_dev_state *per_dev,
551 unsigned cur_len)
552{
553 if (ios->reading) {
554 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
555 _ore_add_sg_seg(per_dev, cur_len, true);
556 } else {
557 struct __stripe_pages_2d *sp2d = ios->sp2d;
558 struct page **pages = ios->parity_pages + ios->cur_par_page;
559 unsigned num_pages;
560 unsigned array_start = 0;
561 unsigned i;
562 int ret;
563
564 si->cur_pg = _sp2d_min_pg(sp2d);
565 num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
566
567 if (!cur_len) /* If last stripe operate on parity comp */
568 si->cur_comp = sp2d->data_devs;
569
570 if (!per_dev->length) {
571 per_dev->offset += si->cur_pg * PAGE_SIZE;
 572 /* If first stripe, read in all read4write pages
573 * (if needed) before we calculate the first parity.
574 */
575 _read_4_write(ios);
576 }
577
578 for (i = 0; i < num_pages; i++) {
579 pages[i] = _raid_page_alloc();
580 if (unlikely(!pages[i]))
581 return -ENOMEM;
582
583 ++(ios->cur_par_page);
584 }
585
586 BUG_ON(si->cur_comp != sp2d->data_devs);
587 BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
588
589 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
590 per_dev, num_pages * PAGE_SIZE);
591 if (unlikely(ret))
592 return ret;
593
594 /* TODO: raid6 if (last_parity_dev) */
595 _gen_xor_unit(sp2d);
596 _sp2d_reset(sp2d, ios->r4w, ios->private);
597 }
598 return 0;
599}
600
601int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
602{
603 struct ore_layout *layout = ios->layout;
604
605 if (ios->parity_pages) {
606 unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
607 unsigned stripe_size = ios->si.bytes_in_stripe;
608 u64 last_stripe, first_stripe;
609
610 if (_sp2d_alloc(pages_in_unit, layout->group_width,
611 layout->parity, &ios->sp2d)) {
612 return -ENOMEM;
613 }
614
615 BUG_ON(ios->offset % PAGE_SIZE);
616
 617 /* Round io down to the last full stripe */
618 first_stripe = div_u64(ios->offset, stripe_size);
619 last_stripe = div_u64(ios->offset + ios->length, stripe_size);
620
 621 /* If an IO spans more than a single stripe it must end at
 622 * a stripe boundary. The remainder at the end is pushed into the
623 * next IO.
624 */
625 if (last_stripe != first_stripe) {
626 ios->length = last_stripe * stripe_size - ios->offset;
627
628 BUG_ON(!ios->length);
629 ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
630 PAGE_SIZE;
 631 ios->si.length = ios->length; /* make it consistent */
632 }
633 }
634 return 0;
635}
636
637void _ore_free_raid_stuff(struct ore_io_state *ios)
638{
639 if (ios->sp2d) { /* writing and raid */
640 unsigned i;
641
642 for (i = 0; i < ios->cur_par_page; i++) {
643 struct page *page = ios->parity_pages[i];
644
645 if (page)
646 _raid_page_free(page);
647 }
648 if (ios->extra_part_alloc)
649 kfree(ios->parity_pages);
 650 /* If IO returned an error, pages might need unlocking */
651 _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
652 _sp2d_free(ios->sp2d);
653 } else {
654 /* Will only be set if raid reading && sglist is big */
655 if (ios->extra_part_alloc)
656 kfree(ios->per_dev[0].sglist);
657 }
658 if (ios->ios_read_4_write)
659 ore_put_io_state(ios->ios_read_4_write);
660}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 000000000000..2ffd2c3c6e46
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,79 @@
1/*
2 * Copyright (C) from 2011
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of the objects raid engine (ore).
6 *
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
14 */
15
16#include <scsi/osd_ore.h>
17
18#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
19
20#ifdef CONFIG_EXOFS_DEBUG
21#define ORE_DBGMSG(fmt, a...) \
22 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
23#else
24#define ORE_DBGMSG(fmt, a...) \
25 do { if (0) printk(fmt, ##a); } while (0)
26#endif
27
28/* u64 has problems with printk; this will cast it to unsigned long long */
29#define _LLU(x) (unsigned long long)(x)
30
31#define ORE_DBGMSG2(M...) do {} while (0)
32/* #define ORE_DBGMSG2 ORE_DBGMSG */
33
34/* Calculate the component order in a stripe, e.g. the logical data unit
35 * address within the stripe of @dev given the @par_dev of this stripe.
36 */
37static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
38 unsigned par_dev, unsigned dev)
39{
40 unsigned first_dev = dev - dev % devs_in_group;
41
42 dev -= first_dev;
43 par_dev -= first_dev;
44
45 if (devs_in_group == par_dev) /* The raid 0 case */
46 return dev / mirrors_p1;
47 /* raid4/5/6 case */
48 return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
49 mirrors_p1;
50}
51
52/* ore_raid.c stuff needed by ore.c */
53int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
54void _ore_free_raid_stuff(struct ore_io_state *ios);
55
56void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
57 bool not_last);
58int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
59 struct ore_per_dev_state *per_dev, unsigned cur_len);
60void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
61 struct ore_striping_info *si, struct page *page);
62static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
63 struct ore_striping_info *si, struct page *page)
64{
65 if (!sp2d) /* Inline the fast path */
66		return; /* No raid stuff here */
67 _ore_add_stripe_page(sp2d, si, page);
68}
69
70/* ore.c stuff needed by ore_raid.c */
71int _ore_get_io_state(struct ore_layout *layout,
72 struct ore_components *oc, unsigned numdevs,
73 unsigned sgs_per_dev, unsigned num_par_pages,
74 struct ore_io_state **pios);
75int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
76 unsigned pgbase, struct page **pages,
77 struct ore_per_dev_state *per_dev, int cur_len);
78int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
79int ore_io_execute(struct ore_io_state *ios);
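The modular arithmetic in _dev_order() above is easiest to follow with concrete numbers. A small userspace self-test (a copy of the inline, with illustrative values): with no mirroring and parity on device 2, the data order starts at device 3 and wraps through devices 0 and 1.

	#include <assert.h>

	/* Userspace copy of _dev_order() for a quick self-test. */
	static unsigned dev_order(unsigned devs_in_group, unsigned mirrors_p1,
				  unsigned par_dev, unsigned dev)
	{
		unsigned first_dev = dev - dev % devs_in_group;

		dev -= first_dev;
		par_dev -= first_dev;

		if (devs_in_group == par_dev)	/* raid0: no parity dev */
			return dev / mirrors_p1;
		return ((devs_in_group + dev - par_dev - mirrors_p1) %
			devs_in_group) / mirrors_p1;
	}

	int main(void)
	{
		assert(dev_order(4, 1, 2, 3) == 0); /* starts after par_dev */
		assert(dev_order(4, 1, 2, 0) == 1); /* and wraps around */
		assert(dev_order(4, 1, 2, 1) == 2);
		return 0;
	}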
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 274894053b02..057b237b8b69 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
266 struct ore_io_state *ios; 266 struct ore_io_state *ios;
267 int ret; 267 int ret;
268 268
269 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); 269 ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
270 if (unlikely(ret)) { 270 if (unlikely(ret)) {
271 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); 271 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
272 return ret; 272 return ret;
@@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
321 struct ore_io_state *ios; 321 struct ore_io_state *ios;
322 int ret; 322 int ret;
323 323
324 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); 324 ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
325 if (unlikely(ret)) { 325 if (unlikely(ret)) {
326 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); 326 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
327 return ret; 327 return ret;
@@ -355,12 +355,12 @@ static const struct export_operations exofs_export_ops;
355/* 355/*
356 * Write the superblock to the OSD 356 * Write the superblock to the OSD
357 */ 357 */
358int exofs_sync_fs(struct super_block *sb, int wait) 358static int exofs_sync_fs(struct super_block *sb, int wait)
359{ 359{
360 struct exofs_sb_info *sbi; 360 struct exofs_sb_info *sbi;
361 struct exofs_fscb *fscb; 361 struct exofs_fscb *fscb;
362 struct ore_comp one_comp; 362 struct ore_comp one_comp;
363 struct ore_components comps; 363 struct ore_components oc;
364 struct ore_io_state *ios; 364 struct ore_io_state *ios;
365 int ret = -ENOMEM; 365 int ret = -ENOMEM;
366 366
@@ -378,9 +378,9 @@ int exofs_sync_fs(struct super_block *sb, int wait)
378 * the writeable info is set in exofs_sbi_write_stats() above. 378 * the writeable info is set in exofs_sbi_write_stats() above.
379 */ 379 */
380 380
381 exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); 381 exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
382 382
383 ret = ore_get_io_state(&sbi->layout, &comps, &ios); 383 ret = ore_get_io_state(&sbi->layout, &oc, &ios);
384 if (unlikely(ret)) 384 if (unlikely(ret))
385 goto out; 385 goto out;
386 386
@@ -429,19 +429,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
429 msg, dev_path ?: "", odi->osdname, _LLU(pid)); 429 msg, dev_path ?: "", odi->osdname, _LLU(pid));
430} 430}
431 431
432void exofs_free_sbi(struct exofs_sb_info *sbi) 432static void exofs_free_sbi(struct exofs_sb_info *sbi)
433{ 433{
434 while (sbi->comps.numdevs) { 434 unsigned numdevs = sbi->oc.numdevs;
435 int i = --sbi->comps.numdevs; 435
436 struct osd_dev *od = sbi->comps.ods[i]; 436 while (numdevs) {
437 unsigned i = --numdevs;
438 struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
437 439
438 if (od) { 440 if (od) {
439 sbi->comps.ods[i] = NULL; 441 ore_comp_set_dev(&sbi->oc, i, NULL);
440 osduld_put_device(od); 442 osduld_put_device(od);
441 } 443 }
442 } 444 }
443 if (sbi->comps.ods != sbi->_min_one_dev) 445 kfree(sbi->oc.ods);
444 kfree(sbi->comps.ods);
445 kfree(sbi); 446 kfree(sbi);
446} 447}
447 448
@@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb)
468 msecs_to_jiffies(100)); 469 msecs_to_jiffies(100));
469 } 470 }
470 471
471 _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], 472 _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
472 sbi->one_comp.obj.partition); 473 sbi->one_comp.obj.partition);
473 474
474 bdi_destroy(&sbi->bdi); 475 bdi_destroy(&sbi->bdi);
@@ -479,76 +480,20 @@ static void exofs_put_super(struct super_block *sb)
479static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, 480static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
480 struct exofs_device_table *dt) 481 struct exofs_device_table *dt)
481{ 482{
482 u64 stripe_length; 483 int ret;
483 484
484 sbi->data_map.odm_num_comps = 485 sbi->layout.stripe_unit =
485 le32_to_cpu(dt->dt_data_map.cb_num_comps);
486 sbi->data_map.odm_stripe_unit =
487 le64_to_cpu(dt->dt_data_map.cb_stripe_unit); 486 le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
488 sbi->data_map.odm_group_width = 487 sbi->layout.group_width =
489 le32_to_cpu(dt->dt_data_map.cb_group_width); 488 le32_to_cpu(dt->dt_data_map.cb_group_width);
490 sbi->data_map.odm_group_depth = 489 sbi->layout.group_depth =
491 le32_to_cpu(dt->dt_data_map.cb_group_depth); 490 le32_to_cpu(dt->dt_data_map.cb_group_depth);
492 sbi->data_map.odm_mirror_cnt = 491 sbi->layout.mirrors_p1 =
493 le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); 492 le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
494 sbi->data_map.odm_raid_algorithm = 493 sbi->layout.raid_algorithm =
495 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); 494 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
496 495
497/* FIXME: Only raid0 for now. if not so, do not mount */ 496 ret = ore_verify_layout(numdevs, &sbi->layout);
498 if (sbi->data_map.odm_num_comps != numdevs) {
499 EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
500 sbi->data_map.odm_num_comps, numdevs);
501 return -EINVAL;
502 }
503 if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
504 EXOFS_ERR("Only RAID_0 for now\n");
505 return -EINVAL;
506 }
507 if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
508 EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
509 numdevs, sbi->data_map.odm_mirror_cnt);
510 return -EINVAL;
511 }
512
513 if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
514 EXOFS_ERR("Stripe Unit(0x%llx)"
515 " must be Multples of PAGE_SIZE(0x%lx)\n",
516 _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
517 return -EINVAL;
518 }
519
520 sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
521 sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
522
523 if (sbi->data_map.odm_group_width) {
524 sbi->layout.group_width = sbi->data_map.odm_group_width;
525 sbi->layout.group_depth = sbi->data_map.odm_group_depth;
526 if (!sbi->layout.group_depth) {
527 EXOFS_ERR("group_depth == 0 && group_width != 0\n");
528 return -EINVAL;
529 }
530 sbi->layout.group_count = sbi->data_map.odm_num_comps /
531 sbi->layout.mirrors_p1 /
532 sbi->data_map.odm_group_width;
533 } else {
534 if (sbi->data_map.odm_group_depth) {
535 printk(KERN_NOTICE "Warning: group_depth ignored "
536 "group_width == 0 && group_depth == %d\n",
537 sbi->data_map.odm_group_depth);
538 sbi->data_map.odm_group_depth = 0;
539 }
540 sbi->layout.group_width = sbi->data_map.odm_num_comps /
541 sbi->layout.mirrors_p1;
542 sbi->layout.group_depth = -1;
543 sbi->layout.group_count = 1;
544 }
545
546 stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
547 if (stripe_length >= (1ULL << 32)) {
548 EXOFS_ERR("Total Stripe length(0x%llx)"
549 " >= 32bit is not supported\n", _LLU(stripe_length));
550 return -EINVAL;
551 }
552 497
553 EXOFS_DBGMSG("exofs: layout: " 498 EXOFS_DBGMSG("exofs: layout: "
554 "num_comps=%u stripe_unit=0x%x group_width=%u " 499 "num_comps=%u stripe_unit=0x%x group_width=%u "
@@ -558,8 +503,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
558 sbi->layout.group_width, 503 sbi->layout.group_width,
559 _LLU(sbi->layout.group_depth), 504 _LLU(sbi->layout.group_depth),
560 sbi->layout.mirrors_p1, 505 sbi->layout.mirrors_p1,
561 sbi->data_map.odm_raid_algorithm); 506 sbi->layout.raid_algorithm);
562 return 0; 507 return ret;
563} 508}
564 509
565static unsigned __ra_pages(struct ore_layout *layout) 510static unsigned __ra_pages(struct ore_layout *layout)
@@ -605,12 +550,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
605 return !(odi->systemid_len || odi->osdname_len); 550 return !(odi->systemid_len || odi->osdname_len);
606} 551}
607 552
553int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
554 struct exofs_dev **peds)
555{
556 struct __alloc_ore_devs_and_exofs_devs {
557	/* Twice-as-big table: see exofs_init_comps() and the comment at
558 * exofs_read_lookup_dev_table()
559 */
560 struct ore_dev *oreds[numdevs * 2 - 1];
561 struct exofs_dev eds[numdevs];
562 } *aoded;
563 struct exofs_dev *eds;
564 unsigned i;
565
566 aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
567 if (unlikely(!aoded)) {
568 EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
569 numdevs);
570 return -ENOMEM;
571 }
572
573 sbi->oc.ods = aoded->oreds;
574 *peds = eds = aoded->eds;
575 for (i = 0; i < numdevs; ++i)
576 aoded->oreds[i] = &eds[i].ored;
577 return 0;
578}
579
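__alloc_dev_table() above folds the ore_dev pointer table (sized numdevs * 2 - 1 for the round-robin view) and the exofs_dev objects into a single kzalloc() by declaring a local struct whose array members are sized at run time, a GCC variable-length-array-in-struct extension. A userspace sketch of the same one-allocation pattern, with stand-in types rather than the kernel ones:

	#include <stdlib.h>

	/* Stand-ins for the kernel types, just so the sketch compiles. */
	struct ore_dev { void *od; };
	struct exofs_dev { struct ore_dev ored; unsigned did; };

	static int alloc_dev_table(unsigned numdevs, struct ore_dev ***poreds,
				   struct exofs_dev **peds)
	{
		struct combined {		/* one block, two arrays */
			struct ore_dev *oreds[numdevs * 2 - 1];
			struct exofs_dev eds[numdevs];
		} *c = calloc(1, sizeof(*c));
		unsigned i;

		if (!c)
			return -1;
		for (i = 0; i < numdevs; i++)	/* table points into eds[] */
			c->oreds[i] = &c->eds[i].ored;
		*poreds = c->oreds;
		*peds = c->eds;
		return 0;
	}

Because the pointer table is the first member, freeing the returned table pointer releases both arrays at once, which is exactly why exofs_free_sbi() above can get away with a single kfree(sbi->oc.ods).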
608static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, 580static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
609 struct osd_dev *fscb_od, 581 struct osd_dev *fscb_od,
610 unsigned table_count) 582 unsigned table_count)
611{ 583{
612 struct ore_comp comp; 584 struct ore_comp comp;
613 struct exofs_device_table *dt; 585 struct exofs_device_table *dt;
586 struct exofs_dev *eds;
614 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + 587 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
615 sizeof(*dt); 588 sizeof(*dt);
616 unsigned numdevs, i; 589 unsigned numdevs, i;
@@ -623,7 +596,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
623 return -ENOMEM; 596 return -ENOMEM;
624 } 597 }
625 598
626 sbi->comps.numdevs = 0; 599 sbi->oc.numdevs = 0;
627 600
628 comp.obj.partition = sbi->one_comp.obj.partition; 601 comp.obj.partition = sbi->one_comp.obj.partition;
629 comp.obj.id = EXOFS_DEVTABLE_ID; 602 comp.obj.id = EXOFS_DEVTABLE_ID;
@@ -647,20 +620,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
647 if (unlikely(ret)) 620 if (unlikely(ret))
648 goto out; 621 goto out;
649 622
650 if (likely(numdevs > 1)) { 623 ret = __alloc_dev_table(sbi, numdevs, &eds);
651 unsigned size = numdevs * sizeof(sbi->comps.ods[0]); 624 if (unlikely(ret))
652 625 goto out;
653 /* Twice bigger table: See exofs_init_comps() and below 626 /* exofs round-robins the device table view according to inode
654 * comment 627 * number. We hold a twice-as-big table, hence inodes can point
655 */ 628 * to any device and have a sequential view of the table
656 sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); 629 * starting at this device. See exofs_init_comps()
657 if (unlikely(!sbi->comps.ods)) { 630 */
658 EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", 631 memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
659 numdevs); 632 (numdevs - 1) * sizeof(sbi->oc.ods[0]));
660 ret = -ENOMEM;
661 goto out;
662 }
663 }
664 633
665 for (i = 0; i < numdevs; i++) { 634 for (i = 0; i < numdevs; i++) {
666 struct exofs_fscb fscb; 635 struct exofs_fscb fscb;
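The memcpy above is what the doubled allocation is for: slots [numdevs .. 2*numdevs-2] repeat slots [0 .. numdevs-2], so a component view may start at any device index and still read numdevs consecutive entries without wrapping. Schematically (an illustrative userspace helper, not the kernel code):

	struct ore_dev;		/* opaque here; the real type is in osd_ore.h */

	/* With numdevs == 4 the doubled table holds A B C D A B C: an inode
	 * whose striping starts at device 2 uses &ods[2] and sees C D A B as
	 * one contiguous window - no per-access modulo arithmetic needed. */
	static struct ore_dev **dev_view(struct ore_dev **ods, unsigned numdevs,
					 unsigned long ino)
	{
		return &ods[ino % numdevs];
	}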
@@ -676,13 +645,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
676 printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", 645 printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
677 i, odi.osdname); 646 i, odi.osdname);
678 647
648 /* the exofs id is currently the table index */
649 eds[i].did = i;
650
679 /* On all devices the device table is identical. The user can 651 /* On all devices the device table is identical. The user can
680 * specify any one of the participating devices on the command 652 * specify any one of the participating devices on the command
681 * line. We always keep them in device-table order. 653 * line. We always keep them in device-table order.
682 */ 654 */
683 if (fscb_od && osduld_device_same(fscb_od, &odi)) { 655 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
684 sbi->comps.ods[i] = fscb_od; 656 eds[i].ored.od = fscb_od;
685 ++sbi->comps.numdevs; 657 ++sbi->oc.numdevs;
686 fscb_od = NULL; 658 fscb_od = NULL;
687 continue; 659 continue;
688 } 660 }
@@ -695,8 +667,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
695 goto out; 667 goto out;
696 } 668 }
697 669
698 sbi->comps.ods[i] = od; 670 eds[i].ored.od = od;
699 ++sbi->comps.numdevs; 671 ++sbi->oc.numdevs;
700 672
701 /* Read the fscb of the other devices to make sure the FS 673 /* Read the fscb of the other devices to make sure the FS
702 * partition is there. 674 * partition is there.
@@ -718,21 +690,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
718 690
719out: 691out:
720 kfree(dt); 692 kfree(dt);
721 if (likely(!ret)) { 693 if (unlikely(fscb_od && !ret)) {
722 unsigned numdevs = sbi->comps.numdevs;
723
724 if (unlikely(fscb_od)) {
725 EXOFS_ERR("ERROR: Bad device-table container device not present\n"); 694 EXOFS_ERR("ERROR: Bad device-table container device not present\n");
726 osduld_put_device(fscb_od); 695 osduld_put_device(fscb_od);
727 return -EINVAL; 696 return -EINVAL;
728 }
729 /* exofs round-robins the device table view according to inode
730 * number. We hold a: twice bigger table hence inodes can point
731 * to any device and have a sequential view of the table
732 * starting at this device. See exofs_init_comps()
733 */
734 for (i = 0; i < numdevs - 1; ++i)
735 sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
736 } 697 }
737 return ret; 698 return ret;
738} 699}
@@ -783,10 +744,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
783 sbi->one_comp.obj.partition = opts->pid; 744 sbi->one_comp.obj.partition = opts->pid;
784 sbi->one_comp.obj.id = 0; 745 sbi->one_comp.obj.id = 0;
785 exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); 746 exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
786 sbi->comps.numdevs = 1; 747 sbi->oc.numdevs = 1;
787 sbi->comps.single_comp = EC_SINGLE_COMP; 748 sbi->oc.single_comp = EC_SINGLE_COMP;
788 sbi->comps.comps = &sbi->one_comp; 749 sbi->oc.comps = &sbi->one_comp;
789 sbi->comps.ods = sbi->_min_one_dev;
790 750
791 /* fill in some other data by hand */ 751 /* fill in some other data by hand */
792 memset(sb->s_id, 0, sizeof(sb->s_id)); 752 memset(sb->s_id, 0, sizeof(sb->s_id));
@@ -835,7 +795,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
835 if (unlikely(ret)) 795 if (unlikely(ret))
836 goto free_sbi; 796 goto free_sbi;
837 } else { 797 } else {
838 sbi->comps.ods[0] = od; 798 struct exofs_dev *eds;
799
800 ret = __alloc_dev_table(sbi, 1, &eds);
801 if (unlikely(ret))
802 goto free_sbi;
803
804 ore_comp_set_dev(&sbi->oc, 0, od);
839 } 805 }
840 806
841 __sbi_read_stats(sbi); 807 __sbi_read_stats(sbi);
@@ -875,7 +841,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
875 goto free_sbi; 841 goto free_sbi;
876 } 842 }
877 843
878 _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], 844 _exofs_print_device("Mounting", opts->dev_name,
845 ore_comp_dev(&sbi->oc, 0),
879 sbi->one_comp.obj.partition); 846 sbi->one_comp.obj.partition);
880 return 0; 847 return 0;
881 848
@@ -924,7 +891,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
924 uint64_t used = ULLONG_MAX; 891 uint64_t used = ULLONG_MAX;
925 int ret; 892 int ret;
926 893
927 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); 894 ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
928 if (ret) { 895 if (ret) {
929 EXOFS_DBGMSG("ore_get_io_state failed.\n"); 896 EXOFS_DBGMSG("ore_get_io_state failed.\n");
930 return ret; 897 return ret;
@@ -981,7 +948,7 @@ static const struct super_operations exofs_sops = {
981 * EXPORT OPERATIONS 948 * EXPORT OPERATIONS
982 *****************************************************************************/ 949 *****************************************************************************/
983 950
984struct dentry *exofs_get_parent(struct dentry *child) 951static struct dentry *exofs_get_parent(struct dentry *child)
985{ 952{
986 unsigned long ino = exofs_parent_ino(child); 953 unsigned long ino = exofs_parent_ino(child);
987 954
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 5d979b4347b0..c922adc8ef41 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -46,28 +46,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
46 value, size, flags); 46 value, size, flags);
47} 47}
48 48
49int 49int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
50ext2_init_security(struct inode *inode, struct inode *dir, 50 void *fs_info)
51 const struct qstr *qstr)
52{ 51{
53 int err; 52 const struct xattr *xattr;
54 size_t len; 53 int err = 0;
55 void *value;
56 char *name;
57 54
58 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); 55 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
59 if (err) { 56 err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
60 if (err == -EOPNOTSUPP) 57 xattr->name, xattr->value,
61 return 0; 58 xattr->value_len, 0);
62 return err; 59 if (err < 0)
60 break;
63 } 61 }
64 err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
65 name, value, len, 0);
66 kfree(name);
67 kfree(value);
68 return err; 62 return err;
69} 63}
70 64
65int
66ext2_init_security(struct inode *inode, struct inode *dir,
67 const struct qstr *qstr)
68{
69 return security_inode_init_security(inode, dir, qstr,
70 &ext2_initxattrs, NULL);
71}
72
71const struct xattr_handler ext2_xattr_security_handler = { 73const struct xattr_handler ext2_xattr_security_handler = {
72 .prefix = XATTR_SECURITY_PREFIX, 74 .prefix = XATTR_SECURITY_PREFIX,
73 .list = ext2_xattr_security_list, 75 .list = ext2_xattr_security_list,
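This ext2 conversion (and the matching ext3/ext4 hunks below) inverts the old flow: instead of asking the LSM for a single (name, value) pair, the filesystem passes a callback and security_inode_init_security() invokes it with a NULL-terminated array of xattrs plus an opaque fs_info cookie. The callback skeleton looks like this, a hedged sketch in which a hypothetical myfs_set_xattr() stands in for ext2_xattr_set():

	#include <stddef.h>

	struct xattr {			/* shape of the array elements */
		const char *name;
		void *value;
		size_t value_len;
	};

	/* Hypothetical setter: a real fs writes the attribute into its
	 * on-disk inode here, as ext2_xattr_set() does above. */
	static int myfs_set_xattr(void *inode, const char *name,
				  const void *value, size_t len)
	{
		(void)inode; (void)name; (void)value; (void)len;
		return 0;		/* stub: pretend the write worked */
	}

	/* Walk the array to the NULL-name terminator; stop on the first
	 * error. fs_info carries filesystem context (ext3/ext4 pass the
	 * journal handle through it). */
	static int myfs_initxattrs(void *inode, const struct xattr *xattr_array,
				   void *fs_info)
	{
		const struct xattr *xattr;
		int err = 0;

		(void)fs_info;
		for (xattr = xattr_array; xattr->name != NULL; xattr++) {
			err = myfs_set_xattr(inode, xattr->name, xattr->value,
					     xattr->value_len);
			if (err < 0)
				break;
		}
		return err;
	}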
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 04da6acde85d..12661e1deedd 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1134,7 +1134,7 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1134 return bh; 1134 return bh;
1135 if (buffer_uptodate(bh)) 1135 if (buffer_uptodate(bh))
1136 return bh; 1136 return bh;
1137 ll_rw_block(READ_META, 1, &bh); 1137 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
1138 wait_on_buffer(bh); 1138 wait_on_buffer(bh);
1139 if (buffer_uptodate(bh)) 1139 if (buffer_uptodate(bh))
1140 return bh; 1140 return bh;
@@ -2807,7 +2807,7 @@ make_io:
2807 trace_ext3_load_inode(inode); 2807 trace_ext3_load_inode(inode);
2808 get_bh(bh); 2808 get_bh(bh);
2809 bh->b_end_io = end_buffer_read_sync; 2809 bh->b_end_io = end_buffer_read_sync;
2810 submit_bh(READ_META, bh); 2810 submit_bh(READ | REQ_META | REQ_PRIO, bh);
2811 wait_on_buffer(bh); 2811 wait_on_buffer(bh);
2812 if (!buffer_uptodate(bh)) { 2812 if (!buffer_uptodate(bh)) {
2813 ext3_error(inode->i_sb, "ext3_get_inode_loc", 2813 ext3_error(inode->i_sb, "ext3_get_inode_loc",
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6e18a0b7750d..0629e09f6511 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -922,7 +922,8 @@ restart:
922 bh = ext3_getblk(NULL, dir, b++, 0, &err); 922 bh = ext3_getblk(NULL, dir, b++, 0, &err);
923 bh_use[ra_max] = bh; 923 bh_use[ra_max] = bh;
924 if (bh) 924 if (bh)
925 ll_rw_block(READ_META, 1, &bh); 925 ll_rw_block(READ | REQ_META | REQ_PRIO,
926 1, &bh);
926 } 927 }
927 } 928 }
928 if ((bh = bh_use[ra_ptr++]) == NULL) 929 if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -2209,9 +2210,11 @@ static int ext3_symlink (struct inode * dir,
2209 /* 2210 /*
2210 * For non-fast symlinks, we just allocate inode and put it on 2211 * For non-fast symlinks, we just allocate inode and put it on
2211 * orphan list in the first transaction => we need bitmap, 2212 * orphan list in the first transaction => we need bitmap,
2212 * group descriptor, sb, inode block, quota blocks. 2213 * group descriptor, sb, inode block, quota blocks, and
2214 * possibly selinux xattr blocks.
2213 */ 2215 */
2214 credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); 2216 credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2217 EXT3_XATTR_TRANS_BLOCKS;
2215 } else { 2218 } else {
2216 /* 2219 /*
2217 * Fast symlink. We have to add entry to directory 2220 * Fast symlink. We have to add entry to directory
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index b8d9f83aa5c5..3c218b8a51d4 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -48,28 +48,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
48 name, value, size, flags); 48 name, value, size, flags);
49} 49}
50 50
51int 51int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array,
52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, 52 void *fs_info)
53 const struct qstr *qstr)
54{ 53{
55 int err; 54 const struct xattr *xattr;
56 size_t len; 55 handle_t *handle = fs_info;
57 void *value; 56 int err = 0;
58 char *name;
59 57
60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); 58 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
61 if (err) { 59 err = ext3_xattr_set_handle(handle, inode,
62 if (err == -EOPNOTSUPP) 60 EXT3_XATTR_INDEX_SECURITY,
63 return 0; 61 xattr->name, xattr->value,
64 return err; 62 xattr->value_len, 0);
63 if (err < 0)
64 break;
65 } 65 }
66 err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
67 name, value, len, 0);
68 kfree(name);
69 kfree(value);
70 return err; 66 return err;
71} 67}
72 68
69int
70ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
71 const struct qstr *qstr)
72{
73 return security_inode_init_security(inode, dir, qstr,
74 &ext3_initxattrs, handle);
75}
76
73const struct xattr_handler ext3_xattr_security_handler = { 77const struct xattr_handler ext3_xattr_security_handler = {
74 .prefix = XATTR_SECURITY_PREFIX, 78 .prefix = XATTR_SECURITY_PREFIX,
75 .list = ext3_xattr_security_list, 79 .list = ext3_xattr_security_list,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e717dfd2f2b4..b7d7bd0f066e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -175,6 +175,7 @@ struct mpage_da_data {
175 */ 175 */
176#define EXT4_IO_END_UNWRITTEN 0x0001 176#define EXT4_IO_END_UNWRITTEN 0x0001
177#define EXT4_IO_END_ERROR 0x0002 177#define EXT4_IO_END_ERROR 0x0002
178#define EXT4_IO_END_QUEUED 0x0004
178 179
179struct ext4_io_page { 180struct ext4_io_page {
180 struct page *p_page; 181 struct page *p_page;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index bb85757689b6..5802fa1dab18 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode)
289 289
290static inline int ext4_should_writeback_data(struct inode *inode) 290static inline int ext4_should_writeback_data(struct inode *inode)
291{ 291{
292 if (!S_ISREG(inode->i_mode))
293 return 0;
294 if (EXT4_JOURNAL(inode) == NULL) 292 if (EXT4_JOURNAL(inode) == NULL)
295 return 1; 293 return 1;
294 if (!S_ISREG(inode->i_mode))
295 return 0;
296 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) 296 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
297 return 0; 297 return 0;
298 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 298 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e4095e988eba..b9548f477bb8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -224,53 +224,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
224 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; 224 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
225 else 225 else
226 maxbytes = inode->i_sb->s_maxbytes; 226 maxbytes = inode->i_sb->s_maxbytes;
227 mutex_lock(&inode->i_mutex);
228 switch (origin) {
229 case SEEK_END:
230 offset += inode->i_size;
231 break;
232 case SEEK_CUR:
233 if (offset == 0) {
234 mutex_unlock(&inode->i_mutex);
235 return file->f_pos;
236 }
237 offset += file->f_pos;
238 break;
239 case SEEK_DATA:
240 /*
241 * In the generic case the entire file is data, so as long as
242 * offset isn't at the end of the file then the offset is data.
243 */
244 if (offset >= inode->i_size) {
245 mutex_unlock(&inode->i_mutex);
246 return -ENXIO;
247 }
248 break;
249 case SEEK_HOLE:
250 /*
251 * There is a virtual hole at the end of the file, so as long as
252 * offset isn't i_size or larger, return i_size.
253 */
254 if (offset >= inode->i_size) {
255 mutex_unlock(&inode->i_mutex);
256 return -ENXIO;
257 }
258 offset = inode->i_size;
259 break;
260 }
261
262 if (offset < 0 || offset > maxbytes) {
263 mutex_unlock(&inode->i_mutex);
264 return -EINVAL;
265 }
266
267 if (offset != file->f_pos) {
268 file->f_pos = offset;
269 file->f_version = 0;
270 }
271 mutex_unlock(&inode->i_mutex);
272 227
273 return offset; 228 return generic_file_llseek_size(file, offset, origin, maxbytes);
274} 229}
275 230
276const struct file_operations ext4_file_operations = { 231const struct file_operations ext4_file_operations = {
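generic_file_llseek_size() encapsulates exactly the open-coded switch that was deleted, with maxbytes supplied by the caller. Its essential semantics, modeled in userspace (an illustration of the behavior, not the VFS source): without consulting an extent map, every offset below i_size is data and the only hole is the virtual one at EOF.

	#include <errno.h>

	enum { XSEEK_SET, XSEEK_CUR, XSEEK_END, XSEEK_DATA, XSEEK_HOLE };

	static long long llseek_size(long long pos, long long offset, int origin,
				     long long i_size, long long maxbytes)
	{
		switch (origin) {
		case XSEEK_END:
			offset += i_size;
			break;
		case XSEEK_CUR:
			offset += pos;	/* offset 0 just reports f_pos */
			break;
		case XSEEK_DATA:
			if (offset >= i_size)
				return -ENXIO;	/* no data at or past EOF */
			break;		/* anything before EOF is data */
		case XSEEK_HOLE:
			if (offset >= i_size)
				return -ENXIO;
			offset = i_size;	/* the virtual hole at EOF */
			break;
		}
		if (offset < 0 || offset > maxbytes)
			return -EINVAL;
		return offset;		/* caller updates the file position */
	}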
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8602cde5b5a..0962642119c0 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -800,12 +800,17 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
800 } 800 }
801 801
802retry: 802retry:
803 if (rw == READ && ext4_should_dioread_nolock(inode)) 803 if (rw == READ && ext4_should_dioread_nolock(inode)) {
804 if (unlikely(!list_empty(&ei->i_completed_io_list))) {
805 mutex_lock(&inode->i_mutex);
806 ext4_flush_completed_IO(inode);
807 mutex_unlock(&inode->i_mutex);
808 }
804 ret = __blockdev_direct_IO(rw, iocb, inode, 809 ret = __blockdev_direct_IO(rw, iocb, inode,
805 inode->i_sb->s_bdev, iov, 810 inode->i_sb->s_bdev, iov,
806 offset, nr_segs, 811 offset, nr_segs,
807 ext4_get_block, NULL, NULL, 0); 812 ext4_get_block, NULL, NULL, 0);
808 else { 813 } else {
809 ret = blockdev_direct_IO(rw, iocb, inode, iov, 814 ret = blockdev_direct_IO(rw, iocb, inode, iov,
810 offset, nr_segs, ext4_get_block); 815 offset, nr_segs, ext4_get_block);
811 816
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d47264cafee0..986e2388f031 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -120,6 +120,9 @@ void ext4_evict_inode(struct inode *inode)
120 int err; 120 int err;
121 121
122 trace_ext4_evict_inode(inode); 122 trace_ext4_evict_inode(inode);
123
124 ext4_ioend_wait(inode);
125
123 if (inode->i_nlink) { 126 if (inode->i_nlink) {
124 /* 127 /*
125 * When journalling data dirty buffers are tracked only in the 128 * When journalling data dirty buffers are tracked only in the
@@ -644,7 +647,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
644 return bh; 647 return bh;
645 if (buffer_uptodate(bh)) 648 if (buffer_uptodate(bh))
646 return bh; 649 return bh;
647 ll_rw_block(READ_META, 1, &bh); 650 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
648 wait_on_buffer(bh); 651 wait_on_buffer(bh);
649 if (buffer_uptodate(bh)) 652 if (buffer_uptodate(bh))
650 return bh; 653 return bh;
@@ -983,6 +986,8 @@ static int ext4_journalled_write_end(struct file *file,
983 from = pos & (PAGE_CACHE_SIZE - 1); 986 from = pos & (PAGE_CACHE_SIZE - 1);
984 to = from + len; 987 to = from + len;
985 988
989 BUG_ON(!ext4_handle_valid(handle));
990
986 if (copied < len) { 991 if (copied < len) {
987 if (!PageUptodate(page)) 992 if (!PageUptodate(page))
988 copied = 0; 993 copied = 0;
@@ -1283,7 +1288,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1283 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) 1288 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
1284 err = ext4_bio_write_page(&io_submit, page, 1289 err = ext4_bio_write_page(&io_submit, page,
1285 len, mpd->wbc); 1290 len, mpd->wbc);
1286 else 1291 else if (buffer_uninit(page_bufs)) {
1292 ext4_set_bh_endio(page_bufs, inode);
1293 err = block_write_full_page_endio(page,
1294 noalloc_get_block_write,
1295 mpd->wbc, ext4_end_io_buffer_write);
1296 } else
1287 err = block_write_full_page(page, 1297 err = block_write_full_page(page,
1288 noalloc_get_block_write, mpd->wbc); 1298 noalloc_get_block_write, mpd->wbc);
1289 1299
@@ -1699,6 +1709,8 @@ static int __ext4_journalled_writepage(struct page *page,
1699 goto out; 1709 goto out;
1700 } 1710 }
1701 1711
1712 BUG_ON(!ext4_handle_valid(handle));
1713
1702 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1714 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
1703 do_journal_get_write_access); 1715 do_journal_get_write_access);
1704 1716
@@ -2668,8 +2680,15 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2668 goto out; 2680 goto out;
2669 } 2681 }
2670 2682
2671 io_end->flag = EXT4_IO_END_UNWRITTEN; 2683 /*
2684 * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
2685 * but being more careful is always safe for future changes.
2686 */
2672 inode = io_end->inode; 2687 inode = io_end->inode;
2688 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
2689 io_end->flag |= EXT4_IO_END_UNWRITTEN;
2690 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
2691 }
2673 2692
2674 /* Add the io_end to per-inode completed io list*/ 2693 /* Add the io_end to per-inode completed io list*/
2675 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 2694 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -3279,7 +3298,7 @@ make_io:
3279 trace_ext4_load_inode(inode); 3298 trace_ext4_load_inode(inode);
3280 get_bh(bh); 3299 get_bh(bh);
3281 bh->b_end_io = end_buffer_read_sync; 3300 bh->b_end_io = end_buffer_read_sync;
3282 submit_bh(READ_META, bh); 3301 submit_bh(READ | REQ_META | REQ_PRIO, bh);
3283 wait_on_buffer(bh); 3302 wait_on_buffer(bh);
3284 if (!buffer_uptodate(bh)) { 3303 if (!buffer_uptodate(bh)) {
3285 EXT4_ERROR_INODE_BLOCK(inode, block, 3304 EXT4_ERROR_INODE_BLOCK(inode, block,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 565a154e22d4..1c924faeb6c8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -922,7 +922,8 @@ restart:
922 bh = ext4_getblk(NULL, dir, b++, 0, &err); 922 bh = ext4_getblk(NULL, dir, b++, 0, &err);
923 bh_use[ra_max] = bh; 923 bh_use[ra_max] = bh;
924 if (bh) 924 if (bh)
925 ll_rw_block(READ_META, 1, &bh); 925 ll_rw_block(READ | REQ_META | REQ_PRIO,
926 1, &bh);
926 } 927 }
927 } 928 }
928 if ((bh = bh_use[ra_ptr++]) == NULL) 929 if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -2253,9 +2254,11 @@ static int ext4_symlink(struct inode *dir,
2253 /* 2254 /*
2254 * For non-fast symlinks, we just allocate inode and put it on 2255 * For non-fast symlinks, we just allocate inode and put it on
2255 * orphan list in the first transaction => we need bitmap, 2256 * orphan list in the first transaction => we need bitmap,
2256 * group descriptor, sb, inode block, quota blocks. 2257 * group descriptor, sb, inode block, quota blocks, and
2258 * possibly selinux xattr blocks.
2257 */ 2259 */
2258 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); 2260 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2261 EXT4_XATTR_TRANS_BLOCKS;
2259 } else { 2262 } else {
2260 /* 2263 /*
2261 * Fast symlink. We have to add entry to directory 2264 * Fast symlink. We have to add entry to directory
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 430c401d0895..92f38ee13f8a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -142,7 +142,23 @@ static void ext4_end_io_work(struct work_struct *work)
142 unsigned long flags; 142 unsigned long flags;
143 int ret; 143 int ret;
144 144
145 mutex_lock(&inode->i_mutex); 145 if (!mutex_trylock(&inode->i_mutex)) {
146 /*
147 * Requeue the work instead of waiting so that the work
148 * items queued after this can be processed.
149 */
150 queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
151 /*
152	 * To prevent the ext4-dio-unwritten thread from repeatedly
153	 * requeueing end_io requests and occupying the cpu for too long,
154 * yield the cpu if it sees an end_io request that has already
155 * been requeued.
156 */
157 if (io->flag & EXT4_IO_END_QUEUED)
158 yield();
159 io->flag |= EXT4_IO_END_QUEUED;
160 return;
161 }
146 ret = ext4_end_io_nolock(io); 162 ret = ext4_end_io_nolock(io);
147 if (ret < 0) { 163 if (ret < 0) {
148 mutex_unlock(&inode->i_mutex); 164 mutex_unlock(&inode->i_mutex);
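The shape of the fix above is a general one for work items that need a mutex the submitter might hold: never block the worker thread, requeue instead, and yield once an item has already been bounced so the requeue loop cannot monopolize a CPU. In outline (a userspace pthreads sketch of the pattern, not the ext4 code):

	#include <pthread.h>
	#include <sched.h>
	#include <stdbool.h>

	struct work_item { bool requeued; };

	/* Returns true when the item completed; false means "put it back
	 * on the queue and try again later". */
	static bool try_complete(struct work_item *w, pthread_mutex_t *lock)
	{
		if (pthread_mutex_trylock(lock) != 0) {
			if (w->requeued)
				sched_yield();	/* second bounce: back off */
			w->requeued = true;
			return false;
		}
		/* ... the real completion work happens here, under the lock ... */
		pthread_mutex_unlock(lock);
		return true;
	}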
@@ -334,8 +350,10 @@ submit_and_retry:
334 if ((io_end->num_io_pages >= MAX_IO_PAGES) && 350 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
335 (io_end->pages[io_end->num_io_pages-1] != io_page)) 351 (io_end->pages[io_end->num_io_pages-1] != io_page))
336 goto submit_and_retry; 352 goto submit_and_retry;
337 if (buffer_uninit(bh)) 353 if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
338 io->io_end->flag |= EXT4_IO_END_UNWRITTEN; 354 io_end->flag |= EXT4_IO_END_UNWRITTEN;
355 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
356 }
339 io->io_end->size += bh->b_size; 357 io->io_end->size += bh->b_size;
340 io->io_next_block++; 358 io->io_next_block++;
341 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 359 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4687fea0c00f..44d0c8db2239 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -919,7 +919,6 @@ static void ext4_i_callback(struct rcu_head *head)
919 919
920static void ext4_destroy_inode(struct inode *inode) 920static void ext4_destroy_inode(struct inode *inode)
921{ 921{
922 ext4_ioend_wait(inode);
923 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 922 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
924 ext4_msg(inode->i_sb, KERN_ERR, 923 ext4_msg(inode->i_sb, KERN_ERR,
925 "Inode %lu (%p): orphan list check failed!", 924 "Inode %lu (%p): orphan list check failed!",
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 007c3bfbf094..34e4350dd4d9 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -48,28 +48,32 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
48 name, value, size, flags); 48 name, value, size, flags);
49} 49}
50 50
51int 51int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, 52 void *fs_info)
53 const struct qstr *qstr)
54{ 53{
55 int err; 54 const struct xattr *xattr;
56 size_t len; 55 handle_t *handle = fs_info;
57 void *value; 56 int err = 0;
58 char *name;
59 57
60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); 58 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
61 if (err) { 59 err = ext4_xattr_set_handle(handle, inode,
62 if (err == -EOPNOTSUPP) 60 EXT4_XATTR_INDEX_SECURITY,
63 return 0; 61 xattr->name, xattr->value,
64 return err; 62 xattr->value_len, 0);
63 if (err < 0)
64 break;
65 } 65 }
66 err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
67 name, value, len, 0);
68 kfree(name);
69 kfree(value);
70 return err; 66 return err;
71} 67}
72 68
69int
70ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
71 const struct qstr *qstr)
72{
73 return security_inode_init_security(inode, dir, qstr,
74 &ext4_initxattrs, handle);
75}
76
73const struct xattr_handler ext4_xattr_security_handler = { 77const struct xattr_handler ext4_xattr_security_handler = {
74 .prefix = XATTR_SECURITY_PREFIX, 78 .prefix = XATTR_SECURITY_PREFIX,
75 .list = ext4_xattr_security_list, 79 .list = ext4_xattr_security_list,
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4ad64732cbce..5efbd5d7701a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1231,7 +1231,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
1231 struct super_block *sb = dir->i_sb; 1231 struct super_block *sb = dir->i_sb;
1232 struct msdos_sb_info *sbi = MSDOS_SB(sb); 1232 struct msdos_sb_info *sbi = MSDOS_SB(sb);
1233 struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ 1233 struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
1234 struct msdos_dir_entry *de; 1234 struct msdos_dir_entry *uninitialized_var(de);
1235 int err, free_slots, i, nr_bhs; 1235 int err, free_slots, i, nr_bhs;
1236 loff_t pos, i_pos; 1236 loff_t pos, i_pos;
1237 1237
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5942fec22c65..1726d7303047 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1188,9 +1188,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
1188out: 1188out:
1189 /* UTF-8 doesn't provide FAT semantics */ 1189 /* UTF-8 doesn't provide FAT semantics */
1190 if (!strcmp(opts->iocharset, "utf8")) { 1190 if (!strcmp(opts->iocharset, "utf8")) {
1191 fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" 1191 fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
1192 " for FAT filesystems, filesystem will be " 1192 " for FAT filesystems, filesystem will be "
1193 "case sensitive!\n"); 1193 "case sensitive!");
1194 } 1194 }
1195 1195
1196 /* If user doesn't specify allow_utime, it's initialized from dmask. */ 1196 /* If user doesn't specify allow_utime, it's initialized from dmask. */
@@ -1367,6 +1367,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1367 sbi->free_clusters = -1; /* Don't know yet */ 1367 sbi->free_clusters = -1; /* Don't know yet */
1368 sbi->free_clus_valid = 0; 1368 sbi->free_clus_valid = 0;
1369 sbi->prev_free = FAT_START_ENT; 1369 sbi->prev_free = FAT_START_ENT;
1370 sb->s_maxbytes = 0xffffffff;
1370 1371
1371 if (!sbi->fat_length && b->fat32_length) { 1372 if (!sbi->fat_length && b->fat32_length) {
1372 struct fat_boot_fsinfo *fsinfo; 1373 struct fat_boot_fsinfo *fsinfo;
@@ -1377,8 +1378,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1377 sbi->fat_length = le32_to_cpu(b->fat32_length); 1378 sbi->fat_length = le32_to_cpu(b->fat32_length);
1378 sbi->root_cluster = le32_to_cpu(b->root_cluster); 1379 sbi->root_cluster = le32_to_cpu(b->root_cluster);
1379 1380
1380 sb->s_maxbytes = 0xffffffff;
1381
1382 /* MC - if info_sector is 0, don't multiply by 0 */ 1381 /* MC - if info_sector is 0, don't multiply by 0 */
1383 sbi->fsinfo_sector = le16_to_cpu(b->info_sector); 1382 sbi->fsinfo_sector = le16_to_cpu(b->info_sector);
1384 if (sbi->fsinfo_sector == 0) 1383 if (sbi->fsinfo_sector == 0)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 640fc229df10..5cb8614508c3 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -258,10 +258,14 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
258 forget->forget_one.nlookup = nlookup; 258 forget->forget_one.nlookup = nlookup;
259 259
260 spin_lock(&fc->lock); 260 spin_lock(&fc->lock);
261 fc->forget_list_tail->next = forget; 261 if (fc->connected) {
262 fc->forget_list_tail = forget; 262 fc->forget_list_tail->next = forget;
263 wake_up(&fc->waitq); 263 fc->forget_list_tail = forget;
264 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 264 wake_up(&fc->waitq);
265 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
266 } else {
267 kfree(forget);
268 }
265 spin_unlock(&fc->lock); 269 spin_unlock(&fc->lock);
266} 270}
267 271
@@ -1358,6 +1362,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
1358 if (outarg.namelen > FUSE_NAME_MAX) 1362 if (outarg.namelen > FUSE_NAME_MAX)
1359 goto err; 1363 goto err;
1360 1364
1365 err = -EINVAL;
1366 if (size != sizeof(outarg) + outarg.namelen + 1)
1367 goto err;
1368
1361 name.name = buf; 1369 name.name = buf;
1362 name.len = outarg.namelen; 1370 name.len = outarg.namelen;
1363 err = fuse_copy_one(cs, buf, outarg.namelen + 1); 1371 err = fuse_copy_one(cs, buf, outarg.namelen + 1);
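The new check is a standard defense for variable-length messages: namelen comes from the untrusted userspace daemon, so the kernel now insists that the declared length exactly accounts for the rest of the payload before copying it. The same validation in standalone form (the struct layout here is illustrative, not the fuse header):

	#include <stddef.h>
	#include <stdint.h>

	struct inval_entry_out {	/* header followed by name + NUL */
		uint64_t parent;
		uint32_t namelen;
		uint32_t padding;
	};

	/* Reject any message whose total size disagrees with its own
	 * namelen: truncated and oversized payloads are both malformed. */
	static int check_inval_entry(size_t size,
				     const struct inval_entry_out *arg,
				     uint32_t name_max)
	{
		if (arg->namelen > name_max)
			return -1;
		if (size != sizeof(*arg) + arg->namelen + 1)
			return -1;
		return 0;
	}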
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d480d9af46c9..594f07a81c28 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -14,6 +14,7 @@
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/compat.h> 16#include <linux/compat.h>
17#include <linux/swap.h>
17 18
18static const struct file_operations fuse_direct_io_file_operations; 19static const struct file_operations fuse_direct_io_file_operations;
19 20
@@ -245,6 +246,12 @@ void fuse_release_common(struct file *file, int opcode)
245 req = ff->reserved_req; 246 req = ff->reserved_req;
246 fuse_prepare_release(ff, file->f_flags, opcode); 247 fuse_prepare_release(ff, file->f_flags, opcode);
247 248
249 if (ff->flock) {
250 struct fuse_release_in *inarg = &req->misc.release.in;
251 inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
252 inarg->lock_owner = fuse_lock_owner_id(ff->fc,
253 (fl_owner_t) file);
254 }
248 /* Hold vfsmount and dentry until release is finished */ 255 /* Hold vfsmount and dentry until release is finished */
249 path_get(&file->f_path); 256 path_get(&file->f_path);
250 req->misc.release.path = file->f_path; 257 req->misc.release.path = file->f_path;
@@ -755,18 +762,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
755 return req->misc.write.out.size; 762 return req->misc.write.out.size;
756} 763}
757 764
758static int fuse_write_begin(struct file *file, struct address_space *mapping,
759 loff_t pos, unsigned len, unsigned flags,
760 struct page **pagep, void **fsdata)
761{
762 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
763
764 *pagep = grab_cache_page_write_begin(mapping, index, flags);
765 if (!*pagep)
766 return -ENOMEM;
767 return 0;
768}
769
770void fuse_write_update_size(struct inode *inode, loff_t pos) 765void fuse_write_update_size(struct inode *inode, loff_t pos)
771{ 766{
772 struct fuse_conn *fc = get_fuse_conn(inode); 767 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -779,62 +774,6 @@ void fuse_write_update_size(struct inode *inode, loff_t pos)
779 spin_unlock(&fc->lock); 774 spin_unlock(&fc->lock);
780} 775}
781 776
782static int fuse_buffered_write(struct file *file, struct inode *inode,
783 loff_t pos, unsigned count, struct page *page)
784{
785 int err;
786 size_t nres;
787 struct fuse_conn *fc = get_fuse_conn(inode);
788 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
789 struct fuse_req *req;
790
791 if (is_bad_inode(inode))
792 return -EIO;
793
794 /*
795 * Make sure writepages on the same page are not mixed up with
796 * plain writes.
797 */
798 fuse_wait_on_page_writeback(inode, page->index);
799
800 req = fuse_get_req(fc);
801 if (IS_ERR(req))
802 return PTR_ERR(req);
803
804 req->in.argpages = 1;
805 req->num_pages = 1;
806 req->pages[0] = page;
807 req->page_offset = offset;
808 nres = fuse_send_write(req, file, pos, count, NULL);
809 err = req->out.h.error;
810 fuse_put_request(fc, req);
811 if (!err && !nres)
812 err = -EIO;
813 if (!err) {
814 pos += nres;
815 fuse_write_update_size(inode, pos);
816 if (count == PAGE_CACHE_SIZE)
817 SetPageUptodate(page);
818 }
819 fuse_invalidate_attr(inode);
820 return err ? err : nres;
821}
822
823static int fuse_write_end(struct file *file, struct address_space *mapping,
824 loff_t pos, unsigned len, unsigned copied,
825 struct page *page, void *fsdata)
826{
827 struct inode *inode = mapping->host;
828 int res = 0;
829
830 if (copied)
831 res = fuse_buffered_write(file, inode, pos, copied, page);
832
833 unlock_page(page);
834 page_cache_release(page);
835 return res;
836}
837
838static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, 777static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
839 struct inode *inode, loff_t pos, 778 struct inode *inode, loff_t pos,
840 size_t count) 779 size_t count)
@@ -908,6 +847,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
908 pagefault_enable(); 847 pagefault_enable();
909 flush_dcache_page(page); 848 flush_dcache_page(page);
910 849
850 mark_page_accessed(page);
851
911 if (!tmp) { 852 if (!tmp) {
912 unlock_page(page); 853 unlock_page(page);
913 page_cache_release(page); 854 page_cache_release(page);
@@ -1559,11 +1500,14 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
1559 struct fuse_conn *fc = get_fuse_conn(inode); 1500 struct fuse_conn *fc = get_fuse_conn(inode);
1560 int err; 1501 int err;
1561 1502
1562 if (fc->no_lock) { 1503 if (fc->no_flock) {
1563 err = flock_lock_file_wait(file, fl); 1504 err = flock_lock_file_wait(file, fl);
1564 } else { 1505 } else {
1506 struct fuse_file *ff = file->private_data;
1507
1565 /* emulate flock with POSIX locks */ 1508 /* emulate flock with POSIX locks */
1566 fl->fl_owner = (fl_owner_t) file; 1509 fl->fl_owner = (fl_owner_t) file;
1510 ff->flock = true;
1567 err = fuse_setlk(file, fl, 1); 1511 err = fuse_setlk(file, fl, 1);
1568 } 1512 }
1569 1513
@@ -2201,8 +2145,6 @@ static const struct address_space_operations fuse_file_aops = {
2201 .readpage = fuse_readpage, 2145 .readpage = fuse_readpage,
2202 .writepage = fuse_writepage, 2146 .writepage = fuse_writepage,
2203 .launder_page = fuse_launder_page, 2147 .launder_page = fuse_launder_page,
2204 .write_begin = fuse_write_begin,
2205 .write_end = fuse_write_end,
2206 .readpages = fuse_readpages, 2148 .readpages = fuse_readpages,
2207 .set_page_dirty = __set_page_dirty_nobuffers, 2149 .set_page_dirty = __set_page_dirty_nobuffers,
2208 .bmap = fuse_bmap, 2150 .bmap = fuse_bmap,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c6aa2d4b8517..cf6db0a93219 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -135,6 +135,9 @@ struct fuse_file {
135 135
136 /** Wait queue head for poll */ 136 /** Wait queue head for poll */
137 wait_queue_head_t poll_wait; 137 wait_queue_head_t poll_wait;
138
139 /** Has flock been performed on this file? */
140 bool flock:1;
138}; 141};
139 142
140/** One input argument of a request */ 143/** One input argument of a request */
@@ -448,7 +451,7 @@ struct fuse_conn {
448 /** Is removexattr not implemented by fs? */ 451 /** Is removexattr not implemented by fs? */
449 unsigned no_removexattr:1; 452 unsigned no_removexattr:1;
450 453
451 /** Are file locking primitives not implemented by fs? */ 454 /** Are posix file locking primitives not implemented by fs? */
452 unsigned no_lock:1; 455 unsigned no_lock:1;
453 456
454 /** Is access not implemented by fs? */ 457 /** Is access not implemented by fs? */
@@ -472,6 +475,9 @@ struct fuse_conn {
472 /** Don't apply umask to creation modes */ 475 /** Don't apply umask to creation modes */
473 unsigned dont_mask:1; 476 unsigned dont_mask:1;
474 477
478 /** Are BSD file locking primitives not implemented by fs? */
479 unsigned no_flock:1;
480
475 /** The number of requests waiting for completion */ 481 /** The number of requests waiting for completion */
476 atomic_t num_waiting; 482 atomic_t num_waiting;
477 483
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 38f84cd48b67..add96f6ffda5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,7 +71,7 @@ struct fuse_mount_data {
71 unsigned blksize; 71 unsigned blksize;
72}; 72};
73 73
74struct fuse_forget_link *fuse_alloc_forget() 74struct fuse_forget_link *fuse_alloc_forget(void)
75{ 75{
76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); 76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
77} 77}
@@ -809,6 +809,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
809 fc->async_read = 1; 809 fc->async_read = 1;
810 if (!(arg->flags & FUSE_POSIX_LOCKS)) 810 if (!(arg->flags & FUSE_POSIX_LOCKS))
811 fc->no_lock = 1; 811 fc->no_lock = 1;
812 if (arg->minor >= 17) {
813 if (!(arg->flags & FUSE_FLOCK_LOCKS))
814 fc->no_flock = 1;
815 } else {
816 if (!(arg->flags & FUSE_POSIX_LOCKS))
817 fc->no_flock = 1;
818 }
812 if (arg->flags & FUSE_ATOMIC_O_TRUNC) 819 if (arg->flags & FUSE_ATOMIC_O_TRUNC)
813 fc->atomic_o_trunc = 1; 820 fc->atomic_o_trunc = 1;
814 if (arg->minor >= 9) { 821 if (arg->minor >= 9) {
@@ -823,6 +830,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
823 } else { 830 } else {
824 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 831 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
825 fc->no_lock = 1; 832 fc->no_lock = 1;
833 fc->no_flock = 1;
826 } 834 }
827 835
828 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); 836 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
@@ -843,7 +851,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
843 arg->minor = FUSE_KERNEL_MINOR_VERSION; 851 arg->minor = FUSE_KERNEL_MINOR_VERSION;
844 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; 852 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
845 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | 853 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
846 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; 854 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
855 FUSE_FLOCK_LOCKS;
847 req->in.h.opcode = FUSE_INIT; 856 req->in.h.opcode = FUSE_INIT;
848 req->in.numargs = 1; 857 req->in.numargs = 1;
849 req->in.args[0].size = sizeof(*arg); 858 req->in.args[0].size = sizeof(*arg);
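The minor-version gate above is how FUSE keeps old daemons working when a new capability bit appears: protocol 17 introduced the dedicated flock bit, so for older daemons the kernel infers flock support from the POSIX-locks bit instead. The decision, reduced to a helper (the flag values mirror the FUSE protocol header but should be treated as illustrative):

	#include <stdbool.h>
	#include <stdint.h>

	#define XFUSE_POSIX_LOCKS (1u << 1)
	#define XFUSE_FLOCK_LOCKS (1u << 10)

	/* True when flock() cannot be forwarded to the daemon (no_flock). */
	static bool fuse_no_flock(uint32_t minor, uint32_t init_flags)
	{
		if (minor >= 17)	/* daemon knows the dedicated bit */
			return !(init_flags & XFUSE_FLOCK_LOCKS);
		/* pre-17 daemons: POSIX lock support implied flock handling */
		return !(init_flags & XFUSE_POSIX_LOCKS);
	}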
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 34501b64bc47..65978d7885c8 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -82,7 +82,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode)
82 iattr.ia_valid = ATTR_MODE; 82 iattr.ia_valid = ATTR_MODE;
83 iattr.ia_mode = mode; 83 iattr.ia_mode = mode;
84 84
85 error = gfs2_setattr_simple(GFS2_I(inode), &iattr); 85 error = gfs2_setattr_simple(inode, &iattr);
86 } 86 }
87 87
88 return error; 88 return error;
@@ -160,6 +160,7 @@ out:
160 160
161int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) 161int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
162{ 162{
163 struct inode *inode = &ip->i_inode;
163 struct posix_acl *acl; 164 struct posix_acl *acl;
164 char *data; 165 char *data;
165 unsigned int len; 166 unsigned int len;
@@ -169,7 +170,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
169 if (IS_ERR(acl)) 170 if (IS_ERR(acl))
170 return PTR_ERR(acl); 171 return PTR_ERR(acl);
171 if (!acl) 172 if (!acl)
172 return gfs2_setattr_simple(ip, attr); 173 return gfs2_setattr_simple(inode, attr);
173 174
174 error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); 175 error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode);
175 if (error) 176 if (error)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index f9fbbe96c222..4858e1fed8b1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -663,7 +663,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
663 if (&ip->i_inode == sdp->sd_rindex) 663 if (&ip->i_inode == sdp->sd_rindex)
664 rblocks += 2 * RES_STATFS; 664 rblocks += 2 * RES_STATFS;
665 if (alloc_required) 665 if (alloc_required)
666 rblocks += gfs2_rg_blocks(al); 666 rblocks += gfs2_rg_blocks(ip);
667 667
668 error = gfs2_trans_begin(sdp, rblocks, 668 error = gfs2_trans_begin(sdp, rblocks,
669 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 669 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -787,7 +787,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
787 u64 to = pos + copied; 787 u64 to = pos + copied;
788 void *kaddr; 788 void *kaddr;
789 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); 789 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
790 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
791 790
792 BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); 791 BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
793 kaddr = kmap_atomic(page, KM_USER0); 792 kaddr = kmap_atomic(page, KM_USER0);
@@ -804,7 +803,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
804 if (copied) { 803 if (copied) {
805 if (inode->i_size < to) 804 if (inode->i_size < to)
806 i_size_write(inode, to); 805 i_size_write(inode, to);
807 gfs2_dinode_out(ip, di);
808 mark_inode_dirty(inode); 806 mark_inode_dirty(inode);
809 } 807 }
810 808
@@ -873,10 +871,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
873 gfs2_page_add_databufs(ip, page, from, to); 871 gfs2_page_add_databufs(ip, page, from, to);
874 872
875 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 873 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
876 if (ret > 0) {
877 gfs2_dinode_out(ip, dibh->b_data);
878 mark_inode_dirty(inode);
879 }
880 874
881 if (inode == sdp->sd_rindex) { 875 if (inode == sdp->sd_rindex) {
882 adjust_fs_space(inode); 876 adjust_fs_space(inode);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 7878c473ae62..41d494d79709 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -10,6 +10,7 @@
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/completion.h> 11#include <linux/completion.h>
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/blkdev.h>
13#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
14#include <linux/crc32.h> 15#include <linux/crc32.h>
15 16
@@ -36,11 +37,6 @@ struct metapath {
36 __u16 mp_list[GFS2_MAX_META_HEIGHT]; 37 __u16 mp_list[GFS2_MAX_META_HEIGHT];
37}; 38};
38 39
39typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
40 struct buffer_head *bh, __be64 *top,
41 __be64 *bottom, unsigned int height,
42 void *data);
43
44struct strip_mine { 40struct strip_mine {
45 int sm_first; 41 int sm_first;
46 unsigned int sm_height; 42 unsigned int sm_height;
@@ -273,6 +269,30 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp
273 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; 269 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
274} 270}
275 271
272static void gfs2_metapath_ra(struct gfs2_glock *gl,
273 const struct buffer_head *bh, const __be64 *pos)
274{
275 struct buffer_head *rabh;
276 const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
277 const __be64 *t;
278
279 for (t = pos; t < endp; t++) {
280 if (!*t)
281 continue;
282
283 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
284 if (trylock_buffer(rabh)) {
285 if (!buffer_uptodate(rabh)) {
286 rabh->b_end_io = end_buffer_read_sync;
287 submit_bh(READA | REQ_META, rabh);
288 continue;
289 }
290 unlock_buffer(rabh);
291 }
292 brelse(rabh);
293 }
294}
295
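Two details make gfs2_metapath_ra() safe to call from the deallocation walk: trylock_buffer() skips any buffer another context already holds, and READA submissions may be dropped by the block layer, so the readahead can only ever be a hint. The closest userspace analogue of that best-effort contract (an analogy, not the kernel API):

	#include <fcntl.h>

	/* Hint that a byte range will be wanted soon; ignore failure,
	 * because readahead must never turn into a blocking read or an
	 * error path. */
	static void ra_hint(int fd, off_t offset, off_t len)
	{
		(void)posix_fadvise(fd, offset, len, POSIX_FADV_WILLNEED);
	}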
276/** 296/**
277 * lookup_metapath - Walk the metadata tree to a specific point 297 * lookup_metapath - Walk the metadata tree to a specific point
278 * @ip: The inode 298 * @ip: The inode
@@ -432,12 +452,14 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct super_block *sb = sdp->sd_vfs;
 	struct buffer_head *dibh = mp->mp_bh[0];
 	u64 bn, dblock = 0;
 	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
 	unsigned dblks = 0;
 	unsigned ptrs_per_blk;
 	const unsigned end_of_metadata = height - 1;
+	int ret;
 	int eob = 0;
 	enum alloc_state state;
 	__be64 *ptr;
@@ -540,6 +562,15 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 			dblock = bn;
 			while (n-- > 0)
 				*ptr++ = cpu_to_be64(bn++);
+			if (buffer_zeronew(bh_map)) {
+				ret = sb_issue_zeroout(sb, dblock, dblks,
+						       GFP_NOFS);
+				if (ret) {
+					fs_err(sdp,
+					       "Failed to zero data buffers\n");
+					clear_buffer_zeronew(bh_map);
+				}
+			}
 			break;
 		}
 	} while ((state != ALLOC_DATA) || !dblock);
@@ -668,76 +699,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 }
 
 /**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @bc: the call to make for each piece of metadata
- * @data: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
-			  struct metapath *mp, unsigned int height,
-			  u64 block, int first, block_call_t bc,
-			  void *data)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head *bh = NULL;
-	__be64 *top, *bottom;
-	u64 bn;
-	int error;
-	int mh_size = sizeof(struct gfs2_meta_header);
-
-	if (!height) {
-		error = gfs2_meta_inode_buffer(ip, &bh);
-		if (error)
-			return error;
-		dibh = bh;
-
-		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
-		bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
-	} else {
-		error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
-		if (error)
-			return error;
-
-		top = (__be64 *)(bh->b_data + mh_size) +
-				  (first ? mp->mp_list[height] : 0);
-
-		bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
-	}
-
-	error = bc(ip, dibh, bh, top, bottom, height, data);
-	if (error)
-		goto out;
-
-	if (height < ip->i_height - 1)
-		for (; top < bottom; top++, first = 0) {
-			if (!*top)
-				continue;
-
-			bn = be64_to_cpu(*top);
-
-			error = recursive_scan(ip, dibh, mp, height + 1, bn,
-					       first, bc, data);
-			if (error)
-				break;
-		}
-
-out:
-	brelse(bh);
-	return error;
-}
-
-/**
  * do_strip - Look for a layer a particular layer of the file and strip it off
  * @ip: the inode
  * @dibh: the dinode buffer
@@ -752,9 +713,8 @@ out:
 
 static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 		    struct buffer_head *bh, __be64 *top, __be64 *bottom,
-		    unsigned int height, void *data)
+		    unsigned int height, struct strip_mine *sm)
 {
-	struct strip_mine *sm = data;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrp_list rlist;
 	u64 bn, bstart;
@@ -783,11 +743,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	else if (ip->i_depth)
 		revokes = sdp->sd_inptrs;
 
-	if (ip != GFS2_I(sdp->sd_rindex))
-		error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
-	else if (!sdp->sd_rgrps)
-		error = gfs2_ri_update(ip);
-
 	if (error)
 		return error;
 
@@ -805,7 +760,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 				blen++;
 			else {
 				if (bstart)
-					gfs2_rlist_add(sdp, &rlist, bstart);
+					gfs2_rlist_add(ip, &rlist, bstart);
 
 				bstart = bn;
 				blen = 1;
@@ -813,7 +768,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	}
 
 	if (bstart)
-		gfs2_rlist_add(sdp, &rlist, bstart);
+		gfs2_rlist_add(ip, &rlist, bstart);
 	else
 		goto out; /* Nothing to do */
 
@@ -887,12 +842,82 @@ out_rg_gunlock:
 out_rlist:
 	gfs2_rlist_free(&rlist);
 out:
-	if (ip != GFS2_I(sdp->sd_rindex))
-		gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
 	return error;
 }
 
 /**
+ * recursive_scan - recursively scan through the end of a file
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @mp: the path through the metadata to the point to start
+ * @height: the height the recursion is at
+ * @block: the indirect block to look at
+ * @first: 1 if this is the first block
+ * @sm: data opaque to this function to pass to @bc
+ *
+ * When this is first called @height and @block should be zero and
+ * @first should be 1.
+ *
+ * Returns: errno
+ */
+
+static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
+			  struct metapath *mp, unsigned int height,
+			  u64 block, int first, struct strip_mine *sm)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *bh = NULL;
+	__be64 *top, *bottom;
+	u64 bn;
+	int error;
+	int mh_size = sizeof(struct gfs2_meta_header);
+
+	if (!height) {
+		error = gfs2_meta_inode_buffer(ip, &bh);
+		if (error)
+			return error;
+		dibh = bh;
+
+		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
+		bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
+	} else {
+		error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
+		if (error)
+			return error;
+
+		top = (__be64 *)(bh->b_data + mh_size) +
+				  (first ? mp->mp_list[height] : 0);
+
+		bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
+	}
+
+	error = do_strip(ip, dibh, bh, top, bottom, height, sm);
+	if (error)
+		goto out;
+
+	if (height < ip->i_height - 1) {
+
+		gfs2_metapath_ra(ip->i_gl, bh, top);
+
+		for (; top < bottom; top++, first = 0) {
+			if (!*top)
+				continue;
+
+			bn = be64_to_cpu(*top);
+
+			error = recursive_scan(ip, dibh, mp, height + 1, bn,
+					       first, sm);
+			if (error)
+				break;
+		}
+	}
+out:
+	brelse(bh);
+	return error;
+}
+
+
+/**
  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
  *
  * This is partly borrowed from ext3.
@@ -1031,7 +1056,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
 		sm.sm_first = !!size;
 		sm.sm_height = height;
 
-		error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
+		error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
 		if (error)
 			break;
 	}
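[Note: the gfs2_metapath_ra() helper added above batches readahead: it queues an asynchronous read for every non-zero block pointer past the current position before the recursion touches any of them, skipping buffers it cannot trylock. Below is a minimal userspace sketch of the same submit-all-then-read pattern, using posix_fadvise() rather than the kernel buffer API; the function name, offsets, and block size are illustrative, not part of GFS2.]

#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

/* Hint the kernel to fetch every block we are about to walk, then read. */
static void readahead_then_read(int fd, const off_t *offs, size_t n, size_t blksz)
{
	char *buf = malloc(blksz);

	if (!buf)
		return;

	/* Pass 1: queue asynchronous readahead for each block. */
	for (size_t i = 0; i < n; i++)
		posix_fadvise(fd, offs[i], blksz, POSIX_FADV_WILLNEED);

	/* Pass 2: the actual reads now mostly hit the page cache. */
	for (size_t i = 0; i < n; i++)
		pread(fd, buf, blksz, offs[i]);

	free(buf);
}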
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1cc2f8ec52a2..8ccad2467cb6 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -240,16 +240,15 @@ fail:
 	return error;
 }
 
-static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
-				 u64 offset, unsigned int size)
+static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf,
+				 unsigned int size)
 {
 	struct buffer_head *dibh;
 	int error;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
-		offset += sizeof(struct gfs2_dinode);
-		memcpy(buf, dibh->b_data + offset, size);
+		memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
 		brelse(dibh);
 	}
 
@@ -261,13 +260,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
  * gfs2_dir_read_data - Read a data from a directory inode
  * @ip: The GFS2 Inode
  * @buf: The buffer to place result into
- * @offset: File offset to begin jdata_readng from
  * @size: Amount of data to transfer
  *
  * Returns: The amount of data actually copied or the error
 */
-static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
-			      unsigned int size, unsigned ra)
+static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
+			      unsigned int size)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	u64 lblock, dblock;
@@ -275,24 +273,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
 	unsigned int o;
 	int copied = 0;
 	int error = 0;
-	u64 disksize = i_size_read(&ip->i_inode);
-
-	if (offset >= disksize)
-		return 0;
-
-	if (offset + size > disksize)
-		size = disksize - offset;
-
-	if (!size)
-		return 0;
 
 	if (gfs2_is_stuffed(ip))
-		return gfs2_dir_read_stuffed(ip, buf, offset, size);
+		return gfs2_dir_read_stuffed(ip, buf, size);
 
 	if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
 		return -EINVAL;
 
-	lblock = offset;
+	lblock = 0;
 	o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
 
 	while (copied < size) {
@@ -311,8 +299,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
 			if (error || !dblock)
 				goto fail;
 			BUG_ON(extlen < 1);
-			if (!ra)
-				extlen = 1;
 			bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
 		} else {
 			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
@@ -328,7 +314,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
 		extlen--;
 		memcpy(buf, bh->b_data + o, amount);
 		brelse(bh);
-		buf += amount;
+		buf += (amount/sizeof(__be64));
 		copied += amount;
 		lblock++;
 		o = sizeof(struct gfs2_meta_header);
@@ -371,7 +357,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
 	if (hc == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1);
+	ret = gfs2_dir_read_data(ip, hc, hsize);
 	if (ret < 0) {
 		kfree(hc);
 		return ERR_PTR(ret);
@@ -1695,7 +1681,6 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
 	const struct qstr *name = &dentry->d_name;
 	struct gfs2_dirent *dent, *prev = NULL;
 	struct buffer_head *bh;
-	int error;
 
 	/* Returns _either_ the entry (if its first in block) or the
 	   previous entry otherwise */
@@ -1724,22 +1709,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
 	}
 	brelse(bh);
 
-	error = gfs2_meta_inode_buffer(dip, &bh);
-	if (error)
-		return error;
-
 	if (!dip->i_entries)
 		gfs2_consist_inode(dip);
-	gfs2_trans_add_bh(dip->i_gl, bh, 1);
 	dip->i_entries--;
 	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
 	if (S_ISDIR(dentry->d_inode->i_mode))
 		drop_nlink(&dip->i_inode);
-	gfs2_dinode_out(dip, bh->b_data);
-	brelse(bh);
 	mark_inode_dirty(&dip->i_inode);
 
-	return error;
+	return 0;
 }
 
 /**
@@ -1829,10 +1807,6 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 	if (error)
 		goto out_put;
 
-	error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
-	if (error)
-		goto out_qs;
-
 	/* Count the number of leaves */
 	bh = leaf_bh;
 
@@ -1847,7 +1821,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 		if (blk != leaf_no)
 			brelse(bh);
 
-		gfs2_rlist_add(sdp, &rlist, blk);
+		gfs2_rlist_add(dip, &rlist, blk);
 		l_blocks++;
 	}
 
@@ -1911,8 +1885,6 @@ out_rg_gunlock:
 	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
 out_rlist:
 	gfs2_rlist_free(&rlist);
-	gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
-out_qs:
 	gfs2_quota_unhold(dip);
 out_put:
 	gfs2_alloc_put(dip);
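[Note: the dir.c changes above switch the read helpers to __be64 * so the directory hash table stays in on-disk (big-endian) byte order in memory, with pointer arithmetic done in units of sizeof(__be64). A hedged userspace sketch of walking such a table follows; be64toh() is the glibc counterpart of the kernel's be64_to_cpu(), and the function name here is illustrative.]

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

/* Print a table of on-disk big-endian 64-bit block numbers. */
static void dump_be64_table(const uint64_t *be_tbl, size_t nent)
{
	for (size_t i = 0; i < nent; i++)
		printf("%zu: %llu\n", i,
		       (unsigned long long)be64toh(be_tbl[i]));
}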
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index edeb9e802903..ce36a56dfeac 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -59,15 +59,24 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
 	struct gfs2_holder i_gh;
 	loff_t error;
 
-	if (origin == 2) {
+	switch (origin) {
+	case SEEK_END: /* These reference inode->i_size */
+	case SEEK_DATA:
+	case SEEK_HOLE:
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
 					   &i_gh);
 		if (!error) {
-			error = generic_file_llseek_unlocked(file, offset, origin);
+			error = generic_file_llseek(file, offset, origin);
 			gfs2_glock_dq_uninit(&i_gh);
 		}
-	} else
-		error = generic_file_llseek_unlocked(file, offset, origin);
+		break;
+	case SEEK_CUR:
+	case SEEK_SET:
+		error = generic_file_llseek(file, offset, origin);
+		break;
+	default:
+		error = -EINVAL;
+	}
 
 	return error;
 }
@@ -357,8 +366,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	unsigned int data_blocks, ind_blocks, rblocks;
 	struct gfs2_holder gh;
 	struct gfs2_alloc *al;
+	loff_t size;
 	int ret;
 
+	/* Wait if fs is frozen. This is racy so we check again later on
+	 * and retry if the fs has been frozen after the page lock has
+	 * been acquired
+	 */
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
@@ -367,8 +383,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
 
-	if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE))
+	if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
+		lock_page(page);
+		if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
+			ret = -EAGAIN;
+			unlock_page(page);
+		}
 		goto out_unlock;
+	}
+
 	ret = -ENOMEM;
 	al = gfs2_alloc_get(ip);
 	if (al == NULL)
@@ -388,7 +411,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	rblocks += data_blocks ? data_blocks : 1;
 	if (ind_blocks || data_blocks) {
 		rblocks += RES_STATFS + RES_QUOTA;
-		rblocks += gfs2_rg_blocks(al);
+		rblocks += gfs2_rg_blocks(ip);
 	}
 	ret = gfs2_trans_begin(sdp, rblocks, 0);
 	if (ret)
@@ -396,21 +419,29 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	lock_page(page);
 	ret = -EINVAL;
-	last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
-	if (page->index > last_index)
-		goto out_unlock_page;
+	size = i_size_read(inode);
+	last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+	/* Check page index against inode size */
+	if (size == 0 || (page->index > last_index))
+		goto out_trans_end;
+
+	ret = -EAGAIN;
+	/* If truncated, we must retry the operation, we may have raced
+	 * with the glock demotion code.
+	 */
+	if (!PageUptodate(page) || page->mapping != inode->i_mapping)
+		goto out_trans_end;
+
+	/* Unstuff, if required, and allocate backing blocks for page */
 	ret = 0;
-	if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
-		goto out_unlock_page;
-	if (gfs2_is_stuffed(ip)) {
+	if (gfs2_is_stuffed(ip))
 		ret = gfs2_unstuff_dinode(ip, page);
-		if (ret)
-			goto out_unlock_page;
-	}
-	ret = gfs2_allocate_page_backing(page);
+	if (ret == 0)
+		ret = gfs2_allocate_page_backing(page);
 
-out_unlock_page:
-	unlock_page(page);
+out_trans_end:
+	if (ret)
+		unlock_page(page);
 	gfs2_trans_end(sdp);
 out_trans_fail:
 	gfs2_inplace_release(ip);
@@ -422,11 +453,17 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
-	if (ret == -ENOMEM)
-		ret = VM_FAULT_OOM;
-	else if (ret)
-		ret = VM_FAULT_SIGBUS;
-	return ret;
+	if (ret == 0) {
+		set_page_dirty(page);
+		/* This check must be post dropping of transaction lock */
+		if (inode->i_sb->s_frozen == SB_UNFROZEN) {
+			wait_on_page_writeback(page);
+		} else {
+			ret = -EAGAIN;
+			unlock_page(page);
+		}
+	}
+	return block_page_mkwrite_return(ret);
 }
 
 static const struct vm_operations_struct gfs2_vm_ops = {
@@ -551,8 +588,16 @@ static int gfs2_close(struct inode *inode, struct file *file)
  * @end: the end position in the file to sync
  * @datasync: set if we can ignore timestamp changes
  *
- * The VFS will flush data for us. We only need to worry
- * about metadata here.
+ * We split the data flushing here so that we don't wait for the data
+ * until after we've also sent the metadata to disk. Note that for
+ * data=ordered, we will write & wait for the data at the log flush
+ * stage anyway, so this is unlikely to make much of a difference
+ * except in the data=writeback case.
+ *
+ * If the fdatawrite fails due to any reason except -EIO, we will
+ * continue the remainder of the fsync, although we'll still report
+ * the error at the end. This is to match filemap_write_and_wait_range()
+ * behaviour.
 *
 * Returns: errno
 */
@@ -560,30 +605,34 @@ static int gfs2_close(struct inode *inode, struct file *file)
 static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 		      int datasync)
 {
-	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
 	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
 	struct gfs2_inode *ip = GFS2_I(inode);
-	int ret;
+	int ret, ret1 = 0;
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
+	if (mapping->nrpages) {
+		ret1 = filemap_fdatawrite_range(mapping, start, end);
+		if (ret1 == -EIO)
+			return ret1;
+	}
 
 	if (datasync)
 		sync_state &= ~I_DIRTY_SYNC;
 
 	if (sync_state) {
 		ret = sync_inode_metadata(inode, 1);
-		if (ret) {
-			mutex_unlock(&inode->i_mutex);
+		if (ret)
 			return ret;
-		}
-		gfs2_ail_flush(ip->i_gl);
+		if (gfs2_is_jdata(ip))
+			filemap_write_and_wait(mapping);
+		gfs2_ail_flush(ip->i_gl, 1);
 	}
 
-	mutex_unlock(&inode->i_mutex);
-	return 0;
+	if (mapping->nrpages)
+		ret = filemap_fdatawait_range(mapping, start, end);
+
+	return ret ? ret : ret1;
 }
 
 /**
@@ -620,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
 
-static int empty_write_end(struct page *page, unsigned from,
-			   unsigned to, int mode)
-{
-	struct inode *inode = page->mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct buffer_head *bh;
-	unsigned offset, blksize = 1 << inode->i_blkbits;
-	pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-
-	zero_user(page, from, to-from);
-	mark_page_accessed(page);
-
-	if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
-		if (!gfs2_is_writeback(ip))
-			gfs2_page_add_databufs(ip, page, from, to);
-
-		block_commit_write(page, from, to);
-		return 0;
-	}
-
-	offset = 0;
-	bh = page_buffers(page);
-	while (offset < to) {
-		if (offset >= from) {
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-			clear_buffer_new(bh);
-			write_dirty_buffer(bh, WRITE);
-		}
-		offset += blksize;
-		bh = bh->b_this_page;
-	}
-
-	offset = 0;
-	bh = page_buffers(page);
-	while (offset < to) {
-		if (offset >= from) {
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				return -EIO;
-		}
-		offset += blksize;
-		bh = bh->b_this_page;
-	}
-	return 0;
-}
-
-static int needs_empty_write(sector_t block, struct inode *inode)
-{
-	int error;
-	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
-
-	bh_map.b_size = 1 << inode->i_blkbits;
-	error = gfs2_block_map(inode, block, &bh_map, 0);
-	if (unlikely(error))
-		return error;
-	return !buffer_mapped(&bh_map);
-}
-
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
-			      int mode)
-{
-	struct inode *inode = page->mapping->host;
-	unsigned start, end, next, blksize;
-	sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	int ret;
-
-	blksize = 1 << inode->i_blkbits;
-	next = end = 0;
-	while (next < from) {
-		next += blksize;
-		block++;
-	}
-	start = next;
-	do {
-		next += blksize;
-		ret = needs_empty_write(block, inode);
-		if (unlikely(ret < 0))
-			return ret;
-		if (ret == 0) {
-			if (end) {
-				ret = __block_write_begin(page, start, end - start,
-							  gfs2_block_map);
-				if (unlikely(ret))
-					return ret;
-				ret = empty_write_end(page, start, end, mode);
-				if (unlikely(ret))
-					return ret;
-				end = 0;
-			}
-			start = next;
-		}
-		else
-			end = next;
-		block++;
-	} while (next < to);
-
-	if (end) {
-		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
-		if (unlikely(ret))
-			return ret;
-		ret = empty_write_end(page, start, end, mode);
-		if (unlikely(ret))
-			return ret;
-	}
-
-	return 0;
-}
-
 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
 			   int mode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct buffer_head *dibh;
 	int error;
-	u64 start = offset >> PAGE_CACHE_SHIFT;
-	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
-	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
-	pgoff_t curr;
-	struct page *page;
-	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
-	unsigned int from, to;
-
-	if (!end_offset)
-		end_offset = PAGE_CACHE_SIZE;
+	unsigned int nr_blks;
+	sector_t lblock = offset >> inode->i_blkbits;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (unlikely(error))
-		goto out;
+		return error;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 
@@ -758,40 +690,31 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
 		goto out;
 	}
 
-	curr = start;
-	offset = start << PAGE_CACHE_SHIFT;
-	from = start_offset;
-	to = PAGE_CACHE_SIZE;
-	while (curr <= end) {
-		page = grab_cache_page_write_begin(inode->i_mapping, curr,
-						   AOP_FLAG_NOFS);
-		if (unlikely(!page)) {
-			error = -ENOMEM;
-			goto out;
-		}
+	while (len) {
+		struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+		bh_map.b_size = len;
+		set_buffer_zeronew(&bh_map);
 
-		if (curr == end)
-			to = end_offset;
-		error = write_empty_blocks(page, from, to, mode);
-		if (!error && offset + to > inode->i_size &&
-		    !(mode & FALLOC_FL_KEEP_SIZE)) {
-			i_size_write(inode, offset + to);
-		}
-		unlock_page(page);
-		page_cache_release(page);
-		if (error)
+		error = gfs2_block_map(inode, lblock, &bh_map, 1);
+		if (unlikely(error))
 			goto out;
-		curr++;
-		offset += PAGE_CACHE_SIZE;
-		from = 0;
+		len -= bh_map.b_size;
+		nr_blks = bh_map.b_size >> inode->i_blkbits;
+		lblock += nr_blks;
+		if (!buffer_new(&bh_map))
+			continue;
+		if (unlikely(!buffer_zeronew(&bh_map))) {
+			error = -EIO;
+			goto out;
+		}
 	}
+	if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
+		i_size_write(inode, offset + len);
 
-	gfs2_dinode_out(ip, dibh->b_data);
 	mark_inode_dirty(inode);
 
-	brelse(dibh);
-
 out:
+	brelse(dibh);
 	return error;
 }
 
@@ -799,7 +722,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
 			    unsigned int *data_blocks, unsigned int *ind_blocks)
 {
 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+	unsigned int max_blocks = ip->i_rgd->rd_free_clone;
 	unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
 
 	for (tmp = max_data; tmp > sdp->sd_diptrs;) {
@@ -831,6 +754,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	int error;
 	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
 	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+	loff_t max_chunk_size = UINT_MAX & bsize_mask;
 	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
 
 	/* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -884,11 +808,12 @@ retry:
 			goto out_qunlock;
 		}
 		max_bytes = bytes;
-		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+		calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
+				&max_bytes, &data_blocks, &ind_blocks);
 		al->al_requested = data_blocks + ind_blocks;
 
 		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
-			  RES_RG_HDR + gfs2_rg_blocks(al);
+			  RES_RG_HDR + gfs2_rg_blocks(ip);
 		if (gfs2_is_jdata(ip))
 			rblocks += data_blocks ? data_blocks : 1;
 
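[Note: the gfs2_fsync() rework above splits filemap_write_and_wait_range() into an fdatawrite before the metadata sync and an fdatawait after it, so data writeback overlaps the metadata flush instead of serializing ahead of it. A rough userspace analogue of the same start-early/wait-late idea using sync_file_range() follows; the function name is illustrative and error handling is simplified.]

#define _GNU_SOURCE
#include <fcntl.h>

/* Start writeback, do other work, then wait for the writeback to finish. */
static int split_range_sync(int fd, off_t off, off_t len)
{
	/* Kick off writeback of the dirty range without waiting. */
	if (sync_file_range(fd, off, len, SYNC_FILE_RANGE_WRITE))
		return -1;

	/* ... metadata or other work can proceed here ... */

	/* Now wait for the writeback started above to complete. */
	return sync_file_range(fd, off, len,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}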
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index da21ecaafcc2..78418b4fa857 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,40 +28,55 @@
 #include "trans.h"
 #include "dir.h"
 
+static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
+{
+	fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
+	       bh, (unsigned long long)bh->b_blocknr, bh->b_state,
+	       bh->b_page->mapping, bh->b_page->flags);
+	fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
+	       gl->gl_name.ln_type, gl->gl_name.ln_number,
+	       gfs2_glock2aspace(gl));
+	gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
+}
+
 /**
  * __gfs2_ail_flush - remove all buffers for a given lock from the AIL
  * @gl: the glock
+ * @fsync: set when called from fsync (not all buffers will be clean)
 *
 * None of the buffers should be dirty, locked, or pinned.
 */
 
-static void __gfs2_ail_flush(struct gfs2_glock *gl)
+static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct list_head *head = &gl->gl_ail_list;
-	struct gfs2_bufdata *bd;
+	struct gfs2_bufdata *bd, *tmp;
 	struct buffer_head *bh;
+	const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
+	sector_t blocknr;
 
+	gfs2_log_lock(sdp);
 	spin_lock(&sdp->sd_ail_lock);
-	while (!list_empty(head)) {
-		bd = list_entry(head->next, struct gfs2_bufdata,
-				bd_ail_gl_list);
+	list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) {
 		bh = bd->bd_bh;
-		gfs2_remove_from_ail(bd);
-		bd->bd_bh = NULL;
+		if (bh->b_state & b_state) {
+			if (fsync)
+				continue;
+			gfs2_ail_error(gl, bh);
+		}
+		blocknr = bh->b_blocknr;
 		bh->b_private = NULL;
-		spin_unlock(&sdp->sd_ail_lock);
+		gfs2_remove_from_ail(bd); /* drops ref on bh */
 
-		bd->bd_blkno = bh->b_blocknr;
-		gfs2_log_lock(sdp);
-		gfs2_assert_withdraw(sdp, !buffer_busy(bh));
-		gfs2_trans_add_revoke(sdp, bd);
-		gfs2_log_unlock(sdp);
+		bd->bd_bh = NULL;
+		bd->bd_blkno = blocknr;
 
-		spin_lock(&sdp->sd_ail_lock);
+		gfs2_trans_add_revoke(sdp, bd);
 	}
-	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+	BUG_ON(!fsync && atomic_read(&gl->gl_ail_count));
 	spin_unlock(&sdp->sd_ail_lock);
+	gfs2_log_unlock(sdp);
 }
 
 
@@ -84,13 +99,13 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	BUG_ON(current->journal_info);
 	current->journal_info = &tr;
 
-	__gfs2_ail_flush(gl);
+	__gfs2_ail_flush(gl, 0);
 
 	gfs2_trans_end(sdp);
 	gfs2_log_flush(sdp, NULL);
 }
 
-void gfs2_ail_flush(struct gfs2_glock *gl)
+void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	unsigned int revokes = atomic_read(&gl->gl_ail_count);
@@ -102,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl)
 	ret = gfs2_trans_begin(sdp, 0, revokes);
 	if (ret)
 		return;
-	__gfs2_ail_flush(gl);
+	__gfs2_ail_flush(gl, fsync);
 	gfs2_trans_end(sdp);
 	gfs2_log_flush(sdp, NULL);
 }
@@ -119,6 +134,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl)
 static void rgrp_go_sync(struct gfs2_glock *gl)
 {
 	struct address_space *metamapping = gfs2_glock2aspace(gl);
+	struct gfs2_rgrpd *rgd;
 	int error;
 
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -130,6 +146,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 	error = filemap_fdatawait(metamapping);
 	mapping_set_error(metamapping, error);
 	gfs2_ail_empty_gl(gl);
+
+	spin_lock(&gl->gl_spin);
+	rgd = gl->gl_object;
+	if (rgd)
+		gfs2_free_clones(rgd);
+	spin_unlock(&gl->gl_spin);
 }
 
 /**
@@ -430,33 +452,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 }
 
 /**
- * rgrp_go_lock - operation done after an rgrp lock is locked by
- *    a first holder on this node.
- * @gl: the glock
- * @flags:
- *
- * Returns: errno
- */
-
-static int rgrp_go_lock(struct gfs2_holder *gh)
-{
-	return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
-}
-
-/**
- * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
- *    a last holder on this node.
- * @gl: the glock
- * @flags:
- *
- */
-
-static void rgrp_go_unlock(struct gfs2_holder *gh)
-{
-	gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
-}
-
-/**
  * trans_go_sync - promote/demote the transaction glock
 * @gl: the glock
 * @state: the requested state
@@ -558,8 +553,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_xmote_th = rgrp_go_sync,
 	.go_inval = rgrp_go_inval,
-	.go_lock = rgrp_go_lock,
-	.go_unlock = rgrp_go_unlock,
+	.go_lock = gfs2_rgrp_go_lock,
+	.go_unlock = gfs2_rgrp_go_unlock,
 	.go_dump = gfs2_rgrp_dump,
 	.go_type = LM_TYPE_RGRP,
 	.go_flags = GLOF_ASPACE,
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 6fce409b5a50..bf95a2dc1662 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -23,6 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
 extern const struct gfs2_glock_operations gfs2_journal_glops;
 extern const struct gfs2_glock_operations *gfs2_glops_list[];
 
-extern void gfs2_ail_flush(struct gfs2_glock *gl);
+extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
 
 #endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37de8ae..7389dfdcc9ef 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -18,6 +18,7 @@
 #include <linux/rcupdate.h>
 #include <linux/rculist_bl.h>
 #include <linux/completion.h>
+#include <linux/rbtree.h>
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
@@ -78,8 +79,7 @@ struct gfs2_bitmap {
 };
 
 struct gfs2_rgrpd {
-	struct list_head rd_list;	/* Link with superblock */
-	struct list_head rd_list_mru;
+	struct rb_node rd_node;		/* Link with superblock */
 	struct gfs2_glock *rd_gl;	/* Glock for this rgrp */
 	u64 rd_addr;			/* grp block disk address */
 	u64 rd_data0;			/* first data location */
@@ -91,10 +91,7 @@ struct gfs2_rgrpd {
 	u32 rd_dinodes;
 	u64 rd_igeneration;
 	struct gfs2_bitmap *rd_bits;
-	struct mutex rd_mutex;
-	struct gfs2_log_element rd_le;
 	struct gfs2_sbd *rd_sbd;
-	unsigned int rd_bh_count;
 	u32 rd_last_alloc;
 	u32 rd_flags;
 #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
@@ -106,12 +103,15 @@ struct gfs2_rgrpd {
 enum gfs2_state_bits {
 	BH_Pinned = BH_PrivateStart,
 	BH_Escaped = BH_PrivateStart + 1,
+	BH_Zeronew = BH_PrivateStart + 2,
 };
 
 BUFFER_FNS(Pinned, pinned)
 TAS_BUFFER_FNS(Pinned, pinned)
 BUFFER_FNS(Escaped, escaped)
 TAS_BUFFER_FNS(Escaped, escaped)
+BUFFER_FNS(Zeronew, zeronew)
+TAS_BUFFER_FNS(Zeronew, zeronew)
 
 struct gfs2_bufdata {
 	struct buffer_head *bd_bh;
@@ -246,7 +246,6 @@ struct gfs2_glock {
 
 struct gfs2_alloc {
 	/* Quota stuff */
-
 	struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
 	struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
 	unsigned int al_qd_num;
@@ -255,18 +254,13 @@ struct gfs2_alloc {
 	u32 al_alloced;		/* Filled in by gfs2_alloc_*() */
 
 	/* Filled in by gfs2_inplace_reserve() */
-
-	unsigned int al_line;
-	char *al_file;
-	struct gfs2_holder al_ri_gh;
 	struct gfs2_holder al_rgd_gh;
-	struct gfs2_rgrpd *al_rgd;
-
 };
 
 enum {
 	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
+	GIF_ALLOC_FAILED	= 2,
 	GIF_SW_PAGED		= 3,
 };
 
@@ -282,6 +276,7 @@ struct gfs2_inode {
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
 	struct gfs2_alloc *i_alloc;
+	struct gfs2_rgrpd *i_rgd;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
 	struct list_head i_trunc_list;
@@ -574,9 +569,7 @@ struct gfs2_sbd {
 	int sd_rindex_uptodate;
 	spinlock_t sd_rindex_spin;
 	struct mutex sd_rindex_mutex;
-	struct list_head sd_rindex_list;
-	struct list_head sd_rindex_mru_list;
-	struct gfs2_rgrpd *sd_rindex_forward;
+	struct rb_root sd_rindex_tree;
 	unsigned int sd_rgrps;
 	unsigned int sd_max_rg_data;
 
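[Note: the incore.h changes above replace the rgrp list/MRU pair with an rb_node/rb_root keyed by block address, so finding the resource group that holds a given block becomes an O(log n) ordered lookup instead of a list scan. A small userspace sketch of the same ordered lookup over a sorted array follows; the struct and names are illustrative, not the kernel's.]

#include <stddef.h>
#include <stdint.h>

struct rgrp {
	uint64_t data0;	/* first data block */
	uint64_t data;	/* number of data blocks */
};

/* Binary search for the rgrp containing blk; array sorted by data0. */
static const struct rgrp *rgrp_find(const struct rgrp *r, size_t n, uint64_t blk)
{
	size_t lo = 0, hi = n;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (blk < r[mid].data0)
			hi = mid;
		else if (blk >= r[mid].data0 + r[mid].data)
			lo = mid + 1;
		else
			return &r[mid];	/* blk falls inside this rgrp */
	}
	return NULL;			/* not covered by any rgrp */
}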
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 900cf986aadc..cfd4959b218c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -583,7 +583,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 		goto fail_quota_locks;
 
 	error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-				 al->al_rgd->rd_length +
+				 dip->i_rgd->rd_length +
				 2 * RES_DINODE +
				 RES_STATFS + RES_QUOTA, 0);
 	if (error)
@@ -613,8 +613,7 @@ fail_end_trans:
 	gfs2_trans_end(sdp);
 
 fail_ipreserv:
-	if (dip->i_alloc->al_rgd)
-		gfs2_inplace_release(dip);
+	gfs2_inplace_release(dip);
 
 fail_quota_locks:
 	gfs2_quota_unlock(dip);
@@ -624,31 +623,29 @@ fail:
 	return error;
 }
 
-static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
-			      const struct qstr *qstr)
+int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		    void *fs_info)
 {
-	int err;
-	size_t len;
-	void *value;
-	char *name;
-
-	err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
-					   &name, &value, &len);
-
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			return 0;
-		return err;
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = __gfs2_xattr_set(inode, xattr->name, xattr->value,
+				       xattr->value_len, 0,
+				       GFS2_EATYPE_SECURITY);
+		if (err < 0)
+			break;
 	}
-
-	err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0,
-			       GFS2_EATYPE_SECURITY);
-	kfree(value);
-	kfree(name);
-
 	return err;
 }
 
+static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
+			      const struct qstr *qstr)
+{
+	return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
+					    &gfs2_initxattrs, NULL);
+}
+
 /**
 * gfs2_create_inode - Create a new inode
 * @dir: The parent directory
@@ -663,7 +660,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
 
 static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 			     unsigned int mode, dev_t dev, const char *symname,
-			     unsigned int size)
+			     unsigned int size, int excl)
 {
 	const struct qstr *name = &dentry->d_name;
 	struct gfs2_holder ghs[2];
@@ -683,6 +680,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		goto fail;
 
 	error = create_ok(dip, name, mode);
+	if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
+		inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+		gfs2_glock_dq_uninit(ghs);
+		d_instantiate(dentry, inode);
+		return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+	}
 	if (error)
 		goto fail_gunlock;
 
@@ -725,21 +728,22 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	brelse(bh);
 
 	gfs2_trans_end(sdp);
-	if (dip->i_alloc->al_rgd)
-		gfs2_inplace_release(dip);
+	gfs2_inplace_release(dip);
 	gfs2_quota_unlock(dip);
 	gfs2_alloc_put(dip);
-	gfs2_glock_dq_uninit_m(2, ghs);
 	mark_inode_dirty(inode);
+	gfs2_glock_dq_uninit_m(2, ghs);
 	d_instantiate(dentry, inode);
 	return 0;
 
 fail_gunlock2:
 	gfs2_glock_dq_uninit(ghs + 1);
-	if (inode && !IS_ERR(inode))
-		iput(inode);
 fail_gunlock:
 	gfs2_glock_dq_uninit(ghs);
+	if (inode && !IS_ERR(inode)) {
+		set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags);
+		iput(inode);
+	}
 fail:
 	if (bh)
 		brelse(bh);
@@ -758,24 +762,10 @@ fail:
 static int gfs2_create(struct inode *dir, struct dentry *dentry,
 		       int mode, struct nameidata *nd)
 {
-	struct inode *inode;
-	int ret;
-
-	for (;;) {
-		ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0);
-		if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL)))
-			return ret;
-
-		inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-		if (inode) {
-			if (!IS_ERR(inode))
-				break;
-			return PTR_ERR(inode);
-		}
-	}
-
-	d_instantiate(dentry, inode);
-	return 0;
+	int excl = 0;
+	if (nd && (nd->flags & LOOKUP_EXCL))
+		excl = 1;
+	return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
 }
 
 /**
@@ -902,7 +892,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-				 gfs2_rg_blocks(al) +
+				 gfs2_rg_blocks(dip) +
				 2 * RES_DINODE + RES_STATFS +
				 RES_QUOTA, 0);
 	if (error)
@@ -924,8 +914,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	inc_nlink(&ip->i_inode);
 	ip->i_inode.i_ctime = CURRENT_TIME;
-	gfs2_dinode_out(ip, dibh->b_data);
-	mark_inode_dirty(&ip->i_inode);
+	ihold(inode);
+	d_instantiate(dentry, inode);
+	mark_inode_dirty(inode);
 
 out_brelse:
 	brelse(dibh);
@@ -947,11 +938,6 @@ out_child:
 out_parent:
 	gfs2_holder_uninit(ghs);
 	gfs2_holder_uninit(ghs + 1);
-	if (!error) {
-		ihold(inode);
-		d_instantiate(dentry, inode);
-		mark_inode_dirty(inode);
-	}
 	return error;
 }
 
@@ -1024,8 +1010,6 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip,
 		clear_nlink(inode);
 	else
 		drop_nlink(inode);
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-	gfs2_dinode_out(ip, bh->b_data);
 	mark_inode_dirty(inode);
 	if (inode->i_nlink == 0)
 		gfs2_unlink_di(inode);
@@ -1053,13 +1037,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 	struct buffer_head *bh;
 	struct gfs2_holder ghs[3];
 	struct gfs2_rgrpd *rgd;
-	struct gfs2_holder ri_gh;
 	int error;
 
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		return error;
-
 	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
 
@@ -1116,7 +1095,6 @@ out_child:
 	gfs2_glock_dq(ghs);
 out_parent:
 	gfs2_holder_uninit(ghs);
-	gfs2_glock_dq_uninit(&ri_gh);
 	return error;
 }
 
@@ -1139,7 +1117,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
 		return -ENAMETOOLONG;
 
-	return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size);
+	return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
 }
 
 /**
@@ -1153,7 +1131,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 
 static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0);
+	return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0);
 }
 
 /**
@@ -1168,7 +1146,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
 		      dev_t dev)
 {
-	return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0);
+	return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
 }
 
 /*
@@ -1234,7 +1212,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
 	struct gfs2_inode *nip = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
 	struct gfs2_rgrpd *nrgd;
 	unsigned int num_gh;
 	int dir_rename = 0;
@@ -1248,10 +1226,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		return 0;
 	}
 
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		return error;
-
 	if (odip != ndip) {
 		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
 					   0, &r_gh);
@@ -1388,12 +1362,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 
 	al->al_requested = sdp->sd_max_dirres;
 
-	error = gfs2_inplace_reserve_ri(ndip);
+	error = gfs2_inplace_reserve(ndip);
 	if (error)
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-				 gfs2_rg_blocks(al) +
+				 gfs2_rg_blocks(ndip) +
				 4 * RES_DINODE + 4 * RES_LEAF +
				 RES_STATFS + RES_QUOTA + 4, 0);
 	if (error)
@@ -1459,7 +1433,6 @@ out_gunlock_r:
 	if (r_gh.gh_gl)
 		gfs2_glock_dq_uninit(&r_gh);
 out:
-	gfs2_glock_dq_uninit(&ri_gh);
 	return error;
 }
 
@@ -1563,21 +1536,10 @@ int gfs2_permission(struct inode *inode, int mask)
 	return error;
 }
 
-static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
 {
-	struct inode *inode = &ip->i_inode;
-	struct buffer_head *dibh;
-	int error;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		return error;
-
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
 	return 0;
 }
 
@@ -1589,19 +1551,19 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 * Returns: errno
 */
 
-int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+int gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
 {
 	int error;
 
 	if (current->journal_info)
-		return __gfs2_setattr_simple(ip, attr);
+		return __gfs2_setattr_simple(inode, attr);
 
-	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
+	error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0);
 	if (error)
 		return error;
 
-	error = __gfs2_setattr_simple(ip, attr);
-	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+	error = __gfs2_setattr_simple(inode, attr);
+	gfs2_trans_end(GFS2_SB(inode));
 	return error;
 }
 
@@ -1639,7 +1601,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (error)
 		goto out_gunlock_q;
 
-	error = gfs2_setattr_simple(ip, attr);
+	error = gfs2_setattr_simple(inode, attr);
 	if (error)
 		goto out_end_trans;
 
@@ -1695,12 +1657,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
 		error = gfs2_acl_chmod(ip, attr);
 	else
-		error = gfs2_setattr_simple(ip, attr);
+		error = gfs2_setattr_simple(inode, attr);
 
 out:
-	gfs2_glock_dq_uninit(&i_gh);
 	if (!error)
 		mark_inode_dirty(inode);
+	gfs2_glock_dq_uninit(&i_gh);
 	return error;
 }
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 8d90e0c07672..276e7b52b658 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,7 +109,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip);
 extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 				  int is_root);
 extern int gfs2_permission(struct inode *inode, int mask);
-extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
+extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 85c62923ee29..598646434362 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -624,9 +624,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-		submit_bh(WRITE_SYNC | REQ_META, bh);
+		submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
 	else
-		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
+		submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh);
 	wait_on_buffer(bh);
 
 	if (!buffer_uptodate(bh))
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 05bbb124699f..0301be655b12 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -60,6 +60,29 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	trace_gfs2_pin(bd, 1);
 }
 
+static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
+{
+	return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP;
+}
+
+static void maybe_release_space(struct gfs2_bufdata *bd)
+{
+	struct gfs2_glock *gl = bd->bd_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_rgrpd *rgd = gl->gl_object;
+	unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
+	struct gfs2_bitmap *bi = rgd->rd_bits + index;
+
+	if (bi->bi_clone == 0)
+		return;
+	if (sdp->sd_args.ar_discard)
+		gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi);
+	memcpy(bi->bi_clone + bi->bi_offset,
+	       bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
+	clear_bit(GBF_FULL, &bi->bi_flags);
+	rgd->rd_free_clone = rgd->rd_free;
+}
+
 /**
  * gfs2_unpin - Unpin a buffer
  * @sdp: the filesystem the buffer belongs to
@@ -81,6 +104,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	mark_buffer_dirty(bh);
 	clear_buffer_pinned(bh);
 
+	if (buffer_is_rgrp(bd))
+		maybe_release_space(bd);
+
 	spin_lock(&sdp->sd_ail_lock);
 	if (bd->bd_ail) {
 		list_del(&bd->bd_ail_st_list);
@@ -469,42 +495,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
 	gfs2_revoke_clean(sdp);
 }
 
-static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
-{
-	struct gfs2_rgrpd *rgd;
-	struct gfs2_trans *tr = current->journal_info;
-
-	tr->tr_touched = 1;
-
-	rgd = container_of(le, struct gfs2_rgrpd, rd_le);
-
-	gfs2_log_lock(sdp);
-	if (!list_empty(&le->le_list)){
-		gfs2_log_unlock(sdp);
-		return;
-	}
-	gfs2_rgrp_bh_hold(rgd);
-	sdp->sd_log_num_rg++;
-	list_add(&le->le_list, &sdp->sd_log_le_rg);
-	gfs2_log_unlock(sdp);
-}
-
-static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
-{
-	struct list_head *head = &sdp->sd_log_le_rg;
-	struct gfs2_rgrpd *rgd;
-
-	while (!list_empty(head)) {
-		rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
-		list_del_init(&rgd->rd_le.le_list);
-		sdp->sd_log_num_rg--;
-
-		gfs2_rgrp_repolish_clones(rgd);
-		gfs2_rgrp_bh_put(rgd);
-	}
-	gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
-}
-
 /**
  * databuf_lo_add - Add a databuf to the transaction.
  *
@@ -705,8 +695,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
 
 		brelse(bh_log);
 		brelse(bh_ip);
-		if (error)
-			break;
 
 		sdp->sd_replayed_blocks++;
 	}
@@ -771,8 +759,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
 };
 
 const struct gfs2_log_operations gfs2_rg_lops = {
-	.lo_add = rg_lo_add,
-	.lo_after_commit = rg_lo_after_commit,
 	.lo_name = "rg",
 };
 
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 747238cd9f96..be29858900f6 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 {
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
-	int write_op = REQ_META |
+	int write_op = REQ_META | REQ_PRIO |
 		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
 	BUG_ON(!PageLocked(page));
@@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	}
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(READ_SYNC | REQ_META, bh);
+	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
 	if (!(flags & DIO_WAIT))
 		return 0;
 
@@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	if (buffer_uptodate(first_bh))
 		goto out;
 	if (!buffer_locked(first_bh))
-		ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
+		ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh);
 
 	dblock++;
 	extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3bc073a4cf82..7e823bbd2453 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -77,8 +77,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	spin_lock_init(&sdp->sd_rindex_spin);
 	mutex_init(&sdp->sd_rindex_mutex);
-	INIT_LIST_HEAD(&sdp->sd_rindex_list);
-	INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
+	sdp->sd_rindex_tree.rb_node = NULL;
 
 	INIT_LIST_HEAD(&sdp->sd_jindex_list);
 	spin_lock_init(&sdp->sd_jindex_spin);
@@ -224,7 +223,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
 
 	bio->bi_end_io = end_bio_io_page;
 	bio->bi_private = page;
-	submit_bio(READ_SYNC | REQ_META, bio);
+	submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio);
 	wait_on_page_locked(page);
 	bio_put(bio);
 	if (!PageUptodate(page)) {
@@ -652,7 +651,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 		fs_err(sdp, "can't lookup journal index: %d\n", error);
 		return PTR_ERR(sdp->sd_jindex);
 	}
-	ip = GFS2_I(sdp->sd_jindex);
 
 	/* Load in the journal index special file */
 
@@ -764,7 +762,6 @@ fail:
 static int init_inodes(struct gfs2_sbd *sdp, int undo)
 {
 	int error = 0;
-	struct gfs2_inode *ip;
 	struct inode *master = sdp->sd_master_dir->d_inode;
 
 	if (undo)
@@ -789,7 +786,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 		fs_err(sdp, "can't get resource index inode: %d\n", error);
 		goto fail_statfs;
 	}
-	ip = GFS2_I(sdp->sd_rindex);
 	sdp->sd_rindex_uptodate = 0;
 
 	/* Read in the quota inode */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 42e8d23bc047..7e528dc14f85 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -638,15 +638,18 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	unsigned long index = loc >> PAGE_CACHE_SHIFT;
 	unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
 	unsigned blocksize, iblock, pos;
-	struct buffer_head *bh, *dibh;
+	struct buffer_head *bh;
 	struct page *page;
 	void *kaddr, *ptr;
 	struct gfs2_quota q, *qp;
 	int err, nbytes;
 	u64 size;
 
-	if (gfs2_is_stuffed(ip))
-		gfs2_unstuff_dinode(ip, NULL);
+	if (gfs2_is_stuffed(ip)) {
+		err = gfs2_unstuff_dinode(ip, NULL);
+		if (err)
+			return err;
+	}
 
 	memset(&q, 0, sizeof(struct gfs2_quota));
 	err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
@@ -709,7 +712,7 @@ get_a_page:
 		set_buffer_uptodate(bh);
 
 	if (!buffer_uptodate(bh)) {
-		ll_rw_block(READ_META, 1, &bh);
+		ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh))
 			goto unlock_out;
@@ -736,22 +739,13 @@ get_a_page:
 		goto get_a_page;
 	}
 
-	/* Update the disk inode timestamp and size (if extended) */
-	err = gfs2_meta_inode_buffer(ip, &dibh);
-	if (err)
-		goto out;
-
 	size = loc + sizeof(struct gfs2_quota);
 	if (size > inode->i_size)
 		i_size_write(inode, size);
 	inode->i_mtime = inode->i_atime = CURRENT_TIME;
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
 	mark_inode_dirty(inode);
-
-out:
 	return err;
+
 unlock_out:
 	unlock_page(page);
 	page_cache_release(page);
@@ -822,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		goto out_alloc;
 
 	if (nalloc)
-		blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
+		blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS;
 
 	error = gfs2_trans_begin(sdp, blocks, 0);
 	if (error)
@@ -936,7 +930,9 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
 	unsigned int x;
 	int error = 0;
 
-	gfs2_quota_hold(ip, uid, gid);
+	error = gfs2_quota_hold(ip, uid, gid);
+	if (error)
+		return error;
 
 	if (capable(CAP_SYS_RESOURCE) ||
 	    sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
@@ -1607,7 +1603,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 		error = gfs2_inplace_reserve(ip);
 		if (error)
 			goto out_alloc;
-		blocks += gfs2_rg_blocks(al);
+		blocks += gfs2_rg_blocks(ip);
 	}
 
 	/* Some quotas span block boundaries and can update two blocks,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7f8af1eb02de..96bd6d759f29 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -15,6 +15,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/prefetch.h>
 #include <linux/blkdev.h>
+#include <linux/rbtree.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -328,18 +329,22 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
 
 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
 {
-	struct gfs2_rgrpd *rgd;
+	struct rb_node **newn;
+	struct gfs2_rgrpd *cur;
 
 	spin_lock(&sdp->sd_rindex_spin);
-
-	list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
-		if (rgrp_contains_block(rgd, blk)) {
-			list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+	newn = &sdp->sd_rindex_tree.rb_node;
+	while (*newn) {
+		cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node);
+		if (blk < cur->rd_addr)
+			newn = &((*newn)->rb_left);
+		else if (blk >= cur->rd_data0 + cur->rd_data)
+			newn = &((*newn)->rb_right);
+		else {
 			spin_unlock(&sdp->sd_rindex_spin);
-			return rgd;
+			return cur;
 		}
 	}
-
 	spin_unlock(&sdp->sd_rindex_spin);
 
 	return NULL;
@@ -354,8 +359,15 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
 
 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
 {
-	gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
-	return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
+	const struct rb_node *n;
+	struct gfs2_rgrpd *rgd;
+
+	spin_lock(&sdp->sd_rindex_spin);
+	n = rb_first(&sdp->sd_rindex_tree);
+	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	return rgd;
 }
 
 /**
@@ -367,47 +379,60 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
 
 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
 {
-	if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	const struct rb_node *n;
+
+	spin_lock(&sdp->sd_rindex_spin);
+	n = rb_next(&rgd->rd_node);
+	if (n == NULL)
+		n = rb_first(&sdp->sd_rindex_tree);
+
+	if (unlikely(&rgd->rd_node == n)) {
+		spin_unlock(&sdp->sd_rindex_spin);
 		return NULL;
-	return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
+	}
+	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
+	spin_unlock(&sdp->sd_rindex_spin);
+	return rgd;
 }
 
-static void clear_rgrpdi(struct gfs2_sbd *sdp)
+void gfs2_free_clones(struct gfs2_rgrpd *rgd)
 {
-	struct list_head *head;
+	int x;
+
+	for (x = 0; x < rgd->rd_length; x++) {
+		struct gfs2_bitmap *bi = rgd->rd_bits + x;
+		kfree(bi->bi_clone);
+		bi->bi_clone = NULL;
+	}
+}
+
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
+{
+	struct rb_node *n;
 	struct gfs2_rgrpd *rgd;
 	struct gfs2_glock *gl;
 
-	spin_lock(&sdp->sd_rindex_spin);
-	sdp->sd_rindex_forward = NULL;
-	spin_unlock(&sdp->sd_rindex_spin);
-
-	head = &sdp->sd_rindex_list;
-	while (!list_empty(head)) {
-		rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
+	while ((n = rb_first(&sdp->sd_rindex_tree))) {
+		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
 		gl = rgd->rd_gl;
 
-		list_del(&rgd->rd_list);
-		list_del(&rgd->rd_list_mru);
+		rb_erase(n, &sdp->sd_rindex_tree);
 
 		if (gl) {
+			spin_lock(&gl->gl_spin);
 			gl->gl_object = NULL;
+			spin_unlock(&gl->gl_spin);
 			gfs2_glock_add_to_lru(gl);
 			gfs2_glock_put(gl);
 		}
 
+		gfs2_free_clones(rgd);
 		kfree(rgd->rd_bits);
 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
 	}
 }
 
-void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
-{
-	mutex_lock(&sdp->sd_rindex_mutex);
-	clear_rgrpdi(sdp);
-	mutex_unlock(&sdp->sd_rindex_mutex);
-}
-
 static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
 {
 	printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
@@ -524,22 +549,34 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
 	return total_data;
 }
 
-static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf)
+static void rgd_insert(struct gfs2_rgrpd *rgd)
 {
-	const struct gfs2_rindex *str = buf;
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*newn) {
+		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
+						  rd_node);
+
+		parent = *newn;
+		if (rgd->rd_addr < cur->rd_addr)
+			newn = &((*newn)->rb_left);
+		else if (rgd->rd_addr > cur->rd_addr)
+			newn = &((*newn)->rb_right);
+		else
+			return;
+	}
 
-	rgd->rd_addr = be64_to_cpu(str->ri_addr);
-	rgd->rd_length = be32_to_cpu(str->ri_length);
-	rgd->rd_data0 = be64_to_cpu(str->ri_data0);
-	rgd->rd_data = be32_to_cpu(str->ri_data);
-	rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes);
+	rb_link_node(&rgd->rd_node, parent, newn);
+	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
 }
 
 /**
  * read_rindex_entry - Pull in a new resource index entry from the disk
  * @gl: The glock covering the rindex inode
  *
- * Returns: 0 on success, error code otherwise
+ * Returns: 0 on success, > 0 on EOF, error code otherwise
  */
 
 static int read_rindex_entry(struct gfs2_inode *ip,
@@ -547,44 +584,53 @@ static int read_rindex_entry(struct gfs2_inode *ip,
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
-	char buf[sizeof(struct gfs2_rindex)];
+	struct gfs2_rindex buf;
 	int error;
 	struct gfs2_rgrpd *rgd;
 
-	error = gfs2_internal_read(ip, ra_state, buf, &pos,
+	if (pos >= i_size_read(&ip->i_inode))
+		return 1;
+
+	error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos,
 			   sizeof(struct gfs2_rindex));
-	if (!error)
-		return 0;
-	if (error != sizeof(struct gfs2_rindex)) {
-		if (error > 0)
-			error = -EIO;
-		return error;
-	}
+
+	if (error != sizeof(struct gfs2_rindex))
+		return (error == 0) ? 1 : error;
 
 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
 	error = -ENOMEM;
 	if (!rgd)
 		return error;
 
-	mutex_init(&rgd->rd_mutex);
-	lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
 	rgd->rd_sbd = sdp;
+	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
+	rgd->rd_length = be32_to_cpu(buf.ri_length);
+	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
+	rgd->rd_data = be32_to_cpu(buf.ri_data);
+	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
 
-	list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
-	list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
-
-	gfs2_rindex_in(rgd, buf);
 	error = compute_bitstructs(rgd);
 	if (error)
-		return error;
+		goto fail;
 
 	error = gfs2_glock_get(sdp, rgd->rd_addr,
 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
 	if (error)
-		return error;
+		goto fail;
 
 	rgd->rd_gl->gl_object = rgd;
 	rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+	if (rgd->rd_data > sdp->sd_max_rg_data)
+		sdp->sd_max_rg_data = rgd->rd_data;
+	spin_lock(&sdp->sd_rindex_spin);
+	rgd_insert(rgd);
+	sdp->sd_rgrps++;
+	spin_unlock(&sdp->sd_rindex_spin);
+	return error;
+
+fail:
+	kfree(rgd->rd_bits);
+	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
 	return error;
 }
 
@@ -595,40 +641,28 @@ static int read_rindex_entry(struct gfs2_inode *ip,
  * Returns: 0 on successful update, error code otherwise
  */
 
-int gfs2_ri_update(struct gfs2_inode *ip)
+static int gfs2_ri_update(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
-	u64 rgrp_count = i_size_read(inode);
-	struct gfs2_rgrpd *rgd;
-	unsigned int max_data = 0;
 	int error;
 
-	do_div(rgrp_count, sizeof(struct gfs2_rindex));
-	clear_rgrpdi(sdp);
-
 	file_ra_state_init(&ra_state, inode->i_mapping);
-	for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) {
+	do {
 		error = read_rindex_entry(ip, &ra_state);
-		if (error) {
-			clear_rgrpdi(sdp);
-			return error;
-		}
-	}
+	} while (error == 0);
+
+	if (error < 0)
+		return error;
 
-	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
-		if (rgd->rd_data > max_data)
-			max_data = rgd->rd_data;
-	sdp->sd_max_rg_data = max_data;
 	sdp->sd_rindex_uptodate = 1;
 	return 0;
 }
 
 /**
- * gfs2_rindex_hold - Grab a lock on the rindex
+ * gfs2_rindex_update - Update the rindex if required
  * @sdp: The GFS2 superblock
- * @ri_gh: the glock holder
  *
  * We grab a lock on the rindex inode to make sure that it doesn't
  * change whilst we are performing an operation. We keep this lock
@@ -640,30 +674,29 @@ int gfs2_ri_update(struct gfs2_inode *ip)
  * special file, which might have been updated if someone expanded the
  * filesystem (via gfs2_grow utility), which adds new resource groups.
  *
- * Returns: 0 on success, error code otherwise
+ * Returns: 0 on succeess, error code otherwise
  */
 
-int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
+int gfs2_rindex_update(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
 	struct gfs2_glock *gl = ip->i_gl;
-	int error;
-
-	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
-	if (error)
-		return error;
+	struct gfs2_holder ri_gh;
+	int error = 0;
 
 	/* Read new copy from disk if we don't have the latest */
 	if (!sdp->sd_rindex_uptodate) {
 		mutex_lock(&sdp->sd_rindex_mutex);
-		if (!sdp->sd_rindex_uptodate) {
+		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
+		if (error)
+			return error;
+		if (!sdp->sd_rindex_uptodate)
 			error = gfs2_ri_update(ip);
-			if (error)
-				gfs2_glock_dq_uninit(ri_gh);
-		}
+		gfs2_glock_dq_uninit(&ri_gh);
 		mutex_unlock(&sdp->sd_rindex_mutex);
 	}
 
+
 	return error;
 }
 
@@ -694,7 +727,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 }
 
 /**
- * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
+ * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps
  * @rgd: the struct gfs2_rgrpd describing the RG to read in
  *
  * Read in all of a Resource Group's header and bitmap blocks.
@@ -703,8 +736,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
  * Returns: errno
  */
 
-int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
 {
+	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	struct gfs2_glock *gl = rgd->rd_gl;
 	unsigned int length = rgd->rd_length;
@@ -712,17 +746,6 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 	unsigned int x, y;
 	int error;
 
-	mutex_lock(&rgd->rd_mutex);
-
-	spin_lock(&sdp->sd_rindex_spin);
-	if (rgd->rd_bh_count) {
-		rgd->rd_bh_count++;
-		spin_unlock(&sdp->sd_rindex_spin);
-		mutex_unlock(&rgd->rd_mutex);
-		return 0;
-	}
-	spin_unlock(&sdp->sd_rindex_spin);
-
 	for (x = 0; x < length; x++) {
 		bi = rgd->rd_bits + x;
 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
@@ -747,15 +770,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
+		rgd->rd_free_clone = rgd->rd_free;
 	}
 
-	spin_lock(&sdp->sd_rindex_spin);
-	rgd->rd_free_clone = rgd->rd_free;
-	rgd->rd_bh_count++;
-	spin_unlock(&sdp->sd_rindex_spin);
-
-	mutex_unlock(&rgd->rd_mutex);
-
 	return 0;
 
 fail:
@@ -765,52 +782,32 @@ fail:
 		bi->bi_bh = NULL;
 		gfs2_assert_warn(sdp, !bi->bi_clone);
 	}
-	mutex_unlock(&rgd->rd_mutex);
 
 	return error;
 }
 
-void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
-{
-	struct gfs2_sbd *sdp = rgd->rd_sbd;
-
-	spin_lock(&sdp->sd_rindex_spin);
-	gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
-	rgd->rd_bh_count++;
-	spin_unlock(&sdp->sd_rindex_spin);
-}
-
 /**
- * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get()
  * @rgd: the struct gfs2_rgrpd describing the RG to read in
  *
  */
 
-void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
+void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
 {
-	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
 	int x, length = rgd->rd_length;
 
-	spin_lock(&sdp->sd_rindex_spin);
-	gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
-	if (--rgd->rd_bh_count) {
-		spin_unlock(&sdp->sd_rindex_spin);
-		return;
-	}
-
 	for (x = 0; x < length; x++) {
 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
-		kfree(bi->bi_clone);
-		bi->bi_clone = NULL;
 		brelse(bi->bi_bh);
 		bi->bi_bh = NULL;
 	}
 
-	spin_unlock(&sdp->sd_rindex_spin);
 }
 
-static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
-				    const struct gfs2_bitmap *bi)
+void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+			     struct buffer_head *bh,
+			     const struct gfs2_bitmap *bi)
 {
 	struct super_block *sb = sdp->sd_vfs;
 	struct block_device *bdev = sb->s_bdev;
@@ -823,7 +820,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 	unsigned int x;
 
 	for (x = 0; x < bi->bi_len; x++) {
-		const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x;
+		const u8 *orig = bh->b_data + bi->bi_offset + x;
 		const u8 *clone = bi->bi_clone + bi->bi_offset + x;
 		u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
 		diff &= 0x55;
@@ -862,28 +859,6 @@ fail:
 		sdp->sd_args.ar_discard = 0;
 }
 
-void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
-{
-	struct gfs2_sbd *sdp = rgd->rd_sbd;
-	unsigned int length = rgd->rd_length;
-	unsigned int x;
-
-	for (x = 0; x < length; x++) {
-		struct gfs2_bitmap *bi = rgd->rd_bits + x;
-		if (!bi->bi_clone)
-			continue;
-		if (sdp->sd_args.ar_discard)
-			gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
-		clear_bit(GBF_FULL, &bi->bi_flags);
-		memcpy(bi->bi_clone + bi->bi_offset,
-		       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
-	}
-
-	spin_lock(&sdp->sd_rindex_spin);
-	rgd->rd_free_clone = rgd->rd_free;
-	spin_unlock(&sdp->sd_rindex_spin);
-}
-
 /**
  * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
  * @ip: the incore GFS2 inode structure
@@ -893,38 +868,35 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 
 struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
 {
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int error;
 	BUG_ON(ip->i_alloc != NULL);
 	ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS);
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		fs_warn(sdp, "rindex update returns %d\n", error);
 	return ip->i_alloc;
 }
 
 /**
  * try_rgrp_fit - See if a given reservation will fit in a given RG
  * @rgd: the RG data
- * @al: the struct gfs2_alloc structure describing the reservation
+ * @ip: the inode
  *
  * If there's room for the requested blocks to be allocated from the RG:
- * Sets the $al_rgd field in @al.
  *
  * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
  */
 
-static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
+static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
 {
-	struct gfs2_sbd *sdp = rgd->rd_sbd;
-	int ret = 0;
+	const struct gfs2_alloc *al = ip->i_alloc;
 
 	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
 		return 0;
-
-	spin_lock(&sdp->sd_rindex_spin);
-	if (rgd->rd_free_clone >= al->al_requested) {
-		al->al_rgd = rgd;
-		ret = 1;
-	}
-	spin_unlock(&sdp->sd_rindex_spin);
-
-	return ret;
+	if (rgd->rd_free_clone >= al->al_requested)
+		return 1;
+	return 0;
 }
 
 /**
@@ -992,76 +964,6 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 }
 
 /**
- * recent_rgrp_next - get next RG from "recent" list
- * @cur_rgd: current rgrp
- *
- * Returns: The next rgrp in the recent list
- */
-
-static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
-{
-	struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
-	struct list_head *head;
-	struct gfs2_rgrpd *rgd;
-
-	spin_lock(&sdp->sd_rindex_spin);
-	head = &sdp->sd_rindex_mru_list;
-	if (unlikely(cur_rgd->rd_list_mru.next == head)) {
-		spin_unlock(&sdp->sd_rindex_spin);
-		return NULL;
-	}
-	rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
-	spin_unlock(&sdp->sd_rindex_spin);
-	return rgd;
-}
-
-/**
- * forward_rgrp_get - get an rgrp to try next from full list
- * @sdp: The GFS2 superblock
- *
- * Returns: The rgrp to try next
- */
-
-static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
-{
-	struct gfs2_rgrpd *rgd;
-	unsigned int journals = gfs2_jindex_size(sdp);
-	unsigned int rg = 0, x;
-
-	spin_lock(&sdp->sd_rindex_spin);
-
-	rgd = sdp->sd_rindex_forward;
-	if (!rgd) {
-		if (sdp->sd_rgrps >= journals)
-			rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
-
-		for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
-		     x++, rgd = gfs2_rgrpd_get_next(rgd))
-			/* Do Nothing */;
-
-		sdp->sd_rindex_forward = rgd;
-	}
-
-	spin_unlock(&sdp->sd_rindex_spin);
-
-	return rgd;
-}
-
-/**
- * forward_rgrp_set - set the forward rgrp pointer
- * @sdp: the filesystem
- * @rgd: The new forward rgrp
- *
- */
-
-static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
-{
-	spin_lock(&sdp->sd_rindex_spin);
-	sdp->sd_rindex_forward = rgd;
-	spin_unlock(&sdp->sd_rindex_spin);
-}
-
-/**
  * get_local_rgrp - Choose and lock a rgrp for allocation
  * @ip: the inode to reserve space for
  * @rgp: the chosen and locked rgrp
@@ -1076,14 +978,18 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd, *begin = NULL;
 	struct gfs2_alloc *al = ip->i_alloc;
-	int flags = LM_FLAG_TRY;
-	int skipped = 0;
-	int loops = 0;
 	int error, rg_locked;
+	int loops = 0;
+
+	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
+		rgd = begin = ip->i_rgd;
+	else
+		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal);
 
-	rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
+	if (rgd == NULL)
+		return -EBADSLT;
 
-	while (rgd) {
+	while (loops < 3) {
 		rg_locked = 0;
 
 		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
@@ -1095,92 +1001,36 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		}
 		switch (error) {
 		case 0:
-			if (try_rgrp_fit(rgd, al))
-				goto out;
+			if (try_rgrp_fit(rgd, ip)) {
+				ip->i_rgd = rgd;
+				return 0;
+			}
 			if (rgd->rd_flags & GFS2_RDF_CHECK)
 				try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
 			/* fall through */
 		case GLR_TRYFAILED:
-			rgd = recent_rgrp_next(rgd);
-			break;
-
-		default:
-			return error;
-		}
-	}
-
-	/* Go through full list of rgrps */
-
-	begin = rgd = forward_rgrp_get(sdp);
-
-	for (;;) {
-		rg_locked = 0;
-
-		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
-			rg_locked = 1;
-			error = 0;
-		} else {
-			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
-						   &al->al_rgd_gh);
-		}
-		switch (error) {
-		case 0:
-			if (try_rgrp_fit(rgd, al))
-				goto out;
-			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
-			if (!rg_locked)
-				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			break;
-
-		case GLR_TRYFAILED:
-			skipped++;
+			rgd = gfs2_rgrpd_get_next(rgd);
+			if (rgd == begin)
+				loops++;
 			break;
-
 		default:
 			return error;
 		}
-
-		rgd = gfs2_rgrpd_get_next(rgd);
-		if (!rgd)
-			rgd = gfs2_rgrpd_get_first(sdp);
-
-		if (rgd == begin) {
-			if (++loops >= 3)
-				return -ENOSPC;
-			if (!skipped)
-				loops++;
-			flags = 0;
-			if (loops == 2)
-				gfs2_log_flush(sdp, NULL);
-		}
 	}
 
-out:
-	if (begin) {
-		spin_lock(&sdp->sd_rindex_spin);
-		list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
-		spin_unlock(&sdp->sd_rindex_spin);
-		rgd = gfs2_rgrpd_get_next(rgd);
-		if (!rgd)
-			rgd = gfs2_rgrpd_get_first(sdp);
-		forward_rgrp_set(sdp, rgd);
-	}
-
-	return 0;
+	return -ENOSPC;
 }
 
 /**
- * gfs2_inplace_reserve_i - Reserve space in the filesystem
+ * gfs2_inplace_reserve - Reserve space in the filesystem
  * @ip: the inode to reserve space for
  *
  * Returns: errno
 */
 
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
-			   char *file, unsigned int line)
+int gfs2_inplace_reserve(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1191,45 +1041,22 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 	if (gfs2_assert_warn(sdp, al->al_requested))
 		return -EINVAL;
 
-	if (hold_rindex) {
-		/* We need to hold the rindex unless the inode we're using is
-		   the rindex itself, in which case it's already held. */
-		if (ip != GFS2_I(sdp->sd_rindex))
-			error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-		else if (!sdp->sd_rgrps) /* We may not have the rindex read
-					    in, so: */
-			error = gfs2_ri_update(ip);
-		if (error)
-			return error;
-	}
-
-try_again:
 	do {
 		error = get_local_rgrp(ip, &last_unlinked);
-		/* If there is no space, flushing the log may release some */
-		if (error) {
-			if (ip == GFS2_I(sdp->sd_rindex) &&
-			    !sdp->sd_rindex_uptodate) {
+		if (error != -ENOSPC)
+			break;
+		/* Check that fs hasn't grown if writing to rindex */
+		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
 			error = gfs2_ri_update(ip);
 			if (error)
-				return error;
-				goto try_again;
-			}
-			gfs2_log_flush(sdp, NULL);
+				break;
+			continue;
 		}
-	} while (error && tries++ < 3);
-
-	if (error) {
-		if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
-			gfs2_glock_dq_uninit(&al->al_ri_gh);
-		return error;
-	}
-
-	/* no error, so we have the rgrp set in the inode's allocation. */
-	al->al_file = file;
-	al->al_line = line;
+		/* Flushing the log may release space */
+		gfs2_log_flush(sdp, NULL);
+	} while (tries++ < 3);
 
-	return 0;
+	return error;
 }
 
 /**
@@ -1241,20 +1068,10 @@ try_again:
 
 void gfs2_inplace_release(struct gfs2_inode *ip)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
 
-	if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
-		fs_warn(sdp, "al_alloced = %u, al_requested = %u "
-			"al_file = %s, al_line = %u\n",
-			al->al_alloced, al->al_requested, al->al_file,
-			al->al_line);
-
-	al->al_rgd = NULL;
 	if (al->al_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_rgd_gh);
-	if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
-		gfs2_glock_dq_uninit(&al->al_ri_gh);
 }
 
 /**
@@ -1352,6 +1169,7 @@ do_search:
 	/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
 	   bitmaps, so we must search the originals for that. */
 	buffer = bi->bi_bh->b_data + bi->bi_offset;
+	WARN_ON(!buffer_uptodate(bi->bi_bh));
 	if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
 		buffer = bi->bi_clone + bi->bi_offset;
 
@@ -1371,6 +1189,7 @@ skip:
 
 	if (blk == BFITNOENT)
 		return blk;
+
 	*n = 1;
 	if (old_state == new_state)
 		goto out;
@@ -1503,7 +1322,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	if (al == NULL)
 		return -ECANCELED;
 
-	rgd = al->al_rgd;
+	rgd = ip->i_rgd;
 
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
@@ -1518,7 +1337,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
-	ip->i_goal = block;
+	ip->i_goal = block + *n - 1;
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error == 0) {
 		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
@@ -1539,9 +1358,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	gfs2_statfs_change(sdp, 0, -(s64)*n, 0);
 	gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
 
-	spin_lock(&sdp->sd_rindex_spin);
 	rgd->rd_free_clone -= *n;
-	spin_unlock(&sdp->sd_rindex_spin);
 	trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED);
 	*bn = block;
 	return 0;
@@ -1564,7 +1381,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_alloc *al = dip->i_alloc;
-	struct gfs2_rgrpd *rgd = al->al_rgd;
+	struct gfs2_rgrpd *rgd = dip->i_rgd;
 	u32 blk;
 	u64 block;
 	unsigned int n = 1;
@@ -1594,9 +1411,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
 	gfs2_statfs_change(sdp, 0, -1, +1);
 	gfs2_trans_add_unrevoke(sdp, block, 1);
 
-	spin_lock(&sdp->sd_rindex_spin);
 	rgd->rd_free_clone--;
-	spin_unlock(&sdp->sd_rindex_spin);
 	trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
 	*bn = block;
 	return 0;
@@ -1629,8 +1444,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
-	gfs2_trans_add_rg(rgd);
-
 	/* Directories keep their data in the metadata address space */
 	if (meta || ip->i_depth)
 		gfs2_meta_wipe(ip, bstart, blen);
@@ -1666,7 +1479,6 @@ void gfs2_unlink_di(struct inode *inode)
 	trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED);
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
-	gfs2_trans_add_rg(rgd);
 }
 
 static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
@@ -1688,7 +1500,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	gfs2_statfs_change(sdp, 0, +1, -1);
-	gfs2_trans_add_rg(rgd);
 }
 
 
@@ -1714,41 +1525,33 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
 {
 	struct gfs2_rgrpd *rgd;
-	struct gfs2_holder ri_gh, rgd_gh;
-	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
-	int ri_locked = 0;
+	struct gfs2_holder rgd_gh;
 	int error;
 
-	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
-		error = gfs2_rindex_hold(sdp, &ri_gh);
-		if (error)
-			goto fail;
-		ri_locked = 1;
-	}
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
 
 	error = -EINVAL;
 	rgd = gfs2_blk2rgrpd(sdp, no_addr);
 	if (!rgd)
-		goto fail_rindex;
+		goto fail;
 
 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
 	if (error)
-		goto fail_rindex;
+		goto fail;
 
 	if (gfs2_get_block_type(rgd, no_addr) != type)
 		error = -ESTALE;
 
 	gfs2_glock_dq_uninit(&rgd_gh);
-fail_rindex:
-	if (ri_locked)
-		gfs2_glock_dq_uninit(&ri_gh);
 fail:
 	return error;
 }
 
 /**
  * gfs2_rlist_add - add a RG to a list of RGs
- * @sdp: the filesystem
+ * @ip: the inode
  * @rlist: the list of resource groups
  * @block: the block
  *
@@ -1758,9 +1561,10 @@ fail:
  *
  */
 
-void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
 		    u64 block)
 {
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd;
 	struct gfs2_rgrpd **tmp;
 	unsigned int new_space;
@@ -1769,12 +1573,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
 		return;
 
-	rgd = gfs2_blk2rgrpd(sdp, block);
+	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
+		rgd = ip->i_rgd;
+	else
+		rgd = gfs2_blk2rgrpd(sdp, block);
 	if (!rgd) {
-		if (gfs2_consist(sdp))
-			fs_err(sdp, "block = %llu\n", (unsigned long long)block);
+		fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
 		return;
 	}
+	ip->i_rgd = rgd;
 
 	for (x = 0; x < rlist->rl_rgrps; x++)
 		if (rlist->rl_rgd[x] == rgd)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index d253f9a8c70e..cf5c50180192 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -18,18 +18,15 @@ struct gfs2_holder;
 
 extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
 
-struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
-struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
-struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
+extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
 
 extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
-extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
-
-extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
-extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
-extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
-
-extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
+extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
+extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
+extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
 
 extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
 static inline void gfs2_alloc_put(struct gfs2_inode *ip)
@@ -39,16 +36,9 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 	ip->i_alloc = NULL;
 }
 
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
-				  char *file, unsigned int line);
-#define gfs2_inplace_reserve(ip) \
-	gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
-#define gfs2_inplace_reserve_ri(ip) \
-	gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
-
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip);
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
-extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
@@ -66,11 +56,14 @@ struct gfs2_rgrp_list {
 	struct gfs2_holder *rl_ghs;
 };
 
-extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
 			   u64 block);
 extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
 extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
 extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
 extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+				    struct buffer_head *bh,
+				    const struct gfs2_bitmap *bi);
 
 #endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b7beadd9ba4c..71e420989f77 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -752,51 +752,77 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
 	struct backing_dev_info *bdi = metamapping->backing_dev_info;
-	struct gfs2_holder gh;
+	int ret = 0;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+	if (bdi->dirty_exceeded)
+		gfs2_ail1_flush(sdp, wbc);
+	else
+		filemap_fdatawrite(metamapping);
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ret = filemap_fdatawait(metamapping);
+	if (ret)
+		mark_inode_dirty_sync(inode);
+	return ret;
+}
+
+/**
+ * gfs2_dirty_inode - check for atime updates
+ * @inode: The inode in question
+ * @flags: The type of dirty
+ *
+ * Unfortunately it can be called under any combination of inode
+ * glock and transaction lock, so we have to check carefully.
+ *
+ * At the moment this deals only with atime - it should be possible
+ * to expand that role in future, once a review of the locking has
+ * been carried out.
+ */
+
+static void gfs2_dirty_inode(struct inode *inode, int flags)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *bh;
-	struct timespec atime;
-	struct gfs2_dinode *di;
-	int ret = -EAGAIN;
-	int unlock_required = 0;
+	struct gfs2_holder gh;
+	int need_unlock = 0;
+	int need_endtrans = 0;
+	int ret;
 
-	/* Skip timestamp update, if this is from a memalloc */
-	if (current->flags & PF_MEMALLOC)
-		goto do_flush;
+	if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
+		return;
+
 	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
 		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-		if (ret)
-			goto do_flush;
-		unlock_required = 1;
+		if (ret) {
+			fs_err(sdp, "dirty_inode: glock %d\n", ret);
+			return;
+		}
+		need_unlock = 1;
 	}
-	ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
-	if (ret)
-		goto do_unlock;
+
+	if (current->journal_info == NULL) {
+		ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+		if (ret) {
+			fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret);
+			goto out;
+		}
+		need_endtrans = 1;
+	}
+
 	ret = gfs2_meta_inode_buffer(ip, &bh);
 	if (ret == 0) {
-		di = (struct gfs2_dinode *)bh->b_data;
-		atime.tv_sec = be64_to_cpu(di->di_atime);
-		atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
-		if (timespec_compare(&inode->i_atime, &atime) > 0) {
-			gfs2_trans_add_bh(ip->i_gl, bh, 1);
-			gfs2_dinode_out(ip, bh->b_data);
-		}
+		gfs2_trans_add_bh(ip->i_gl, bh, 1);
+		gfs2_dinode_out(ip, bh->b_data);
 		brelse(bh);
 	}
-	gfs2_trans_end(sdp);
-do_unlock:
-	if (unlock_required)
+
+	if (need_endtrans)
+		gfs2_trans_end(sdp);
+out:
+	if (need_unlock)
 		gfs2_glock_dq_uninit(&gh);
-do_flush:
-	if (wbc->sync_mode == WB_SYNC_ALL)
-		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
-	filemap_fdatawrite(metamapping);
-	if (bdi->dirty_exceeded)
-		gfs2_ail1_flush(sdp, wbc);
-	if (!ret && (wbc->sync_mode == WB_SYNC_ALL))
-		ret = filemap_fdatawait(metamapping);
-	if (ret)
-		mark_inode_dirty_sync(inode);
-	return ret;
 }
 
 /**
@@ -1011,7 +1037,6 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
1011 1037
1012static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) 1038static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
1013{ 1039{
1014 struct gfs2_holder ri_gh;
1015 struct gfs2_rgrpd *rgd_next; 1040 struct gfs2_rgrpd *rgd_next;
1016 struct gfs2_holder *gha, *gh; 1041 struct gfs2_holder *gha, *gh;
1017 unsigned int slots = 64; 1042 unsigned int slots = 64;
@@ -1024,10 +1049,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
1024 if (!gha) 1049 if (!gha)
1025 return -ENOMEM; 1050 return -ENOMEM;
1026 1051
1027 error = gfs2_rindex_hold(sdp, &ri_gh);
1028 if (error)
1029 goto out;
1030
1031 rgd_next = gfs2_rgrpd_get_first(sdp); 1052 rgd_next = gfs2_rgrpd_get_first(sdp);
1032 1053
1033 for (;;) { 1054 for (;;) {
@@ -1070,9 +1091,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
1070 yield(); 1091 yield();
1071 } 1092 }
1072 1093
1073 gfs2_glock_dq_uninit(&ri_gh);
1074
1075out:
1076 kfree(gha); 1094 kfree(gha);
1077 return error; 1095 return error;
1078} 1096}
@@ -1124,6 +1142,10 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1124 struct gfs2_statfs_change_host sc; 1142 struct gfs2_statfs_change_host sc;
1125 int error; 1143 int error;
1126 1144
1145 error = gfs2_rindex_update(sdp);
1146 if (error)
1147 return error;
1148
1127 if (gfs2_tune_get(sdp, gt_statfs_slow)) 1149 if (gfs2_tune_get(sdp, gt_statfs_slow))
1128 error = gfs2_statfs_slow(sdp, &sc); 1150 error = gfs2_statfs_slow(sdp, &sc);
1129 else 1151 else
@@ -1394,21 +1416,17 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
1394 if (error) 1416 if (error)
1395 goto out; 1417 goto out;
1396 1418
1397 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1398 if (error)
1399 goto out_qs;
1400
1401 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); 1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
1402 if (!rgd) { 1420 if (!rgd) {
1403 gfs2_consist_inode(ip); 1421 gfs2_consist_inode(ip);
1404 error = -EIO; 1422 error = -EIO;
1405 goto out_rindex_relse; 1423 goto out_qs;
1406 } 1424 }
1407 1425
1408 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, 1426 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1409 &al->al_rgd_gh); 1427 &al->al_rgd_gh);
1410 if (error) 1428 if (error)
1411 goto out_rindex_relse; 1429 goto out_qs;
1412 1430
1413 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1431 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
1414 sdp->sd_jdesc->jd_blocks); 1432 sdp->sd_jdesc->jd_blocks);
@@ -1423,8 +1441,6 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
1423 1441
1424out_rg_gunlock: 1442out_rg_gunlock:
1425 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1443 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1426out_rindex_relse:
1427 gfs2_glock_dq_uninit(&al->al_ri_gh);
1428out_qs: 1444out_qs:
1429 gfs2_quota_unhold(ip); 1445 gfs2_quota_unhold(ip);
1430out: 1446out:
@@ -1471,9 +1487,11 @@ static void gfs2_evict_inode(struct inode *inode)
1471 goto out; 1487 goto out;
1472 } 1488 }
1473 1489
1474 error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); 1490 if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
1475 if (error) 1491 error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
1476 goto out_truncate; 1492 if (error)
1493 goto out_truncate;
1494 }
1477 1495
1478 if (test_bit(GIF_INVALID, &ip->i_flags)) { 1496 if (test_bit(GIF_INVALID, &ip->i_flags)) {
1479 error = gfs2_inode_refresh(ip); 1497 error = gfs2_inode_refresh(ip);
@@ -1513,6 +1531,10 @@ static void gfs2_evict_inode(struct inode *inode)
1513 goto out_unlock; 1531 goto out_unlock;
1514 1532
1515out_truncate: 1533out_truncate:
1534 gfs2_log_flush(sdp, ip->i_gl);
1535 write_inode_now(inode, 1);
1536 gfs2_ail_flush(ip->i_gl, 0);
1537
1516 /* Case 2 starts here */ 1538 /* Case 2 starts here */
1517 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 1539 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1518 if (error) 1540 if (error)
@@ -1552,6 +1574,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
1552 if (ip) { 1574 if (ip) {
1553 ip->i_flags = 0; 1575 ip->i_flags = 0;
1554 ip->i_gl = NULL; 1576 ip->i_gl = NULL;
1577 ip->i_rgd = NULL;
1555 } 1578 }
1556 return &ip->i_inode; 1579 return &ip->i_inode;
1557} 1580}
@@ -1572,6 +1595,7 @@ const struct super_operations gfs2_super_ops = {
1572 .alloc_inode = gfs2_alloc_inode, 1595 .alloc_inode = gfs2_alloc_inode,
1573 .destroy_inode = gfs2_destroy_inode, 1596 .destroy_inode = gfs2_destroy_inode,
1574 .write_inode = gfs2_write_inode, 1597 .write_inode = gfs2_write_inode,
1598 .dirty_inode = gfs2_dirty_inode,
1575 .evict_inode = gfs2_evict_inode, 1599 .evict_inode = gfs2_evict_inode,
1576 .put_super = gfs2_put_super, 1600 .put_super = gfs2_put_super,
1577 .sync_fs = gfs2_sync_fs, 1601 .sync_fs = gfs2_sync_fs,
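
Note on the super.c hunks above: the old gfs2_write_inode() both wrote the
dinode and flushed; it is now reduced to the flush/wait path, and the dinode
write moves into the new ->dirty_inode() hook, which may be entered with the
inode glock and/or a transaction already held.  Below is a stand-alone sketch
of that conditional acquire-and-unwind pattern: take only what the caller does
not already hold, release only what was taken here.  The lock and transaction
helpers are stubs invented for illustration, not the GFS2 API.

    #include <stdio.h>

    static int have_glock, have_trans;          /* models caller state */
    static int take_glock(void)  { return 0; }  /* 0 == success */
    static int begin_trans(void) { return 0; }
    static void drop_glock(void) { puts("drop glock"); }
    static void end_trans(void)  { puts("end trans"); }

    static void dirty_inode(void)
    {
        int need_unlock = 0, need_endtrans = 0;

        if (!have_glock) {                      /* take only what we lack */
            if (take_glock())
                return;                         /* report and give up */
            need_unlock = 1;
        }
        if (!have_trans) {
            if (begin_trans())
                goto out;
            need_endtrans = 1;
        }

        puts("write dinode");                   /* the actual work */

        if (need_endtrans)                      /* release only what we took */
            end_trans();
    out:
        if (need_unlock)
            drop_glock();
    }

    int main(void)
    {
        have_glock = 0;
        have_trans = 1;     /* e.g. called from inside a transaction */
        dirty_inode();
        return 0;
    }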
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 9ec73a854111..86ac75d99d31 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -185,8 +185,3 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
         gfs2_log_unlock(sdp);
 }
 
-void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
-{
-        lops_add(rgd->rd_sbd, &rgd->rd_le);
-}
-
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index fb56b783e028..f8f101ef600c 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -28,20 +28,20 @@ struct gfs2_glock;
 
 /* reserve either the number of blocks to be allocated plus the rg header
  * block, or all of the blocks in the rg, whichever is smaller */
-static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip)
 {
-        return (al->al_requested < al->al_rgd->rd_length)?
-               al->al_requested + 1 : al->al_rgd->rd_length;
+        const struct gfs2_alloc *al = ip->i_alloc;
+        if (al->al_requested < ip->i_rgd->rd_length)
+                return al->al_requested + 1;
+        return ip->i_rgd->rd_length;
 }
 
-int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
-                     unsigned int revokes);
+extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+                            unsigned int revokes);
 
-void gfs2_trans_end(struct gfs2_sbd *sdp);
-
-void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
-void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
-void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
+extern void gfs2_trans_end(struct gfs2_sbd *sdp);
+extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
+extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
 
 #endif /* __TRANS_DOT_H__ */
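
The gfs2_rg_blocks() change above keeps the same reservation rule while taking
the inode instead of the allocation struct.  Below is a worked example of the
bound it computes: the requested block count plus one resource-group header
block, capped at the size of the resource group.  The numbers are made up;
this is a stand-alone sketch, not kernel code.

    #include <assert.h>

    static unsigned int rg_blocks(unsigned int requested, unsigned int rd_length)
    {
        if (requested < rd_length)
            return requested + 1;   /* allocation + rgrp header block */
        return rd_length;           /* never reserve more than the rgrp */
    }

    int main(void)
    {
        assert(rg_blocks(10, 65536) == 11);        /* small allocation */
        assert(rg_blocks(70000, 65536) == 65536);  /* capped at rgrp size */
        return 0;
    }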
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 439b61c03262..71d7bf830c09 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -332,15 +332,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
         if (error)
                 goto out_alloc;
 
-        error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
-        if (error)
-                goto out_quota;
-
         error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
 
-        gfs2_glock_dq_uninit(&al->al_ri_gh);
-
-out_quota:
         gfs2_quota_unhold(ip);
 out_alloc:
         gfs2_alloc_put(ip);
@@ -734,7 +727,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
                 goto out_gunlock_q;
 
         error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-                                 blks + gfs2_rg_blocks(al) +
+                                 blks + gfs2_rg_blocks(ip) +
                                  RES_DINODE + RES_STATFS + RES_QUOTA, 0);
         if (error)
                 goto out_ipres;
@@ -1296,7 +1289,8 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct inode *inode = &ip->i_inode;
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
         struct gfs2_ea_location el;
         int error;
 
@@ -1319,7 +1313,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
         if (error)
                 return error;
 
-        error = gfs2_setattr_simple(ip, attr);
+        error = gfs2_setattr_simple(inode, attr);
         gfs2_trans_end(sdp);
         return error;
 }
@@ -1362,14 +1356,14 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
                         blen++;
                 else {
                         if (bstart)
-                                gfs2_rlist_add(sdp, &rlist, bstart);
+                                gfs2_rlist_add(ip, &rlist, bstart);
                         bstart = bn;
                         blen = 1;
                 }
                 blks++;
         }
         if (bstart)
-                gfs2_rlist_add(sdp, &rlist, bstart);
+                gfs2_rlist_add(ip, &rlist, bstart);
         else
                 goto out;
 
@@ -1501,24 +1495,18 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
         if (error)
                 goto out_alloc;
 
-        error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
-        if (error)
-                goto out_quota;
-
         error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
         if (error)
-                goto out_rindex;
+                goto out_quota;
 
         if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
                 error = ea_dealloc_indirect(ip);
                 if (error)
-                        goto out_rindex;
+                        goto out_quota;
         }
 
         error = ea_dealloc_block(ip);
 
-out_rindex:
-        gfs2_glock_dq_uninit(&al->al_ri_gh);
out_quota:
         gfs2_quota_unhold(ip);
out_alloc:
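
The xattr.c hunks above delete the rindex acquire/release pair and retarget
the error gotos one label up.  Below is a stand-alone sketch of the unwind
idiom involved, where each acquisition adds a cleanup label and removing one
simply deletes its pair and repoints the gotos.  The functions are stubs
invented for illustration.

    #include <stdio.h>

    static int acquire(const char *what)
    {
        printf("acquire %s\n", what);
        return 0;                   /* 0 == success */
    }

    static int do_dealloc(void)
    {
        int error;

        error = acquire("quota");
        if (error)
            goto out;
        error = acquire("work");    /* failure used to go to out_rindex */
        if (error)
            goto out_quota;
        /* ... main work ... */
    out_quota:
        puts("release quota");      /* labels double as normal-path cleanup */
    out:
        return error;
    }

    int main(void) { return do_dealloc(); }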
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c106ca22e812..d24a9b666a23 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -344,6 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
         struct inode *root, *inode;
         struct qstr str;
         struct nls_table *nls = NULL;
+        u64 last_fs_block, last_fs_page;
         int err;
 
         err = -EINVAL;
@@ -399,9 +400,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
         if (!sbi->rsrc_clump_blocks)
                 sbi->rsrc_clump_blocks = 1;
 
-        err = generic_check_addressable(sbi->alloc_blksz_shift,
-                                        sbi->total_blocks);
-        if (err) {
+        err = -EFBIG;
+        last_fs_block = sbi->total_blocks - 1;
+        last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >>
+                        PAGE_CACHE_SHIFT;
+
+        if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
+            (last_fs_page > (pgoff_t)(~0ULL))) {
                 printk(KERN_ERR "hfs: filesystem size too large.\n");
                 goto out_free_vhdr;
         }
@@ -525,8 +530,8 @@ out_close_cat_tree:
out_close_ext_tree:
         hfs_btree_close(sbi->ext_tree);
out_free_vhdr:
-        kfree(sbi->s_vhdr);
-        kfree(sbi->s_backup_vhdr);
+        kfree(sbi->s_vhdr_buf);
+        kfree(sbi->s_backup_vhdr_buf);
out_unload_nls:
         unload_nls(sbi->nls);
         unload_nls(nls);
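
The hfsplus check above open-codes the addressability test against both the
sector limit and the page-index limit.  Below is a stand-alone model of the
arithmetic with 32-bit sector_t/pgoff_t stand-ins; the volume geometry is
hypothetical and PAGE_CACHE_SHIFT is assumed to be 12.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t total_blocks = 1ULL << 31;   /* hypothetical 8 TiB volume */
        unsigned alloc_blksz_shift = 12;      /* 4 KiB allocation blocks */
        unsigned page_shift = 12;             /* PAGE_CACHE_SHIFT */

        uint64_t last_fs_block = total_blocks - 1;
        uint64_t last_fs_page =
            (last_fs_block << alloc_blksz_shift) >> page_shift;

        /* Limits if sector_t/pgoff_t were only 32 bits wide: */
        uint64_t max_sector = UINT32_MAX;
        uint64_t max_page = UINT32_MAX;

        if (last_fs_block > (max_sector >> (alloc_blksz_shift - 9)) ||
            last_fs_page > max_page)
            puts("filesystem size too large");
        else
            puts("addressable");
        return 0;
    }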
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 10e515a0d452..7daf4b852d1c 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -272,9 +272,9 @@ reread:
         return 0;
 
out_free_backup_vhdr:
-        kfree(sbi->s_backup_vhdr);
+        kfree(sbi->s_backup_vhdr_buf);
out_free_vhdr:
-        kfree(sbi->s_vhdr);
+        kfree(sbi->s_vhdr_buf);
out:
         return error;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87b6e0421c12..ec889538e5a6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -491,6 +491,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
                         inode->i_op = &page_symlink_inode_operations;
                         break;
                 }
+                lockdep_annotate_inode_mutex_key(inode);
         }
         return inode;
 }
diff --git a/fs/inode.c b/fs/inode.c
index 73920d555c88..ecbb68dc7e2a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -634,7 +634,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
                  * inode to the back of the list so we don't spin on it.
                  */
                 if (!spin_trylock(&inode->i_lock)) {
-                        list_move(&inode->i_lru, &sb->s_inode_lru);
+                        list_move_tail(&inode->i_lru, &sb->s_inode_lru);
                         continue;
                 }
 
@@ -848,16 +848,9 @@ struct inode *new_inode(struct super_block *sb)
 }
 EXPORT_SYMBOL(new_inode);
 
-/**
- * unlock_new_inode - clear the I_NEW state and wake up any waiters
- * @inode: new inode to unlock
- *
- * Called when the inode is fully initialised to clear the new state of the
- * inode and wake up anyone waiting for the inode to finish initialisation.
- */
-void unlock_new_inode(struct inode *inode)
-{
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
+void lockdep_annotate_inode_mutex_key(struct inode *inode)
+{
         if (S_ISDIR(inode->i_mode)) {
                 struct file_system_type *type = inode->i_sb->s_type;
 
@@ -873,7 +866,20 @@ void unlock_new_inode(struct inode *inode)
                                           &type->i_mutex_dir_key);
                 }
         }
+}
+EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
 #endif
+
+/**
+ * unlock_new_inode - clear the I_NEW state and wake up any waiters
+ * @inode: new inode to unlock
+ *
+ * Called when the inode is fully initialised to clear the new state of the
+ * inode and wake up anyone waiting for the inode to finish initialisation.
+ */
+void unlock_new_inode(struct inode *inode)
+{
+        lockdep_annotate_inode_mutex_key(inode);
         spin_lock(&inode->i_lock);
         WARN_ON(!(inode->i_state & I_NEW));
         inode->i_state &= ~I_NEW;
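
The inode.c hunks above only factor code out: the lockdep re-keying that
unlock_new_inode() always performed is now also callable on its own, which is
what the hugetlbfs hunk earlier uses, since hugetlbfs initialises inodes
without going through the I_NEW protocol.  A compilable sketch of the calling
convention, with stub types standing in for the kernel ones:

    struct inode { int dummy; };

    /* Stands in for the CONFIG_DEBUG_LOCK_ALLOC re-keying of i_mutex. */
    static void lockdep_annotate_inode_mutex_key(struct inode *inode)
    {
        (void)inode;
    }

    static void fs_get_inode(struct inode *inode)
    {
        /* ... set i_op/i_fop according to the file type ... */
        lockdep_annotate_inode_mutex_key(inode);  /* done by hand here */
    }

    int main(void)
    {
        struct inode ino = { 0 };
        fs_get_inode(&ino);       /* unlock_new_inode() path does the same */
        return 0;
    }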
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index cfeb7164b085..0f20208df602 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -22,26 +22,29 @@
 #include <linux/security.h>
 #include "nodelist.h"
 
-/* ---- Initial Security Label Attachment -------------- */
-int jffs2_init_security(struct inode *inode, struct inode *dir,
-                        const struct qstr *qstr)
+/* ---- Initial Security Label(s) Attachment callback --- */
+int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                     void *fs_info)
 {
-        int rc;
-        size_t len;
-        void *value;
-        char *name;
+        const struct xattr *xattr;
+        int err = 0;
 
-        rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
-        if (rc) {
-                if (rc == -EOPNOTSUPP)
-                        return 0;
-                return rc;
+        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+                err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
+                                        xattr->name, xattr->value,
+                                        xattr->value_len, 0);
+                if (err < 0)
+                        break;
         }
-        rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0);
+        return err;
+}
 
-        kfree(name);
-        kfree(value);
-        return rc;
+/* ---- Initial Security Label(s) Attachment ----------- */
+int jffs2_init_security(struct inode *inode, struct inode *dir,
+                        const struct qstr *qstr)
+{
+        return security_inode_init_security(inode, dir, qstr,
+                                            &jffs2_initxattrs, NULL);
 }
 
 /* ---- XATTR Handler for "security.*" ----------------- */
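
The jffs2 conversion above follows the new security_inode_init_security()
contract: instead of returning one name/value/len triple to the caller, the
LSM invokes a filesystem callback with an array of xattrs terminated by an
entry with a NULL name.  Below is a stand-alone model of that contract; the
types and the setxattr stub are simplified stand-ins, and the sample label is
made up.

    #include <stddef.h>
    #include <stdio.h>

    struct xattr {
        char *name;
        void *value;
        size_t value_len;
    };

    static int fs_setxattr(const char *name, const void *value, size_t len)
    {
        printf("set security.%s (%zu bytes)\n", name, len);
        return 0;
    }

    static int fs_initxattrs(const struct xattr *xattr_array)
    {
        const struct xattr *xattr;
        int err = 0;

        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
            err = fs_setxattr(xattr->name, xattr->value, xattr->value_len);
            if (err < 0)
                break;
        }
        return err;
    }

    int main(void)
    {
        struct xattr xattrs[] = {
            { "selinux", "system_u:object_r:etc_t", 23 },
            { NULL, NULL, 0 },      /* terminator */
        };
        return fs_initxattrs(xattrs);
    }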
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index adcf92d3b603..7971f37534a3 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
         /*
          * Wait for outstanding transactions to be written to log:
          */
-        jfs_flush_journal(log, 1);
+        jfs_flush_journal(log, 2);
 
         /*
          * close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
          *
          * remove file system from log active file system list.
          */
-        jfs_flush_journal(log, 1);
+        jfs_flush_journal(log, 2);
 
         /*
          * Make sure all metadata makes it to disk
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index e87fedef23db..26683e15b3ac 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -1089,38 +1089,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 #ifdef CONFIG_JFS_SECURITY
-int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
-                      const struct qstr *qstr)
+int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                   void *fs_info)
 {
-        int rc;
-        size_t len;
-        void *value;
-        char *suffix;
+        const struct xattr *xattr;
+        tid_t *tid = fs_info;
         char *name;
-
-        rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
-                                          &len);
-        if (rc) {
-                if (rc == -EOPNOTSUPP)
-                        return 0;
-                return rc;
-        }
-        name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix),
-                       GFP_NOFS);
-        if (!name) {
-                rc = -ENOMEM;
-                goto kmalloc_failed;
+        int err = 0;
+
+        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+                name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+                               strlen(xattr->name) + 1, GFP_NOFS);
+                if (!name) {
+                        err = -ENOMEM;
+                        break;
+                }
+                strcpy(name, XATTR_SECURITY_PREFIX);
+                strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+                err = __jfs_setxattr(*tid, inode, name,
+                                     xattr->value, xattr->value_len, 0);
+                kfree(name);
+                if (err < 0)
+                        break;
         }
-        strcpy(name, XATTR_SECURITY_PREFIX);
-        strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-
-        rc = __jfs_setxattr(tid, inode, name, value, len, 0);
-
-        kfree(name);
-kmalloc_failed:
-        kfree(suffix);
-        kfree(value);
+        return err;
+}
 
-        return rc;
+int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
+                      const struct qstr *qstr)
+{
+        return security_inode_init_security(inode, dir, qstr,
+                                            &jfs_initxattrs, &tid);
 }
 #endif
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index b7c99bfb3da6..6f29836ec0cb 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
         struct hlist_node *pos;
         struct nlm_host *host = NULL;
         struct nsm_handle *nsm = NULL;
-        struct sockaddr_in sin = {
-                .sin_family     = AF_INET,
-        };
-        struct sockaddr_in6 sin6 = {
-                .sin6_family    = AF_INET6,
-        };
-        struct sockaddr *src_sap;
-        size_t src_len = rqstp->rq_addrlen;
+        struct sockaddr *src_sap = svc_daddr(rqstp);
+        size_t src_len = rqstp->rq_daddrlen;
         struct nlm_lookup_host_info ni = {
                 .server         = 1,
                 .sap            = svc_addr(rqstp),
@@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 
         mutex_lock(&nlm_host_mutex);
 
-        switch (ni.sap->sa_family) {
-        case AF_INET:
-                sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
-                src_sap = (struct sockaddr *)&sin;
-                break;
-        case AF_INET6:
-                ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
-                src_sap = (struct sockaddr *)&sin6;
-                break;
-        default:
-                dprintk("lockd: %s failed; unrecognized address family\n",
-                        __func__);
-                goto out;
-        }
-
         if (time_after_eq(jiffies, next_gc))
                 nlm_gc_hosts();
 
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index abfff9d7979d..c061b9aa7ddb 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -282,7 +282,7 @@ int lockd_up(void)
         /*
          * Create the kernel thread and wait for it to start.
          */
-        nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+        nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
         if (IS_ERR(nlmsvc_rqst)) {
                 error = PTR_ERR(nlmsvc_rqst);
                 nlmsvc_rqst = NULL;
diff --git a/fs/locks.c b/fs/locks.c
index 703f545097de..3b0d05dcd7c1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -60,7 +60,7 @@
  *
  *  Initial implementation of mandatory locks. SunOS turned out to be
  *  a rotten model, so I implemented the "obvious" semantics.
- *  See 'Documentation/mandatory.txt' for details.
+ *  See 'Documentation/filesystems/mandatory-locking.txt' for details.
  *  Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
  *
  *  Don't allow mandatory locks on mmap()'ed files. Added simple functions to
@@ -133,6 +133,20 @@
 #define IS_FLOCK(fl)    (fl->fl_flags & FL_FLOCK)
 #define IS_LEASE(fl)    (fl->fl_flags & FL_LEASE)
 
+static bool lease_breaking(struct file_lock *fl)
+{
+        return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
+}
+
+static int target_leasetype(struct file_lock *fl)
+{
+        if (fl->fl_flags & FL_UNLOCK_PENDING)
+                return F_UNLCK;
+        if (fl->fl_flags & FL_DOWNGRADE_PENDING)
+                return F_RDLCK;
+        return fl->fl_type;
+}
+
 int leases_enable = 1;
 int lease_break_time = 45;
 
@@ -1119,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 
 EXPORT_SYMBOL(locks_mandatory_area);
 
+static void lease_clear_pending(struct file_lock *fl, int arg)
+{
+        switch (arg) {
+        case F_UNLCK:
+                fl->fl_flags &= ~FL_UNLOCK_PENDING;
+                /* fall through: */
+        case F_RDLCK:
+                fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
+        }
+}
+
 /* We already had a lease on this file; just change its type */
 int lease_modify(struct file_lock **before, int arg)
 {
@@ -1127,6 +1152,7 @@ int lease_modify(struct file_lock **before, int arg)
 
         if (error)
                 return error;
+        lease_clear_pending(fl, arg);
         locks_wake_up_blocks(fl);
         if (arg == F_UNLCK)
                 locks_delete_lock(before);
@@ -1135,19 +1161,25 @@ int lease_modify(struct file_lock **before, int arg)
 
 EXPORT_SYMBOL(lease_modify);
 
+static bool past_time(unsigned long then)
+{
+        if (!then)
+                /* 0 is a special value meaning "this never expires": */
+                return false;
+        return time_after(jiffies, then);
+}
+
 static void time_out_leases(struct inode *inode)
 {
         struct file_lock **before;
         struct file_lock *fl;
 
         before = &inode->i_flock;
-        while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) {
-                if ((fl->fl_break_time == 0)
-                                || time_before(jiffies, fl->fl_break_time)) {
-                        before = &fl->fl_next;
-                        continue;
-                }
-                lease_modify(before, fl->fl_type & ~F_INPROGRESS);
+        while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
+                if (past_time(fl->fl_downgrade_time))
+                        lease_modify(before, F_RDLCK);
+                if (past_time(fl->fl_break_time))
+                        lease_modify(before, F_UNLCK);
                 if (fl == *before)      /* lease_modify may have freed fl */
                         before = &fl->fl_next;
         }
@@ -1165,7 +1197,7 @@ static void time_out_leases(struct inode *inode)
  */
 int __break_lease(struct inode *inode, unsigned int mode)
 {
-        int error = 0, future;
+        int error = 0;
         struct file_lock *new_fl, *flock;
         struct file_lock *fl;
         unsigned long break_time;
@@ -1182,24 +1214,13 @@ int __break_lease(struct inode *inode, unsigned int mode)
         if ((flock == NULL) || !IS_LEASE(flock))
                 goto out;
 
+        if (!locks_conflict(flock, new_fl))
+                goto out;
+
         for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next)
                 if (fl->fl_owner == current->files)
                         i_have_this_lease = 1;
 
-        if (want_write) {
-                /* If we want write access, we have to revoke any lease. */
-                future = F_UNLCK | F_INPROGRESS;
-        } else if (flock->fl_type & F_INPROGRESS) {
-                /* If the lease is already being broken, we just leave it */
-                future = flock->fl_type;
-        } else if (flock->fl_type & F_WRLCK) {
-                /* Downgrade the exclusive lease to a read-only lease. */
-                future = F_RDLCK | F_INPROGRESS;
-        } else {
-                /* the existing lease was read-only, so we can read too. */
-                goto out;
-        }
-
         if (IS_ERR(new_fl) && !i_have_this_lease
                         && ((mode & O_NONBLOCK) == 0)) {
                 error = PTR_ERR(new_fl);
@@ -1214,12 +1235,18 @@ int __break_lease(struct inode *inode, unsigned int mode)
         }
 
         for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
-                if (fl->fl_type != future) {
-                        fl->fl_type = future;
+                if (want_write) {
+                        if (fl->fl_flags & FL_UNLOCK_PENDING)
+                                continue;
+                        fl->fl_flags |= FL_UNLOCK_PENDING;
                         fl->fl_break_time = break_time;
-                        /* lease must have lmops break callback */
-                        fl->fl_lmops->lm_break(fl);
+                } else {
+                        if (lease_breaking(flock))
+                                continue;
+                        fl->fl_flags |= FL_DOWNGRADE_PENDING;
+                        fl->fl_downgrade_time = break_time;
                 }
+                fl->fl_lmops->lm_break(fl);
         }
 
         if (i_have_this_lease || (mode & O_NONBLOCK)) {
@@ -1243,10 +1270,13 @@ restart:
         if (error >= 0) {
                 if (error == 0)
                         time_out_leases(inode);
-                /* Wait for the next lease that has not been broken yet */
+                /*
+                 * Wait for the next conflicting lease that has not been
+                 * broken yet
+                 */
                 for (flock = inode->i_flock; flock && IS_LEASE(flock);
                                 flock = flock->fl_next) {
-                        if (flock->fl_type & F_INPROGRESS)
+                        if (locks_conflict(new_fl, flock))
                                 goto restart;
                 }
                 error = 0;
@@ -1314,7 +1344,7 @@ int fcntl_getlease(struct file *filp)
         for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
                         fl = fl->fl_next) {
                 if (fl->fl_file == filp) {
-                        type = fl->fl_type & ~F_INPROGRESS;
+                        type = target_leasetype(fl);
                         break;
                 }
         }
@@ -1322,50 +1352,23 @@ int fcntl_getlease(struct file *filp)
         return type;
 }
 
-/**
- *      generic_setlease        -       sets a lease on an open file
- *      @filp: file pointer
- *      @arg: type of lease to obtain
- *      @flp: input - file_lock to use, output - file_lock inserted
- *
- *      The (input) flp->fl_lmops->lm_break function is required
- *      by break_lease().
- *
- *      Called with file_lock_lock held.
- */
-int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 {
         struct file_lock *fl, **before, **my_before = NULL, *lease;
         struct dentry *dentry = filp->f_path.dentry;
         struct inode *inode = dentry->d_inode;
-        int error, rdlease_count = 0, wrlease_count = 0;
+        int error;
 
         lease = *flp;
 
-        error = -EACCES;
-        if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
-                goto out;
-        error = -EINVAL;
-        if (!S_ISREG(inode->i_mode))
+        error = -EAGAIN;
+        if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
                 goto out;
-        error = security_file_lock(filp, arg);
-        if (error)
+        if ((arg == F_WRLCK)
+            && ((dentry->d_count > 1)
+                || (atomic_read(&inode->i_count) > 1)))
                 goto out;
 
-        time_out_leases(inode);
-
-        BUG_ON(!(*flp)->fl_lmops->lm_break);
-
-        if (arg != F_UNLCK) {
-                error = -EAGAIN;
-                if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
-                        goto out;
-                if ((arg == F_WRLCK)
-                    && ((dentry->d_count > 1)
-                        || (atomic_read(&inode->i_count) > 1)))
-                        goto out;
-        }
-
         /*
          * At this point, we know that if there is an exclusive
          * lease on this file, then we hold it on this filp
@@ -1374,27 +1377,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
          * then the file is not open by anyone (including us)
          * except for this filp.
          */
+        error = -EAGAIN;
         for (before = &inode->i_flock;
                         ((fl = *before) != NULL) && IS_LEASE(fl);
                         before = &fl->fl_next) {
-                if (fl->fl_file == filp)
+                if (fl->fl_file == filp) {
                         my_before = before;
-                else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
-                        /*
-                         * Someone is in the process of opening this
-                         * file for writing so we may not take an
-                         * exclusive lease on it.
-                         */
-                        wrlease_count++;
-                else
-                        rdlease_count++;
+                        continue;
+                }
+                /*
+                 * No exclusive leases if someone else has a lease on
+                 * this file:
+                 */
+                if (arg == F_WRLCK)
+                        goto out;
+                /*
+                 * Modifying our existing lease is OK, but no getting a
+                 * new lease if someone else is opening for write:
+                 */
+                if (fl->fl_flags & FL_UNLOCK_PENDING)
+                        goto out;
         }
 
-        error = -EAGAIN;
-        if ((arg == F_RDLCK && (wrlease_count > 0)) ||
-            (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0)))
-                goto out;
-
         if (my_before != NULL) {
                 error = lease->fl_lmops->lm_change(my_before, arg);
                 if (!error)
@@ -1402,9 +1406,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
                 goto out;
         }
 
-        if (arg == F_UNLCK)
-                goto out;
-
         error = -EINVAL;
         if (!leases_enable)
                 goto out;
@@ -1415,6 +1416,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
out:
         return error;
 }
+
+int generic_delete_lease(struct file *filp, struct file_lock **flp)
+{
+        struct file_lock *fl, **before;
+        struct dentry *dentry = filp->f_path.dentry;
+        struct inode *inode = dentry->d_inode;
+
+        for (before = &inode->i_flock;
+                        ((fl = *before) != NULL) && IS_LEASE(fl);
+                        before = &fl->fl_next) {
+                if (fl->fl_file != filp)
+                        continue;
+                return (*flp)->fl_lmops->lm_change(before, F_UNLCK);
+        }
+        return -EAGAIN;
+}
+
+/**
+ *      generic_setlease        -       sets a lease on an open file
+ *      @filp: file pointer
+ *      @arg: type of lease to obtain
+ *      @flp: input - file_lock to use, output - file_lock inserted
+ *
+ *      The (input) flp->fl_lmops->lm_break function is required
+ *      by break_lease().
+ *
+ *      Called with file_lock_lock held.
+ */
+int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+{
+        struct dentry *dentry = filp->f_path.dentry;
+        struct inode *inode = dentry->d_inode;
+        int error;
+
+        if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
+                return -EACCES;
+        if (!S_ISREG(inode->i_mode))
+                return -EINVAL;
+        error = security_file_lock(filp, arg);
+        if (error)
+                return error;
+
+        time_out_leases(inode);
+
+        BUG_ON(!(*flp)->fl_lmops->lm_break);
+
+        switch (arg) {
+        case F_UNLCK:
+                return generic_delete_lease(filp, flp);
+        case F_RDLCK:
+        case F_WRLCK:
+                return generic_add_lease(filp, arg, flp);
+        default:
+                BUG();
+        }
+}
 EXPORT_SYMBOL(generic_setlease);
 
 static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
@@ -2126,7 +2183,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
                 }
         } else if (IS_LEASE(fl)) {
                 seq_printf(f, "LEASE ");
-                if (fl->fl_type & F_INPROGRESS)
+                if (lease_breaking(fl))
                         seq_printf(f, "BREAKING ");
                 else if (fl->fl_file)
                         seq_printf(f, "ACTIVE ");
@@ -2142,7 +2199,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
                        : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
         } else {
                 seq_printf(f, "%s ",
-                               (fl->fl_type & F_INPROGRESS)
+                               (lease_breaking(fl))
                                ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ "
                                : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ ");
         }
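
The locks.c changes above stop overloading fl_type with F_INPROGRESS and track
a break in progress with two pending flags instead, so the original lease type
survives the break and F_GETLEASE can report where the lease is heading.
Below is a stand-alone model of that state tracking; the flag values are
arbitrary here, not the kernel's.

    #include <assert.h>
    #include <fcntl.h>      /* F_RDLCK, F_WRLCK, F_UNLCK */

    #define FL_UNLOCK_PENDING    0x1
    #define FL_DOWNGRADE_PENDING 0x2

    struct file_lock { int fl_type; int fl_flags; };

    static int lease_breaking(const struct file_lock *fl)
    {
        return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
    }

    static int target_leasetype(const struct file_lock *fl)
    {
        if (fl->fl_flags & FL_UNLOCK_PENDING)
            return F_UNLCK;
        if (fl->fl_flags & FL_DOWNGRADE_PENDING)
            return F_RDLCK;
        return fl->fl_type;
    }

    int main(void)
    {
        struct file_lock fl = { F_WRLCK, 0 };

        assert(!lease_breaking(&fl));
        fl.fl_flags |= FL_DOWNGRADE_PENDING;    /* a reader broke it */
        assert(target_leasetype(&fl) == F_RDLCK);
        fl.fl_flags |= FL_UNLOCK_PENDING;       /* then a writer */
        assert(target_leasetype(&fl) == F_UNLCK);
        return 0;
    }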
diff --git a/fs/namei.c b/fs/namei.c
index 2826db35dc25..7657be4352bf 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -221,14 +221,12 @@ static int check_acl(struct inode *inode, int mask)
 }
 
 /*
- * This does basic POSIX ACL permission checking
+ * This does the basic permission checking
  */
 static int acl_permission_check(struct inode *inode, int mask)
 {
         unsigned int mode = inode->i_mode;
 
-        mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
-
         if (current_user_ns() != inode_userns(inode))
                 goto other_perms;
 
@@ -257,7 +255,7 @@ other_perms:
 /**
  * generic_permission -  check for access rights on a Posix-like filesystem
  * @inode:      inode to check access rights for
- * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
  *
  * Used to check for read/write/execute permissions on a file.
  * We use "fsuid" for this, letting us set arbitrary permissions
@@ -273,7 +271,7 @@ int generic_permission(struct inode *inode, int mask)
         int ret;
 
         /*
-         * Do the basic POSIX ACL permission checks.
+         * Do the basic permission checks.
          */
         ret = acl_permission_check(inode, mask);
         if (ret != -EACCES)
@@ -331,12 +329,14 @@ static inline int do_inode_permission(struct inode *inode, int mask)
 /**
  * inode_permission  -  check for access rights to a given inode
  * @inode:      inode to check permission on
- * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
  *
  * Used to check for read/write/execute permissions on an inode.
  * We use "fsuid" for this, letting us set arbitrary permissions
  * for filesystem access without changing the "normal" uids which
  * are used for other things.
+ *
+ * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
  */
 int inode_permission(struct inode *inode, int mask)
 {
@@ -721,31 +721,22 @@ static int follow_automount(struct path *path, unsigned flags,
         if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
                 return -EREMOTE;
 
-        /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
-         * and this is the terminal part of the path.
+        /* We don't want to mount if someone's just doing a stat -
+         * unless they're stat'ing a directory and appended a '/' to
+         * the name.
+         *
+         * We do, however, want to mount if someone wants to open or
+         * create a file of any type under the mountpoint, wants to
+         * traverse through the mountpoint or wants to open the
+         * mounted directory.  Also, autofs may mark negative dentries
+         * as being automount points.  These will need the attentions
+         * of the daemon to instantiate them before they can be used.
          */
-        if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT))
-                return -EISDIR; /* we actually want to stop here */
+        if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+                     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+            path->dentry->d_inode)
+                return -EISDIR;
 
-        /*
-         * We don't want to mount if someone's just doing a stat and they've
-         * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
-         * appended a '/' to the name.
-         */
-        if (!(flags & LOOKUP_FOLLOW)) {
-                /* We do, however, want to mount if someone wants to open or
-                 * create a file of any type under the mountpoint, wants to
-                 * traverse through the mountpoint or wants to open the mounted
-                 * directory.
-                 * Also, autofs may mark negative dentries as being automount
-                 * points.  These will need the attentions of the daemon to
-                 * instantiate them before they can be used.
-                 */
-                if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
-                             LOOKUP_OPEN | LOOKUP_CREATE)) &&
-                    path->dentry->d_inode)
-                        return -EISDIR;
-        }
         current->total_link_count++;
         if (current->total_link_count >= 40)
                 return -ELOOP;
@@ -2044,10 +2035,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
         if (flag & O_NOATIME && !inode_owner_or_capable(inode))
                 return -EPERM;
 
-        /*
-         * Ensure there are no outstanding leases on the file.
-         */
-        return break_lease(inode, flag);
+        return 0;
 }
 
 static int handle_truncate(struct file *filp)
@@ -2619,6 +2607,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
         if (!dir->i_op->rmdir)
                 return -EPERM;
 
+        dget(dentry);
         mutex_lock(&dentry->d_inode->i_mutex);
 
         error = -EBUSY;
@@ -2639,6 +2628,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 
out:
         mutex_unlock(&dentry->d_inode->i_mutex);
+        dput(dentry);
         if (!error)
                 d_delete(dentry);
         return error;
@@ -3028,6 +3018,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
         if (error)
                 return error;
 
+        dget(new_dentry);
         if (target)
                 mutex_lock(&target->i_mutex);
 
@@ -3048,6 +3039,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
out:
         if (target)
                 mutex_unlock(&target->i_mutex);
+        dput(new_dentry);
         if (!error)
                 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
                         d_move(old_dentry,new_dentry);
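
The follow_automount() rewrite above replaces the LOOKUP_NO_AUTOMOUNT and
LOOKUP_FOLLOW tests with a single rule: skip the automount only for a plain
stat-like lookup of an existing dentry, and honour an explicit
LOOKUP_AUTOMOUNT, which the do_loopback() change below now passes.  Below is a
stand-alone model of that decision; the flag values are illustrative, not the
kernel's.

    #include <assert.h>

    #define LOOKUP_PARENT    0x010
    #define LOOKUP_DIRECTORY 0x020
    #define LOOKUP_OPEN      0x100
    #define LOOKUP_CREATE    0x200
    #define LOOKUP_AUTOMOUNT 0x400

    static int should_automount(unsigned flags, int have_inode)
    {
        if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
                       LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
            have_inode)
            return 0;   /* -EISDIR: stop at the mountpoint */
        return 1;
    }

    int main(void)
    {
        assert(!should_automount(0, 1));                /* bare stat */
        assert(should_automount(LOOKUP_OPEN, 1));       /* open(2) */
        assert(should_automount(LOOKUP_AUTOMOUNT, 1));  /* forced */
        assert(should_automount(0, 0));                 /* negative dentry */
        return 0;
    }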
diff --git a/fs/namespace.c b/fs/namespace.c
index 22bfe8273c68..e5e1c7d1839b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1109,6 +1109,7 @@ static int show_vfsstat(struct seq_file *m, void *v)
 
         /* device */
         if (mnt->mnt_sb->s_op->show_devname) {
+                seq_puts(m, "device ");
                 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
         } else {
                 if (mnt->mnt_devname) {
@@ -1757,7 +1758,7 @@ static int do_loopback(struct path *path, char *old_name,
                 return err;
         if (!old_name || !*old_name)
                 return -EINVAL;
-        err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
+        err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
         if (err)
                 return err;
 
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index be020771c6b4..dbcd82126aed 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -79,12 +79,9 @@ config NFS_V4_1
         depends on NFS_FS && NFS_V4 && EXPERIMENTAL
         select SUNRPC_BACKCHANNEL
         select PNFS_FILE_LAYOUT
-        select PNFS_BLOCK
-        select MD
-        select BLK_DEV_DM
         help
           This option enables support for minor version 1 of the NFSv4 protocol
-          (RFC 5661 and RFC 5663) in the kernel's NFS client.
+          (RFC 5661) in the kernel's NFS client.
 
           If unsure, say N.
 
@@ -93,16 +90,13 @@ config PNFS_FILE_LAYOUT
 
 config PNFS_BLOCK
         tristate
+        depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM
+        default m
 
 config PNFS_OBJLAYOUT
-        tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
+        tristate
         depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
-        help
-          Say M here if you want your pNFS client to support the Objects Layout Driver.
-          Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
-          upper level driver (SCSI_OSD_ULD).
-
-          If unsure, say N.
+        default m
 
 config ROOT_NFS
         bool "Root file system on NFS"
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index e56564d2ef95..281ae95932c9 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -36,6 +36,7 @@
36#include <linux/namei.h> 36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */ 37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */ 38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h>
39 40
40#include "blocklayout.h" 41#include "blocklayout.h"
41 42
@@ -175,17 +176,6 @@ retry:
175 return bio; 176 return bio;
176} 177}
177 178
178static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
179{
180 if (lseg->pls_range.iomode == IOMODE_RW) {
181 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
182 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
183 } else {
184 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
185 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
186 }
187}
188
189/* This is basically copied from mpage_end_io_read */ 179/* This is basically copied from mpage_end_io_read */
190static void bl_end_io_read(struct bio *bio, int err) 180static void bl_end_io_read(struct bio *bio, int err)
191{ 181{
@@ -205,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err)
205 if (!uptodate) { 195 if (!uptodate) {
206 if (!rdata->pnfs_error) 196 if (!rdata->pnfs_error)
207 rdata->pnfs_error = -EIO; 197 rdata->pnfs_error = -EIO;
208 bl_set_lo_fail(rdata->lseg); 198 pnfs_set_lo_fail(rdata->lseg);
209 } 199 }
210 bio_put(bio); 200 bio_put(bio);
211 put_parallel(par); 201 put_parallel(par);
@@ -302,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata)
302 bl_end_io_read, par); 292 bl_end_io_read, par);
303 if (IS_ERR(bio)) { 293 if (IS_ERR(bio)) {
304 rdata->pnfs_error = PTR_ERR(bio); 294 rdata->pnfs_error = PTR_ERR(bio);
295 bio = NULL;
305 goto out; 296 goto out;
306 } 297 }
307 } 298 }
@@ -369,7 +360,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
369 if (!uptodate) { 360 if (!uptodate) {
370 if (!wdata->pnfs_error) 361 if (!wdata->pnfs_error)
371 wdata->pnfs_error = -EIO; 362 wdata->pnfs_error = -EIO;
372 bl_set_lo_fail(wdata->lseg); 363 pnfs_set_lo_fail(wdata->lseg);
373 } 364 }
374 bio_put(bio); 365 bio_put(bio);
375 put_parallel(par); 366 put_parallel(par);
@@ -385,7 +376,7 @@ static void bl_end_io_write(struct bio *bio, int err)
385 if (!uptodate) { 376 if (!uptodate) {
386 if (!wdata->pnfs_error) 377 if (!wdata->pnfs_error)
387 wdata->pnfs_error = -EIO; 378 wdata->pnfs_error = -EIO;
388 bl_set_lo_fail(wdata->lseg); 379 pnfs_set_lo_fail(wdata->lseg);
389 } 380 }
390 bio_put(bio); 381 bio_put(bio);
391 put_parallel(par); 382 put_parallel(par);
@@ -542,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
542fill_invalid_ext: 533fill_invalid_ext:
543 dprintk("%s need to zero %d pages\n", __func__, npg_zero); 534 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
544 for (;npg_zero > 0; npg_zero--) { 535 for (;npg_zero > 0; npg_zero--) {
536 if (bl_is_sector_init(be->be_inval, isect)) {
537 dprintk("isect %llu already init\n",
538 (unsigned long long)isect);
539 goto next_page;
540 }
545 /* page ref released in bl_end_io_write_zero */ 541 /* page ref released in bl_end_io_write_zero */
546 index = isect >> PAGE_CACHE_SECTOR_SHIFT; 542 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
547 dprintk("%s zero %dth page: index %lu isect %llu\n", 543 dprintk("%s zero %dth page: index %lu isect %llu\n",
@@ -561,8 +557,7 @@ fill_invalid_ext:
561 * PageUptodate: It was read before 557 * PageUptodate: It was read before
562 * sector_initialized: already written out 558 * sector_initialized: already written out
563 */ 559 */
564 if (PageDirty(page) || PageWriteback(page) || 560 if (PageDirty(page) || PageWriteback(page)) {
565 bl_is_sector_init(be->be_inval, isect)) {
566 print_page(page); 561 print_page(page);
567 unlock_page(page); 562 unlock_page(page);
568 page_cache_release(page); 563 page_cache_release(page);
@@ -591,6 +586,7 @@ fill_invalid_ext:
591 bl_end_io_write_zero, par); 586 bl_end_io_write_zero, par);
592 if (IS_ERR(bio)) { 587 if (IS_ERR(bio)) {
593 wdata->pnfs_error = PTR_ERR(bio); 588 wdata->pnfs_error = PTR_ERR(bio);
589 bio = NULL;
594 goto out; 590 goto out;
595 } 591 }
596 /* FIXME: This should be done in bi_end_io */ 592 /* FIXME: This should be done in bi_end_io */
@@ -639,6 +635,7 @@ next_page:
639 bl_end_io_write, par); 635 bl_end_io_write, par);
640 if (IS_ERR(bio)) { 636 if (IS_ERR(bio)) {
641 wdata->pnfs_error = PTR_ERR(bio); 637 wdata->pnfs_error = PTR_ERR(bio);
638 bio = NULL;
642 goto out; 639 goto out;
643 } 640 }
644 isect += PAGE_CACHE_SECTORS; 641 isect += PAGE_CACHE_SECTORS;
@@ -804,7 +801,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
					struct nfs4_deviceid *d_id)
 {
 	struct pnfs_device *dev;
-	struct pnfs_block_dev *rv = NULL;
+	struct pnfs_block_dev *rv;
 	u32 max_resp_sz;
 	int max_pages;
 	struct page **pages = NULL;
@@ -822,18 +819,20 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
 	dev = kmalloc(sizeof(*dev), GFP_NOFS);
 	if (!dev) {
 		dprintk("%s kmalloc failed\n", __func__);
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}

 	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
 	if (pages == NULL) {
 		kfree(dev);
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 	for (i = 0; i < max_pages; i++) {
 		pages[i] = alloc_page(GFP_NOFS);
-		if (!pages[i])
+		if (!pages[i]) {
+			rv = ERR_PTR(-ENOMEM);
 			goto out_free;
+		}
 	}

 	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
@@ -846,8 +845,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
 	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
 	rc = nfs4_proc_getdeviceinfo(server, dev);
 	dprintk("%s getdevice info returns %d\n", __func__, rc);
-	if (rc)
+	if (rc) {
+		rv = ERR_PTR(rc);
 		goto out_free;
+	}

 	rv = nfs4_blk_decode_device(server, dev);
  out_free:
@@ -865,7 +866,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 	struct pnfs_devicelist *dlist = NULL;
 	struct pnfs_block_dev *bdev;
 	LIST_HEAD(block_disklist);
-	int status = 0, i;
+	int status, i;

 	dprintk("%s enter\n", __func__);

@@ -897,8 +898,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 	for (i = 0; i < dlist->num_devs; i++) {
 		bdev = nfs4_blk_get_deviceinfo(server, fh,
 					       &dlist->dev_id[i]);
-		if (!bdev) {
-			status = -ENODEV;
+		if (IS_ERR(bdev)) {
+			status = PTR_ERR(bdev);
 			goto out_error;
 		}
 		spin_lock(&b_mt_id->bm_lock);
@@ -959,7 +960,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 };

 static const struct rpc_pipe_ops bl_upcall_ops = {
-	.upcall		= bl_pipe_upcall,
+	.upcall		= rpc_pipe_generic_upcall,
 	.downcall	= bl_pipe_downcall,
 	.destroy_msg	= bl_pipe_destroy_msg,
 };
@@ -988,17 +989,20 @@ static int __init nfs4blocklayout_init(void)
 					 mnt,
 					 NFS_PIPE_DIRNAME, 0, &path);
 	if (ret)
-		goto out_remove;
+		goto out_putrpc;

 	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
 				    &bl_upcall_ops, 0);
+	path_put(&path);
 	if (IS_ERR(bl_device_pipe)) {
 		ret = PTR_ERR(bl_device_pipe);
-		goto out_remove;
+		goto out_putrpc;
 	}
 out:
 	return ret;

+out_putrpc:
+	rpc_put_mount();
 out_remove:
 	pnfs_unregister_layoutdriver(&blocklayout_type);
 	return ret;
@@ -1011,6 +1015,7 @@ static void __exit nfs4blocklayout_exit(void)

 	pnfs_unregister_layoutdriver(&blocklayout_type);
 	rpc_unlink(bl_device_pipe);
+	rpc_put_mount();
 }

 MODULE_ALIAS("nfs-layouttype4-3");
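The hunks above convert the blocklayout error paths from bare NULL returns to the kernel's error-pointer idiom, so bl_set_layoutdriver() can propagate the real errno instead of guessing -ENODEV. A minimal userspace sketch of how ERR_PTR()/IS_ERR()/PTR_ERR() encode an errno inside a pointer (the real macros live in include/linux/err.h; alloc_device() here is a made-up stand-in, not kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

#define MAX_ERRNO 4095	/* same window the kernel reserves for error pointers */

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical allocator: can now distinguish -ENOMEM from other failures. */
static void *alloc_device(int fail)
{
	if (fail)
		return ERR_PTR(-ENOMEM);	/* was "return NULL" before the patch */
	return malloc(16);
}

int main(void)
{
	void *dev = alloc_device(1);

	if (IS_ERR(dev)) {
		/* the caller sees the real errno instead of a generic failure */
		printf("alloc_device failed: %ld\n", PTR_ERR(dev));
		return 1;
	}
	free(dev);
	return 0;
}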
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f27d827960a3..42acf7ef5992 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -150,7 +150,7 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
 }

 struct bl_dev_msg {
-	int status;
+	int32_t status;
 	uint32_t major, minor;
 };

@@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq;
 #define BL_DEVICE_REQUEST_ERR	0x2 /* User level process fails */

 /* blocklayoutdev.c */
-ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-		       char __user *, size_t);
 ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
 void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
 struct block_device *nfs4_blkdev_get(dev_t dev);
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a83b393fb01c..d08ba9107fde 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev)
 	return blkdev_put(bdev, FMODE_READ);
 }

-/*
- * Shouldn't there be a rpc_generic_upcall() to do this for us?
- */
-ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-		       char __user *dst, size_t buflen)
-{
-	char *data = (char *)msg->data + msg->copied;
-	size_t mlen = min(msg->len - msg->copied, buflen);
-	unsigned long left;
-
-	left = copy_to_user(dst, data, mlen);
-	if (left == mlen) {
-		msg->errno = -EFAULT;
-		return -EFAULT;
-	}
-
-	mlen -= left;
-	msg->copied += mlen;
-	msg->errno = 0;
-	return mlen;
-}
-
 static struct bl_dev_msg bl_mount_reply;

 ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
@@ -131,7 +109,7 @@ struct pnfs_block_dev *
 nfs4_blk_decode_device(struct nfs_server *server,
 		       struct pnfs_device *dev)
 {
-	struct pnfs_block_dev *rv = NULL;
+	struct pnfs_block_dev *rv;
 	struct block_device *bd = NULL;
 	struct rpc_pipe_msg msg;
 	struct bl_msg_hdr bl_msg = {
@@ -141,7 +119,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
 	uint8_t *dataptr;
 	DECLARE_WAITQUEUE(wq, current);
 	struct bl_dev_msg *reply = &bl_mount_reply;
-	int offset, len, i;
+	int offset, len, i, rc;

 	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
 	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
@@ -168,8 +146,10 @@ nfs4_blk_decode_device(struct nfs_server *server,

 	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
 	add_wait_queue(&bl_wq, &wq);
-	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+	rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg);
+	if (rc < 0) {
 		remove_wait_queue(&bl_wq, &wq);
+		rv = ERR_PTR(rc);
 		goto out;
 	}

@@ -187,8 +167,9 @@ nfs4_blk_decode_device(struct nfs_server *server,

 	bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
 	if (IS_ERR(bd)) {
-		dprintk("%s failed to open device : %ld\n",
-			__func__, PTR_ERR(bd));
+		rc = PTR_ERR(bd);
+		dprintk("%s failed to open device : %d\n", __func__, rc);
+		rv = ERR_PTR(rc);
 		goto out;
 	}

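bl_pipe_upcall() above (and idmap_pipe_upcall(), removed later in this diff) duplicated the same partial-copy bookkeeping, which is why both collapse into the shared rpc_pipe_generic_upcall(). A userspace sketch of that bookkeeping, with a simplified message struct and memcpy() standing in for copy_to_user():

#include <stdio.h>
#include <string.h>

/* Simplified stand-in for struct rpc_pipe_msg. */
struct pipe_msg {
	const char *data;
	size_t len;	/* total payload length */
	size_t copied;	/* how much earlier reads already consumed */
};

/* One read() worth of upcall data; mirrors the helper removed above. */
static size_t pipe_upcall(struct pipe_msg *msg, char *dst, size_t buflen)
{
	size_t mlen = msg->len - msg->copied;

	if (mlen > buflen)
		mlen = buflen;	/* min(msg->len - msg->copied, buflen) */
	memcpy(dst, msg->data + msg->copied, mlen);
	msg->copied += mlen;	/* the next read resumes where this one stopped */
	return mlen;
}

int main(void)
{
	struct pipe_msg msg = { .data = "blocklayout-request", .len = 19 };
	char buf[8];
	size_t n;

	while ((n = pipe_upcall(&msg, buf, sizeof(buf))) > 0)
		printf("read %zu bytes\n", n);	/* 8, 8, 3 */
	return 0;
}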
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e3d294269058..516f3375e067 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv)
 	else
 		goto out_err;

-	return svc_prepare_thread(serv, &serv->sv_pools[0]);
+	return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);

 out_err:
 	if (ret == 0)
@@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
 	INIT_LIST_HEAD(&serv->sv_cb_list);
 	spin_lock_init(&serv->sv_cb_lock);
 	init_waitqueue_head(&serv->sv_cb_waitq);
-	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
+	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
 	if (IS_ERR(rqstp)) {
 		svc_xprt_put(serv->sv_bc_xprt);
 		serv->sv_bc_xprt = NULL;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b257383bb565..07df5f1d85e5 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,6 +38,7 @@ enum nfs4_callback_opnum {
 struct cb_process_state {
 	__be32 drc_status;
 	struct nfs_client *clp;
+	int slotid;
 };

 struct cb_compound_hdr_arg {
@@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall(
 		void *dummy, struct cb_process_state *cps);

 extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
-extern void nfs4_cb_take_slot(struct nfs_client *clp);

 struct cb_devicenotifyitem {
 	uint32_t cbd_notify_type;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 74780f9f852c..43926add945b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -348,7 +348,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 	/* Normal */
 	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
 		slot->seq_nr++;
-		return htonl(NFS4_OK);
+		goto out_ok;
 	}

 	/* Replay */
@@ -367,11 +367,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 	/* Wraparound */
 	if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
 		slot->seq_nr = 1;
-		return htonl(NFS4_OK);
+		goto out_ok;
 	}

 	/* Misordered request */
 	return htonl(NFS4ERR_SEQ_MISORDERED);
+out_ok:
+	tbl->highest_used_slotid = args->csa_slotid;
+	return htonl(NFS4_OK);
 }

 /*
@@ -433,26 +436,37 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 				struct cb_sequenceres *res,
 				struct cb_process_state *cps)
 {
+	struct nfs4_slot_table *tbl;
 	struct nfs_client *clp;
 	int i;
 	__be32 status = htonl(NFS4ERR_BADSESSION);

-	cps->clp = NULL;
-
 	clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
 	if (clp == NULL)
 		goto out;

+	tbl = &clp->cl_session->bc_slot_table;
+
+	spin_lock(&tbl->slot_tbl_lock);
 	/* state manager is resetting the session */
 	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
-		status = NFS4ERR_DELAY;
+		spin_unlock(&tbl->slot_tbl_lock);
+		status = htonl(NFS4ERR_DELAY);
+		/* Return NFS4ERR_BADSESSION if we're draining the session
+		 * in order to reset it.
+		 */
+		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
+			status = htonl(NFS4ERR_BADSESSION);
 		goto out;
 	}

 	status = validate_seqid(&clp->cl_session->bc_slot_table, args);
+	spin_unlock(&tbl->slot_tbl_lock);
 	if (status)
 		goto out;

+	cps->slotid = args->csa_slotid;
+
 	/*
 	 * Check for pending referring calls. If a match is found, a
 	 * related callback was received before the response to the original
@@ -469,7 +483,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	res->csr_slotid = args->csa_slotid;
 	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
 	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
-	nfs4_cb_take_slot(clp);

 out:
 	cps->clp = clp; /* put in nfs4_callback_compound */
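validate_seqid() now funnels both success paths through out_ok, so the slot is marked busy under the same slot_tbl_lock that nfs4_callback_sequence() takes around the call. The sequencing rules themselves are unchanged: seq+1 is the next request, 1 after 0xffffffff is a legal wraparound, and anything else out of order is rejected. A standalone sketch of that state machine with the slot table reduced to one counter (the return codes are local placeholders, not the on-the-wire NFS4ERR constants, and the replay check is an assumption since its body is outside this hunk):

#include <stdio.h>
#include <stdint.h>

enum seq_result { SEQ_OK, SEQ_REPLAY, SEQ_MISORDERED };

/* One backchannel slot: just the last sequence number we accepted. */
static enum seq_result validate_seqid(uint32_t *slot_seq, uint32_t seqid)
{
	if (seqid == *slot_seq + 1) {		/* normal: next in order */
		(*slot_seq)++;
		return SEQ_OK;
	}
	if (seqid == *slot_seq)			/* replay of the last request (assumed) */
		return SEQ_REPLAY;
	if (seqid == 1 && *slot_seq + 1 == 0) {	/* 32-bit wraparound */
		*slot_seq = 1;
		return SEQ_OK;
	}
	return SEQ_MISORDERED;
}

int main(void)
{
	uint32_t slot = 0xffffffff;	/* about to wrap */

	printf("%d\n", validate_seqid(&slot, 1));	/* SEQ_OK via wraparound */
	printf("%d\n", validate_seqid(&slot, 2));	/* SEQ_OK, normal */
	printf("%d\n", validate_seqid(&slot, 2));	/* SEQ_REPLAY */
	printf("%d\n", validate_seqid(&slot, 9));	/* SEQ_MISORDERED */
	return 0;
}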
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index c6c86a77e043..918ad647afea 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
 	 * Let the state manager know callback processing done.
 	 * A single slot, so highest used slotid is either 0 or -1
 	 */
-	tbl->highest_used_slotid--;
+	tbl->highest_used_slotid = -1;
 	nfs4_check_drain_bc_complete(session);
 	spin_unlock(&tbl->slot_tbl_lock);
 }

-static void nfs4_cb_free_slot(struct nfs_client *clp)
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
 {
-	if (clp && clp->cl_session)
-		nfs4_callback_free_slot(clp->cl_session);
-}
-
-/* A single slot, so highest used slotid is either 0 or -1 */
-void nfs4_cb_take_slot(struct nfs_client *clp)
-{
-	struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
-
-	spin_lock(&tbl->slot_tbl_lock);
-	tbl->highest_used_slotid++;
-	BUG_ON(tbl->highest_used_slotid != 0);
-	spin_unlock(&tbl->slot_tbl_lock);
+	if (cps->slotid != -1)
+		nfs4_callback_free_slot(cps->clp->cl_session);
 }

 #else /* CONFIG_NFS_V4_1 */
@@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
 }

-static void nfs4_cb_free_slot(struct nfs_client *clp)
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
@@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 	struct cb_process_state cps = {
 		.drc_status = 0,
 		.clp = NULL,
+		.slotid = -1,
 	};
 	unsigned int nops = 0;

@@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r

 	*hdr_res.status = status;
 	*hdr_res.nops = htonl(nops);
-	nfs4_cb_free_slot(cps.clp);
+	nfs4_cb_free_slot(&cps);
 	nfs_put_client(cps.clp);
 	dprintk("%s: done, status = %u\n", __func__, ntohl(status));
 	return rpc_success;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5833fbbf59b0..873bf00d51a2 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
 	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
 	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;

-	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-	    sin1->sin6_scope_id != sin2->sin6_scope_id)
+	if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
 		return 0;
+	else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		return sin1->sin6_scope_id == sin2->sin6_scope_id;

-	return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
+	return 1;
 }
 #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
 static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
@@ -1867,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
 	/* display one transport per line on subsequent lines */
 	clp = list_entry(v, struct nfs_client, cl_share_link);

+	/* Check if the client is initialized */
+	if (clp->cl_cons_state != NFS_CS_READY)
+		return 0;
+
 	seq_printf(m, "v%u %s %s %3d %s\n",
 		   clp->rpc_ops->version,
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
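The rewritten nfs_sockaddr_match_ipaddr6() makes the matching rule explicit: the address bytes must be equal, and link-local addresses must additionally agree on the scope ID (the interface they belong to). A userspace sketch of the same rule using the standard in6 helpers:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* 1 if the two sockaddr_in6 refer to the same endpoint address. */
static int match_ipaddr6(const struct sockaddr_in6 *a,
			 const struct sockaddr_in6 *b)
{
	if (memcmp(&a->sin6_addr, &b->sin6_addr, sizeof(a->sin6_addr)) != 0)
		return 0;		/* different addresses never match */
	if (IN6_IS_ADDR_LINKLOCAL(&a->sin6_addr))
		return a->sin6_scope_id == b->sin6_scope_id;
	return 1;			/* global scope: address equality suffices */
}

int main(void)
{
	struct sockaddr_in6 a = { .sin6_family = AF_INET6, .sin6_scope_id = 1 };
	struct sockaddr_in6 b = a;

	/* fe80::/10 is link-local */
	a.sin6_addr.s6_addr[0] = b.sin6_addr.s6_addr[0] = 0xfe;
	a.sin6_addr.s6_addr[1] = b.sin6_addr.s6_addr[1] = 0x80;

	b.sin6_scope_id = 2;
	printf("%d\n", match_ipaddr6(&a, &b));	/* 0: same addr, other interface */
	b.sin6_scope_id = 1;
	printf("%d\n", match_ipaddr6(&a, &b));	/* 1 */
	return 0;
}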
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 321a66bc3846..7f2654069806 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	       sizeof(delegation->stateid.data));
 	delegation->type = res->delegation_type;
 	delegation->maxsize = res->maxsize;
-	delegation->change_attr = nfsi->change_attr;
+	delegation->change_attr = inode->i_version;
 	delegation->cred = get_rpccred(cred);
 	delegation->inode = inode;
 	delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 28b8c3f3cda3..91c01f0a4c3b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -180,8 +180,6 @@ force_reval:

 static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 {
-	loff_t loff;
-
 	dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
 			filp->f_path.dentry->d_parent->d_name.name,
 			filp->f_path.dentry->d_name.name,
@@ -197,13 +195,9 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 		int retval = nfs_revalidate_file_size(inode, filp);
 		if (retval < 0)
 			return (loff_t)retval;
+	}

-		spin_lock(&inode->i_lock);
-		loff = generic_file_llseek_unlocked(filp, offset, origin);
-		spin_unlock(&inode->i_lock);
-	} else
-		loff = generic_file_llseek_unlocked(filp, offset, origin);
-	return loff;
+	return generic_file_llseek(filp, offset, origin);
 }

 /*
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 5b1006480bc2..7cf2c4699b08 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -212,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
 	auxdata.ctime = nfsi->vfs_inode.i_ctime;

 	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
-		auxdata.change_attr = nfsi->change_attr;
+		auxdata.change_attr = nfsi->vfs_inode.i_version;

 	if (bufmax > sizeof(auxdata))
 		bufmax = sizeof(auxdata);
@@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
 	auxdata.ctime = nfsi->vfs_inode.i_ctime;

 	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
-		auxdata.change_attr = nfsi->change_attr;
+		auxdata.change_attr = nfsi->vfs_inode.i_version;

 	if (memcmp(data, &auxdata, datalen) != 0)
 		return FSCACHE_CHECKAUX_OBSOLETE;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index f20801ae0a16..47d1c6ff2d8e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -336,8 +336,6 @@ struct idmap {
 	struct idmap_hashtable	idmap_group_hash;
 };

-static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-				 char __user *, size_t);
 static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
 				   size_t);
 static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
@@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 static unsigned int fnvhash32(const void *, size_t);

 static const struct rpc_pipe_ops idmap_upcall_ops = {
-	.upcall		= idmap_pipe_upcall,
+	.upcall		= rpc_pipe_generic_upcall,
 	.downcall	= idmap_pipe_downcall,
 	.destroy_msg	= idmap_pipe_destroy_msg,
 };
@@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
 	return ret;
 }

-/* RPC pipefs upcall/downcall routines */
-static ssize_t
-idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-		  char __user *dst, size_t buflen)
-{
-	char *data = (char *)msg->data + msg->copied;
-	size_t mlen = min(msg->len, buflen);
-	unsigned long left;
-
-	left = copy_to_user(dst, data, mlen);
-	if (left == mlen) {
-		msg->errno = -EFAULT;
-		return -EFAULT;
-	}
-
-	mlen -= left;
-	msg->copied += mlen;
-	msg->errno = 0;
-	return mlen;
-}
-
 static ssize_t
 idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fe1203797b2b..4dc6d078f108 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -318,7 +318,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		memset(&inode->i_atime, 0, sizeof(inode->i_atime));
 		memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
 		memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
-		nfsi->change_attr = 0;
+		inode->i_version = 0;
 		inode->i_size = 0;
 		inode->i_nlink = 0;
 		inode->i_uid = -2;
@@ -344,7 +344,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 				| NFS_INO_INVALID_ACCESS
 				| NFS_INO_INVALID_ACL;
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
-			nfsi->change_attr = fattr->change_attr;
+			inode->i_version = fattr->change_attr;
 		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
 			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
 				| NFS_INO_INVALID_DATA;
@@ -897,8 +897,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr

 	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
 			&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
-			&& nfsi->change_attr == fattr->pre_change_attr) {
-		nfsi->change_attr = fattr->change_attr;
+			&& inode->i_version == fattr->pre_change_attr) {
+		inode->i_version = fattr->change_attr;
 		if (S_ISDIR(inode->i_mode))
 			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 		ret |= NFS_INO_INVALID_ATTR;
@@ -952,7 +952,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 		return -EIO;

 	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
-			nfsi->change_attr != fattr->change_attr)
+			inode->i_version != fattr->change_attr)
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;

 	/* Verify a few of the more important attributes */
@@ -1163,7 +1163,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
 	}
 	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
 			(fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
-		fattr->pre_change_attr = NFS_I(inode)->change_attr;
+		fattr->pre_change_attr = inode->i_version;
 		fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
 	}
 	if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
@@ -1244,13 +1244,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)

 	/* More cache consistency checks */
 	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
-		if (nfsi->change_attr != fattr->change_attr) {
+		if (inode->i_version != fattr->change_attr) {
 			dprintk("NFS: change_attr change on server for file %s/%ld\n",
 					inode->i_sb->s_id, inode->i_ino);
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			if (S_ISDIR(inode->i_mode))
 				nfs_force_lookup_revalidate(inode);
-			nfsi->change_attr = fattr->change_attr;
+			inode->i_version = fattr->change_attr;
 		}
 	} else if (server->caps & NFS_CAP_CHANGE_ATTR)
 		invalid |= save_cache_validity;
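The inode.c hunks retire the NFS-private nfsi->change_attr in favour of the generic inode->i_version field; the invalidation rule itself is untouched: a server change attribute that differs from the cached one means cached attributes and data may be stale. A toy sketch of that comparison (struct and flag names simplified, not the kernel's):

#include <stdio.h>
#include <stdint.h>

#define INVALID_ATTR	0x1
#define INVALID_DATA	0x2

struct toy_inode {
	uint64_t i_version;	/* cached NFSv4 change attribute */
	unsigned cache_validity;
};

/* Compare a fresh server change attribute against the cached one. */
static void check_change_attr(struct toy_inode *inode, uint64_t server_change)
{
	if (inode->i_version != server_change) {
		inode->cache_validity |= INVALID_ATTR | INVALID_DATA;
		inode->i_version = server_change;	/* adopt the new value */
	}
}

int main(void)
{
	struct toy_inode ino = { .i_version = 41 };

	check_change_attr(&ino, 41);	/* unchanged: cache stays valid */
	check_change_attr(&ino, 42);	/* changed on server: invalidate */
	printf("validity flags: %#x, version: %llu\n",
	       ino.cache_validity, (unsigned long long)ino.i_version);
	return 0;
}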
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ab12913dd473..c1a1bd8ddf1c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -457,13 +457,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }

-/*
- * Helper for restarting RPC calls in the possible presence of NFSv4.1
- * sessions.
- */
-static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
-{
-	if (nfs4_has_session(clp))
-		return rpc_restart_call_prepare(task);
-	return rpc_restart_call(task);
-}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1ec1a85fa71c..693ae22f8731 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -13,30 +13,6 @@

 struct idmap;

-/*
- * In a seqid-mutating op, this macro controls which error return
- * values trigger incrementation of the seqid.
- *
- * from rfc 3010:
- * The client MUST monotonically increment the sequence number for the
- * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE
- * operations. This is true even in the event that the previous
- * operation that used the sequence number received an error. The only
- * exception to this rule is if the previous operation received one of
- * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID,
- * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR,
- * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE.
- *
- */
-#define seqid_mutating_err(err) \
-(((err) != NFSERR_STALE_CLIENTID) && \
- ((err) != NFSERR_STALE_STATEID) && \
- ((err) != NFSERR_BAD_STATEID) && \
- ((err) != NFSERR_BAD_SEQID) && \
- ((err) != NFSERR_BAD_XDR) && \
- ((err) != NFSERR_RESOURCE) && \
- ((err) != NFSERR_NOFILEHANDLE))
-
 enum nfs4_client_state {
 	NFS4CLNT_MANAGER_RUNNING = 0,
 	NFS4CLNT_CHECK_LEASE,
@@ -56,6 +32,9 @@ enum nfs4_session_state {
 	NFS4_SESSION_DRAINING,
 };

+#define NFS4_RENEW_TIMEOUT		0x01
+#define NFS4_RENEW_DELEGATION_CB	0x02
+
 struct nfs4_minor_version_ops {
 	u32	minor_version;

@@ -225,7 +204,7 @@ struct nfs4_state_recovery_ops {
 };

 struct nfs4_state_maintenance_ops {
-	int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *);
+	int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
 	struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
 	int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
 };
@@ -237,8 +216,6 @@ extern const struct inode_operations nfs4_dir_inode_operations;
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
-extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
@@ -349,6 +326,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
 extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_lease_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
 extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index e8915d4840ad..09119418402f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -77,19 +77,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 	BUG();
 }

-/* For data server errors we don't recover from */
-static void
-filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
-{
-	if (lseg->pls_range.iomode == IOMODE_RW) {
-		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-	} else {
-		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
-	}
-}
-
 static int filelayout_async_handle_error(struct rpc_task *task,
 					 struct nfs4_state *state,
 					 struct nfs_client *clp,
@@ -135,7 +122,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 static int filelayout_read_done_cb(struct rpc_task *task,
 				   struct nfs_read_data *data)
 {
-	struct nfs_client *clp = data->ds_clp;
 	int reset = 0;

 	dprintk("%s DS read\n", __func__);
@@ -145,11 +131,10 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
-			filelayout_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(data->lseg);
 			nfs4_reset_read(task, data);
-			clp = NFS_SERVER(data->inode)->nfs_client;
 		}
-		nfs_restart_rpc(task, clp);
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}

@@ -216,17 +201,13 @@ static int filelayout_write_done_cb(struct rpc_task *task,

 	if (filelayout_async_handle_error(task, data->args.context->state,
 					  data->ds_clp, &reset) == -EAGAIN) {
-		struct nfs_client *clp;
-
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
-			filelayout_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(data->lseg);
 			nfs4_reset_write(task, data);
-			clp = NFS_SERVER(data->inode)->nfs_client;
-		} else
-			clp = data->ds_clp;
-		nfs_restart_rpc(task, clp);
+		}
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}

@@ -256,9 +237,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
 			prepare_to_resend_writes(data);
-			filelayout_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(data->lseg);
 		} else
-			nfs_restart_rpc(task, data->ds_clp);
+			rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8c77039e7a81..d2ae413c986a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -73,9 +73,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
-static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir,
-		const struct qstr *name, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			    struct nfs_fattr *fattr, struct iattr *sattr,
@@ -753,9 +750,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)

 	spin_lock(&dir->i_lock);
 	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
-	if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
+	if (!cinfo->atomic || cinfo->before != dir->i_version)
 		nfs_force_lookup_revalidate(dir);
-	nfsi->change_attr = cinfo->after;
+	dir->i_version = cinfo->after;
 	spin_unlock(&dir->i_lock);
 }

@@ -1596,8 +1593,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	int status;

 	status = nfs4_run_open_task(data, 0);
-	if (status != 0 || !data->rpc_done)
+	if (!data->rpc_done)
+		return status;
+	if (status != 0) {
+		if (status == -NFS4ERR_BADNAME &&
+		    !(o_arg->open_flags & O_CREAT))
+			return -ENOENT;
 		return status;
+	}

 	if (o_arg->open_flags & O_CREAT) {
 		update_changeattr(dir, &o_res->cinfo);
@@ -2408,14 +2411,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	return status;
 }

-static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,
-		const struct nfs_fh *dirfh, const struct qstr *name,
-		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
+		const struct qstr *name, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr)
 {
+	struct nfs_server *server = NFS_SERVER(dir);
 	int status;
 	struct nfs4_lookup_arg args = {
 		.bitmask = server->attr_bitmask,
-		.dir_fh = dirfh,
+		.dir_fh = NFS_FH(dir),
 		.name = name,
 	};
 	struct nfs4_lookup_res res = {
@@ -2431,40 +2435,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,

 	nfs_fattr_init(fattr);

-	dprintk("NFS call lookupfh %s\n", name->name);
-	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
-	dprintk("NFS reply lookupfh: %d\n", status);
-	return status;
-}
-
-static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
-			      struct qstr *name, struct nfs_fh *fhandle,
-			      struct nfs_fattr *fattr)
-{
-	struct nfs4_exception exception = { };
-	int err;
-	do {
-		err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr);
-		/* FIXME: !!!! */
-		if (err == -NFS4ERR_MOVED) {
-			err = -EREMOTE;
-			break;
-		}
-		err = nfs4_handle_exception(server, err, &exception);
-	} while (exception.retry);
-	return err;
-}
-
-static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
-		const struct qstr *name, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
-{
-	int status;
-
 	dprintk("NFS call lookup %s\n", name->name);
-	status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr);
-	if (status == -NFS4ERR_MOVED)
-		status = nfs4_get_referral(dir, name, fattr, fhandle);
+	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
 	dprintk("NFS reply lookup: %d\n", status);
 	return status;
 }
@@ -2485,11 +2457,20 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst
 	struct nfs4_exception exception = { };
 	int err;
 	do {
-		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_lookup(clnt, dir, name, fhandle, fattr),
-				&exception);
-		if (err == -EPERM)
+		int status;
+
+		status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr);
+		switch (status) {
+		case -NFS4ERR_BADNAME:
+			return -ENOENT;
+		case -NFS4ERR_MOVED:
+			err = nfs4_get_referral(dir, name, fattr, fhandle);
+			break;
+		case -NFS4ERR_WRONGSEC:
 			nfs_fixup_secinfo_attributes(fattr, fhandle);
+		}
+		err = nfs4_handle_exception(NFS_SERVER(dir),
+				status, &exception);
 	} while (exception.retry);
 	return err;
 }
@@ -3210,7 +3191,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 	struct nfs_server *server = NFS_SERVER(data->inode);

 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
-		nfs_restart_rpc(task, server->nfs_client);
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}

@@ -3260,7 +3241,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
 	struct inode *inode = data->inode;

 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
-		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 	if (task->tk_status >= 0) {
@@ -3317,7 +3298,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat
 	struct inode *inode = data->inode;

 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
-		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 	nfs_refresh_inode(inode, data->res.fattr);
@@ -3374,9 +3355,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)

 	if (task->tk_status < 0) {
 		/* Unless we're shutting down, schedule state recovery! */
-		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
+		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
+			return;
+		if (task->tk_status != NFS4ERR_CB_PATH_DOWN) {
 			nfs4_schedule_lease_recovery(clp);
-		return;
+			return;
+		}
+		nfs4_schedule_path_down_recovery(clp);
 	}
 	do_renew_lease(clp, timestamp);
 }
@@ -3386,7 +3371,7 @@ static const struct rpc_call_ops nfs4_renew_ops = {
 	.rpc_release = nfs4_renew_release,
 };

-int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
 {
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -3395,9 +3380,11 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
 	};
 	struct nfs4_renewdata *data;

+	if (renew_flags == 0)
+		return 0;
 	if (!atomic_inc_not_zero(&clp->cl_count))
 		return -EIO;
-	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	data = kmalloc(sizeof(*data), GFP_NOFS);
 	if (data == NULL)
 		return -ENOMEM;
 	data->client = clp;
@@ -3406,7 +3393,7 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
 					&nfs4_renew_ops, data);
 }

-int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -3851,7 +3838,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 	default:
 		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
 				-EAGAIN) {
-			nfs_restart_rpc(task, data->res.server->nfs_client);
+			rpc_restart_call_prepare(task);
 			return;
 		}
 	}
@@ -4105,8 +4092,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 		break;
 	default:
 		if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
-			nfs_restart_rpc(task,
-				calldata->server->nfs_client);
+			rpc_restart_call_prepare(task);
 	}
 }

@@ -4939,7 +4925,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 		task->tk_status = 0;
 		/* fall through */
 	case -NFS4ERR_RETRY_UNCACHED_REP:
-		nfs_restart_rpc(task, data->clp);
+		rpc_restart_call_prepare(task);
 		return;
 	}
 	dprintk("<-- %s\n", __func__);
@@ -5504,11 +5490,13 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
 	return rpc_run_task(&task_setup_data);
 }

-static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
 {
 	struct rpc_task *task;
 	int ret = 0;

+	if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
+		return 0;
 	task = _nfs41_proc_sequence(clp, cred);
 	if (IS_ERR(task))
 		ret = PTR_ERR(task);
@@ -5778,7 +5766,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)

 	server = NFS_SERVER(lrp->args.inode);
 	if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
-		nfs_restart_rpc(task, lrp->clp);
+		rpc_restart_call_prepare(task);
 		return;
 	}
 	spin_lock(&lo->plh_inode->i_lock);
@@ -5949,7 +5937,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
 	}

 	if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
-		nfs_restart_rpc(task, server->nfs_client);
+		rpc_restart_call_prepare(task);
 		return;
 	}

@@ -6262,7 +6250,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.getroot	= nfs4_proc_get_root,
 	.getattr	= nfs4_proc_getattr,
 	.setattr	= nfs4_proc_setattr,
-	.lookupfh	= nfs4_proc_lookupfh,
 	.lookup		= nfs4_proc_lookup,
 	.access		= nfs4_proc_access,
 	.readlink	= nfs4_proc_readlink,
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index df8e7f3ca56d..dc484c0eae7f 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -60,6 +60,7 @@ nfs4_renew_state(struct work_struct *work)
 	struct rpc_cred *cred;
 	long lease;
 	unsigned long last, now;
+	unsigned renew_flags = 0;

 	ops = clp->cl_mvops->state_renewal_ops;
 	dprintk("%s: start\n", __func__);
@@ -72,18 +73,23 @@ nfs4_renew_state(struct work_struct *work)
 	last = clp->cl_last_renewal;
 	now = jiffies;
 	/* Are we close to a lease timeout? */
-	if (time_after(now, last + lease/3)) {
+	if (time_after(now, last + lease/3))
+		renew_flags |= NFS4_RENEW_TIMEOUT;
+	if (nfs_delegations_present(clp))
+		renew_flags |= NFS4_RENEW_DELEGATION_CB;
+
+	if (renew_flags != 0) {
 		cred = ops->get_state_renewal_cred_locked(clp);
 		spin_unlock(&clp->cl_lock);
 		if (cred == NULL) {
-			if (!nfs_delegations_present(clp)) {
+			if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
 				set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 				goto out;
 			}
 			nfs_expire_all_delegations(clp);
 		} else {
 			/* Queue an asynchronous RENEW. */
-			ops->sched_state_renewal(clp, cred);
+			ops->sched_state_renewal(clp, cred, renew_flags);
 			put_rpccred(cred);
 			goto out_exp;
 		}
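nfs4_renew_state() now records *why* renewal work is needed in a bitmask instead of a single timeout test, and passes it to sched_state_renewal(); the NFSv4.1 variant above, for example, only issues a SEQUENCE op when NFS4_RENEW_TIMEOUT is set. A sketch of the flag computation with the two inputs reduced to booleans:

#include <stdio.h>

#define NFS4_RENEW_TIMEOUT		0x01
#define NFS4_RENEW_DELEGATION_CB	0x02

/* Mirrors the decision in nfs4_renew_state(); inputs simplified to flags. */
static unsigned compute_renew_flags(int near_lease_timeout,
				    int delegations_present)
{
	unsigned renew_flags = 0;

	if (near_lease_timeout)
		renew_flags |= NFS4_RENEW_TIMEOUT;
	if (delegations_present)
		renew_flags |= NFS4_RENEW_DELEGATION_CB;
	return renew_flags;		/* 0 means: nothing to do this period */
}

int main(void)
{
	printf("%#x\n", compute_renew_flags(0, 0));	/* 0: skip renewal */
	printf("%#x\n", compute_renew_flags(1, 0));	/* lease timeout only */
	printf("%#x\n", compute_renew_flags(0, 1));	/* delegation check only */
	return 0;
}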
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 72ab97ef3d61..39914be40b03 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1038,6 +1038,12 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 	nfs4_schedule_state_manager(clp);
 }

+void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
+{
+	nfs_handle_cb_pathdown(clp);
+	nfs4_schedule_state_manager(clp);
+}
+
 static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
 {

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9383ca7245bc..d0cda12fddc3 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write)
 	for (i = 0; i < ios->numdevs; i++) {
 		struct osd_sense_info osi;
 		struct osd_request *or = ios->per_dev[i].or;
-		unsigned dev;
 		int ret;

 		if (!or)
@@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write)

 			continue; /* we recovered */
 		}
-		dev = ios->per_dev[i].dev;
-		objlayout_io_set_result(&ios->ol_state, dev,
-					&ios->layout->comps[dev].oc_object_id,
+		objlayout_io_set_result(&ios->ol_state, i,
+					&ios->layout->comps[i].oc_object_id,
 					osd_pri_2_pnfs_err(osi.osd_err_pri),
 					ios->per_dev[i].offset,
 					ios->per_dev[i].length,
@@ -589,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
 }

 static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
-		unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
+		unsigned pgbase, struct _objio_per_comp *per_dev, int len,
 		gfp_t gfp_flags)
 {
 	unsigned pg = *cur_pg;
+	int cur_len = len;
 	struct request_queue *q =
 			osd_request_queue(_io_od(ios, per_dev->dev));

-	per_dev->length += cur_len;
-
 	if (per_dev->bio == NULL) {
-		unsigned stripes = ios->layout->num_comps /
-				     ios->layout->mirrors_p1;
-		unsigned pages_in_stripe = stripes *
+		unsigned pages_in_stripe = ios->layout->group_width *
 			(ios->layout->stripe_unit / PAGE_SIZE);
 		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
-				    stripes;
+				    ios->layout->group_width;

 		if (BIO_MAX_PAGES_KMALLOC < bio_size)
 			bio_size = BIO_MAX_PAGES_KMALLOC;
@@ -632,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
 	}
 	BUG_ON(cur_len);

+	per_dev->length += len;
 	*cur_pg = pg;
 	return 0;
 }
@@ -650,7 +646,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
 	int ret = 0;

 	while (length) {
-		struct _objio_per_comp *per_dev = &ios->per_dev[dev];
+		struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
 		unsigned cur_len, page_off = 0;

 		if (!per_dev->length) {
@@ -670,8 +666,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
 				cur_len = stripe_unit;
 			}

-			if (max_comp < dev)
-				max_comp = dev;
+			if (max_comp < dev - first_dev)
+				max_comp = dev - first_dev;
 		} else {
 			cur_len = stripe_unit;
 		}
@@ -806,7 +802,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
 	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
 	unsigned dev = per_dev->dev;
 	struct pnfs_osd_object_cred *cred =
-			&ios->layout->comps[dev];
+			&ios->layout->comps[cur_comp];
 	struct osd_obj_id obj = {
 		.partition = cred->oc_object_id.oid_partition_id,
 		.id = cred->oc_object_id.oid_object_id,
@@ -904,7 +900,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
 	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
 		struct osd_request *or = NULL;
 		struct pnfs_osd_object_cred *cred =
-			&ios->layout->comps[dev];
+			&ios->layout->comps[cur_comp];
 		struct osd_obj_id obj = {
 			.partition = cred->oc_object_id.oid_partition_id,
 			.id = cred->oc_object_id.oid_object_id,
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
index 16fc758e9123..b3918f7ac34d 100644
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
 	p = _osd_xdr_decode_data_map(p, &layout->olo_map);
 	layout->olo_comps_index = be32_to_cpup(p++);
 	layout->olo_num_comps = be32_to_cpup(p++);
+	dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__,
+		layout->olo_comps_index, layout->olo_num_comps);
+
 	iter->total_comps = layout->olo_num_comps;
 	return 0;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e550e8836c37..ee73d9a4f700 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1168,23 +1168,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 /*
  * Called by non rpc-based layout drivers
  */
-int
-pnfs_ld_write_done(struct nfs_write_data *data)
+void pnfs_ld_write_done(struct nfs_write_data *data)
 {
-	int status;
-
-	if (!data->pnfs_error) {
+	if (likely(!data->pnfs_error)) {
 		pnfs_set_layoutcommit(data);
 		data->mds_ops->rpc_call_done(&data->task, data);
-		data->mds_ops->rpc_release(data);
-		return 0;
+	} else {
+		put_lseg(data->lseg);
+		data->lseg = NULL;
+		dprintk("pnfs write error = %d\n", data->pnfs_error);
 	}
-
-	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
-		data->pnfs_error);
-	status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
-		data->mds_ops, NFS_FILE_SYNC);
-	return status ? : -EAGAIN;
+	data->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
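The rewrite above drops the in-line retry through the MDS in favour of a simpler rule: on success run the normal completion callback, on error drop the layout segment, and in both cases release exactly once. A userspace sketch of that control-flow shape (the types and helper names here are invented for illustration, not part of the patch):

	#include <stdio.h>
	#include <stdlib.h>

	struct io_result {
		int error;	/* 0 on success */
		void *lseg;	/* layout segment reference, may be NULL */
	};

	static void complete_ok(struct io_result *r) { printf("completed %p\n", (void *)r); }
	static void release(struct io_result *r)     { printf("released (err=%d)\n", r->error); }

	/* One exit path: release() runs on success and on error alike. */
	static void io_done(struct io_result *r)
	{
		if (!r->error) {
			complete_ok(r);
		} else {
			free(r->lseg);		/* analogous to put_lseg() */
			r->lseg = NULL;
			printf("error = %d\n", r->error);
		}
		release(r);
	}

	int main(void)
	{
		struct io_result ok = { 0, NULL }, bad = { -5, malloc(16) };
		io_done(&ok);
		io_done(&bad);
		return 0;
	}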
@@ -1268,23 +1262,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 /*
  * Called by non rpc-based layout drivers
  */
-int
-pnfs_ld_read_done(struct nfs_read_data *data)
+void pnfs_ld_read_done(struct nfs_read_data *data)
 {
-	int status;
-
-	if (!data->pnfs_error) {
+	if (likely(!data->pnfs_error)) {
 		__nfs4_read_done_cb(data);
 		data->mds_ops->rpc_call_done(&data->task, data);
-		data->mds_ops->rpc_release(data);
-		return 0;
+	} else {
+		put_lseg(data->lseg);
+		data->lseg = NULL;
+		dprintk("pnfs write error = %d\n", data->pnfs_error);
 	}
-
-	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
-		data->pnfs_error);
-	status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
-		data->mds_ops);
-	return status ? : -EAGAIN;
+	data->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
 
@@ -1381,6 +1369,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 	}
 }
 
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+	} else {
+		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
+
 void
 pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 01cbfd54f3cb..1509530cb111 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -178,6 +178,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
 void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
 bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
@@ -200,8 +201,8 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
-int pnfs_ld_write_done(struct nfs_write_data *);
-int pnfs_ld_read_done(struct nfs_read_data *);
+void pnfs_ld_write_done(struct nfs_write_data *);
+void pnfs_ld_read_done(struct nfs_read_data *);
 struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       struct nfs_open_context *ctx,
 					       loff_t pos,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2171c043ab08..8b48ec63f722 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -35,16 +35,13 @@ static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
-static mempool_t *nfs_rdata_mempool;
-
-#define MIN_POOL_READ	(32)
 
 struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
+	struct nfs_read_data *p;
 
+	p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
 	if (p) {
-		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
 		p->npages = pagecount;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
@@ -52,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		else {
 			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
 			if (!p->pagevec) {
-				mempool_free(p, nfs_rdata_mempool);
+				kmem_cache_free(nfs_rdata_cachep, p);
 				p = NULL;
 			}
 		}
@@ -64,7 +61,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
 {
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
-	mempool_free(p, nfs_rdata_mempool);
+	kmem_cache_free(nfs_rdata_cachep, p);
 }
 
 void nfs_readdata_release(struct nfs_read_data *rdata)
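Dropping the mempool means read-data allocation can now simply fail under memory pressure instead of drawing on a reserved pool; the callers already handle a NULL return. A rough userspace analogue of the before/after, assuming calloc stands in for kmem_cache_zalloc and a static reserve array stands in for the mempool:

	#include <stdlib.h>
	#include <string.h>

	struct rdata { char buf[256]; };

	/* Before: a small reserve guarantees forward progress under pressure. */
	static struct rdata reserve[32];
	static int reserved_free = 32;

	static struct rdata *alloc_with_pool(void)
	{
		struct rdata *p = calloc(1, sizeof(*p));
		if (!p && reserved_free > 0)
			p = memset(&reserve[--reserved_free], 0, sizeof(*p));
		return p;
	}

	/* After: plain zeroed allocation; NULL simply propagates to the caller. */
	static struct rdata *alloc_plain(void)
	{
		return calloc(1, sizeof(struct rdata));
	}

	int main(void)
	{
		struct rdata *a = alloc_with_pool();
		struct rdata *b = alloc_plain();
		return !(a && b);
	}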
@@ -276,7 +273,6 @@ nfs_async_read_error(struct list_head *head)
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		SetPageError(req->wb_page);
 		nfs_readpage_release(req);
 	}
 }
@@ -322,7 +318,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head
 		offset += len;
 	} while(nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
-	ClearPageError(page);
 	desc->pg_rpc_callops = &nfs_read_partial_ops;
 	return ret;
 out_bad:
@@ -331,7 +326,6 @@ out_bad:
 		list_del(&data->list);
 		nfs_readdata_free(data);
 	}
-	SetPageError(page);
 	nfs_readpage_release(req);
 	return -ENOMEM;
 }
@@ -357,7 +351,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, &data->pages);
-		ClearPageError(req->wb_page);
 		*pages++ = req->wb_page;
 	}
 	req = nfs_list_entry(data->pages.next);
@@ -435,7 +428,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
-	nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
+	rpc_restart_call_prepare(task);
 }
 
 /*
@@ -462,10 +455,10 @@ static void nfs_readpage_release_partial(void *calldata)
 	int status = data->task.tk_status;
 
 	if (status < 0)
-		SetPageError(page);
+		set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags);
 
 	if (atomic_dec_and_test(&req->wb_complete)) {
-		if (!PageError(page))
+		if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags))
 			SetPageUptodate(page);
 		nfs_readpage_release(req);
 	}
@@ -541,13 +534,23 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
 static void nfs_readpage_release_full(void *calldata)
 {
 	struct nfs_read_data *data = calldata;
+	struct nfs_pageio_descriptor pgio;
 
+	if (data->pnfs_error) {
+		nfs_pageio_init_read_mds(&pgio, data->inode);
+		pgio.pg_recoalesce = 1;
+	}
 	while (!list_empty(&data->pages)) {
 		struct nfs_page *req = nfs_list_entry(data->pages.next);
 
 		nfs_list_remove_request(req);
-		nfs_readpage_release(req);
+		if (!data->pnfs_error)
+			nfs_readpage_release(req);
+		else
+			nfs_pageio_add_request(&pgio, req);
 	}
+	if (data->pnfs_error)
+		nfs_pageio_complete(&pgio);
 	nfs_readdata_release(calldata);
 }
 
@@ -648,7 +651,6 @@ readpage_async_filler(void *data, struct page *page)
 	return 0;
 out_error:
 	error = PTR_ERR(new);
-	SetPageError(page);
 out_unlock:
 	unlock_page(page);
 	return error;
@@ -711,16 +713,10 @@ int __init nfs_init_readpagecache(void)
 	if (nfs_rdata_cachep == NULL)
 		return -ENOMEM;
 
-	nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ,
-						     nfs_rdata_cachep);
-	if (nfs_rdata_mempool == NULL)
-		return -ENOMEM;
-
 	return 0;
 }
 
 void nfs_destroy_readpagecache(void)
 {
-	mempool_destroy(nfs_rdata_mempool);
 	kmem_cache_destroy(nfs_rdata_cachep);
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b961ceac66b4..480b3b6bf71e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -733,18 +733,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 	return 0;
 }
+
+#ifdef CONFIG_NFS_V4
 #ifdef CONFIG_NFS_V4_1
-void show_sessions(struct seq_file *m, struct nfs_server *server)
+static void show_sessions(struct seq_file *m, struct nfs_server *server)
 {
 	if (nfs4_has_session(server->nfs_client))
 		seq_printf(m, ",sessions");
 }
 #else
-void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
 #endif
 
+#ifdef CONFIG_NFS_V4
 #ifdef CONFIG_NFS_V4_1
-void show_pnfs(struct seq_file *m, struct nfs_server *server)
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
 {
 	seq_printf(m, ",pnfs=");
 	if (server->pnfs_curr_ld)
@@ -752,9 +756,10 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server)
 	else
 		seq_printf(m, "not configured");
 }
-#else /* CONFIG_NFS_V4_1 */
-void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
-#endif /* CONFIG_NFS_V4_1 */
+#else
+static void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif
+#endif
 
 static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
 {
@@ -2035,9 +2040,6 @@ static inline void nfs_initialise_sb(struct super_block *sb)
 	sb->s_blocksize = nfs_block_bits(server->wsize,
 					 &sb->s_blocksize_bits);
 
-	if (server->flags & NFS_MOUNT_NOAC)
-		sb->s_flags |= MS_SYNCHRONOUS;
-
 	sb->s_bdi = &server->backing_dev_info;
 
 	nfs_super_set_maxbytes(sb, server->maxfilesize);
@@ -2249,6 +2251,10 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	if (server->flags & NFS_MOUNT_UNSHARED)
 		compare_super = NULL;
 
+	/* -o noac implies -o sync */
+	if (server->flags & NFS_MOUNT_NOAC)
+		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
 	s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
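Moving the MS_SYNCHRONOUS setting from nfs_initialise_sb() to before sget() matters because the flag now participates in the superblock lookup: a "noac" mount and a normal mount of the same export no longer share (and silently retoggle) one superblock. A sketch of the ordering, with invented stand-in names and flag values:

	#include <stdio.h>

	#define MS_SYNCHRONOUS	0x10
	#define MOUNT_NOAC	0x01

	struct mntdata { unsigned mntflags; };

	/* Stand-in for sget(): pretend the flags are part of the match key. */
	static void get_super(const struct mntdata *d)
	{
		printf("lookup superblock with flags=%#x\n", d->mntflags);
	}

	int main(void)
	{
		unsigned server_flags = MOUNT_NOAC;
		struct mntdata d = { 0 };

		/* -o noac implies -o sync: decide *before* the superblock
		 * lookup, so sharing decisions see the final flag set. */
		if (server_flags & MOUNT_NOAC)
			d.mntflags |= MS_SYNCHRONOUS;

		get_super(&d);
		return 0;
	}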
@@ -2361,6 +2367,10 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
 	if (server->flags & NFS_MOUNT_UNSHARED)
 		compare_super = NULL;
 
+	/* -o noac implies -o sync */
+	if (server->flags & NFS_MOUNT_NOAC)
+		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
 	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
@@ -2628,6 +2638,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
 	if (server->flags & NFS4_MOUNT_UNSHARED)
 		compare_super = NULL;
 
+	/* -o noac implies -o sync */
+	if (server->flags & NFS_MOUNT_NOAC)
+		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
 	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
@@ -2789,7 +2803,7 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
 		goto out_put_mnt_ns;
 
 	ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
-			export_path, LOOKUP_FOLLOW, &path);
+			export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
 
 	nfs_referral_loop_unprotect();
 	put_mnt_ns(ns_private);
@@ -2916,6 +2930,10 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
 	if (server->flags & NFS4_MOUNT_UNSHARED)
 		compare_super = NULL;
 
+	/* -o noac implies -o sync */
+	if (server->flags & NFS_MOUNT_NOAC)
+		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
 	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
@@ -3003,6 +3021,10 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
 	if (server->flags & NFS4_MOUNT_UNSHARED)
 		compare_super = NULL;
 
+	/* -o noac implies -o sync */
+	if (server->flags & NFS_MOUNT_NOAC)
+		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
 	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index b2fbbde58e44..4f9319a2e567 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 	struct inode *dir = data->dir;
 
 	if (!NFS_PROTO(dir)->unlink_done(task, dir))
-		nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
+		rpc_restart_call_prepare(task);
 }
 
 /**
@@ -369,7 +369,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
 	struct dentry *new_dentry = data->new_dentry;
 
 	if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
-		nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
+		rpc_restart_call_prepare(task);
 		return;
 	}
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b39b37f80913..2219c88d96b2 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -390,7 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
 	BUG_ON(error);
 	if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
-		nfsi->change_attr++;
+		inode->i_version++;
 	set_bit(PG_MAPPED, &req->wb_flags);
 	SetPagePrivate(req->wb_page);
 	set_page_private(req->wb_page, (unsigned long)req);
@@ -428,7 +428,6 @@ static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
 	__set_page_dirty_nobuffers(req->wb_page);
-	__mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -762,6 +761,8 @@ int nfs_updatepage(struct file *file, struct page *page,
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
 		nfs_set_pageerror(page);
+	else
+		__set_page_dirty_nobuffers(page);
 
 	dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
 		status, (long long)i_size_read(inode));
@@ -958,7 +959,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
 		if (!data)
 			goto out_bad;
 		data->pagevec[0] = page;
-		nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags);
+		nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
 		list_add(&data->list, res);
 		requests++;
 		nbytes -= len;
1010 req = nfs_list_entry(head->next); 1011 req = nfs_list_entry(head->next);
1011 nfs_list_remove_request(req); 1012 nfs_list_remove_request(req);
1012 nfs_list_add_request(req, &data->pages); 1013 nfs_list_add_request(req, &data->pages);
1013 ClearPageError(req->wb_page);
1014 *pages++ = req->wb_page; 1014 *pages++ = req->wb_page;
1015 } 1015 }
1016 req = nfs_list_entry(data->pages.next); 1016 req = nfs_list_entry(data->pages.next);
@@ -1165,7 +1165,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1165static void nfs_writeback_release_full(void *calldata) 1165static void nfs_writeback_release_full(void *calldata)
1166{ 1166{
1167 struct nfs_write_data *data = calldata; 1167 struct nfs_write_data *data = calldata;
1168 int status = data->task.tk_status; 1168 int ret, status = data->task.tk_status;
1169 struct nfs_pageio_descriptor pgio;
1170
1171 if (data->pnfs_error) {
1172 nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
1173 pgio.pg_recoalesce = 1;
1174 }
1169 1175
1170 /* Update attributes as result of writeback. */ 1176 /* Update attributes as result of writeback. */
1171 while (!list_empty(&data->pages)) { 1177 while (!list_empty(&data->pages)) {
@@ -1181,6 +1187,11 @@ static void nfs_writeback_release_full(void *calldata)
1181 req->wb_bytes, 1187 req->wb_bytes,
1182 (long long)req_offset(req)); 1188 (long long)req_offset(req));
1183 1189
1190 if (data->pnfs_error) {
1191 dprintk(", pnfs error = %d\n", data->pnfs_error);
1192 goto next;
1193 }
1194
1184 if (status < 0) { 1195 if (status < 0) {
1185 nfs_set_pageerror(page); 1196 nfs_set_pageerror(page);
1186 nfs_context_set_write_error(req->wb_context, status); 1197 nfs_context_set_write_error(req->wb_context, status);
@@ -1200,7 +1211,19 @@ remove_request:
1200 next: 1211 next:
1201 nfs_clear_page_tag_locked(req); 1212 nfs_clear_page_tag_locked(req);
1202 nfs_end_page_writeback(page); 1213 nfs_end_page_writeback(page);
1214 if (data->pnfs_error) {
1215 lock_page(page);
1216 nfs_pageio_cond_complete(&pgio, page->index);
1217 ret = nfs_page_async_flush(&pgio, page, 0);
1218 if (ret) {
1219 nfs_set_pageerror(page);
1220 dprintk("rewrite to MDS error = %d\n", ret);
1221 }
1222 unlock_page(page);
1223 }
1203 } 1224 }
1225 if (data->pnfs_error)
1226 nfs_pageio_complete(&pgio);
1204 nfs_writedata_release(calldata); 1227 nfs_writedata_release(calldata);
1205} 1228}
1206 1229
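When a pNFS write fails, the release path above no longer marks pages in error; it re-queues every page through a freshly initialised MDS pageio descriptor and lets the ordinary write path retry. The shape of that loop, sketched in userspace with invented stand-ins for the kernel helpers:

	#include <stdio.h>

	struct page { int index; };

	static void queue_for_mds(struct page *p)	/* ~ nfs_page_async_flush() */
	{
		printf("requeue page %d to MDS\n", p->index);
	}

	static void release_page(struct page *p)	/* ~ normal release path */
	{
		printf("release page %d\n", p->index);
	}

	int main(void)
	{
		struct page pages[3] = { {0}, {1}, {2} };
		int pnfs_error = -5;	/* pretend the layout driver failed */

		for (int i = 0; i < 3; i++) {
			if (!pnfs_error)
				release_page(&pages[i]);
			else
				queue_for_mds(&pages[i]);
		}
		if (pnfs_error)
			printf("complete MDS pageio (flush requeued pages)\n");
		return 0;
	}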
@@ -1281,7 +1304,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 			 */
 			argp->stable = NFS_FILE_SYNC;
 		}
-		nfs_restart_rpc(task, server->nfs_client);
+		rpc_restart_call_prepare(task);
 		return;
 	}
 	if (time_before(complain, jiffies)) {
@@ -1553,6 +1576,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
 	int flags = FLUSH_SYNC;
 	int ret = 0;
 
+	/* no commits means nothing needs to be done */
+	if (!nfsi->ncommit)
+		return ret;
+
 	if (wbc->sync_mode == WB_SYNC_NONE) {
 		/* Don't commit yet if this is a non-blocking flush and there
 		 * are a lot of outstanding writes for this mapping.
@@ -1686,34 +1713,20 @@ out_error:
 int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 		struct page *page)
 {
-	struct nfs_page *req;
-	int ret;
+	/*
+	 * If PagePrivate is set, then the page is currently associated with
+	 * an in-progress read or write request. Don't try to migrate it.
+	 *
+	 * FIXME: we could do this in principle, but we'll need a way to ensure
+	 *	that we can safely release the inode reference while holding
+	 *	the page lock.
+	 */
+	if (PagePrivate(page))
+		return -EBUSY;
 
 	nfs_fscache_release_page(page, GFP_KERNEL);
 
-	req = nfs_find_and_lock_request(page, false);
-	ret = PTR_ERR(req);
-	if (IS_ERR(req))
-		goto out;
-
-	ret = migrate_page(mapping, newpage, page);
-	if (!req)
-		goto out;
-	if (ret)
-		goto out_unlock;
-	page_cache_get(newpage);
-	spin_lock(&mapping->host->i_lock);
-	req->wb_page = newpage;
-	SetPagePrivate(newpage);
-	set_page_private(newpage, (unsigned long)req);
-	ClearPagePrivate(page);
-	set_page_private(page, 0);
-	spin_unlock(&mapping->host->i_lock);
-	page_cache_release(page);
-out_unlock:
-	nfs_clear_page_tag_locked(req);
-out:
-	return ret;
+	return migrate_page(mapping, newpage, page);
 }
 #endif
 
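The simplified nfs_migrate_page() trades the intricate request-transfer dance for a conservative test: if the page still carries a private request, refuse migration with -EBUSY and let it be retried later. A sketch of the pattern (the struct here is a toy, not the kernel's struct page):

	#include <stdio.h>
	#include <errno.h>

	struct page {
		int has_private;	/* ~ PagePrivate(): I/O request attached */
	};

	static int migrate(struct page *p)
	{
		/* An in-flight request pins the page; don't try to move it. */
		if (p->has_private)
			return -EBUSY;
		/* ...generic migration would happen here... */
		return 0;
	}

	int main(void)
	{
		struct page busy = { 1 }, idle = { 0 };
		printf("busy: %d, idle: %d\n", migrate(&busy), migrate(&idle));
		return 0;
	}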
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index f4cc1e2bfc54..62f3b9074e84 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -16,7 +16,6 @@
 #include <linux/module.h>
 #include <linux/exportfs.h>
 
-#include <linux/nfsd/syscall.h>
 #include <net/ipv6.h>
 
 #include "nfsd.h"
@@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref)
 	struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
 	path_put(&exp->ex_path);
 	auth_domain_put(exp->ex_client);
-	kfree(exp->ex_pathname);
 	nfsd4_fslocs_free(&exp->ex_fslocs);
 	kfree(exp);
 }
@@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	exp.ex_client = dom;
 
-	err = -ENOMEM;
-	exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
-	if (!exp.ex_pathname)
-		goto out2;
-
 	/* expiry */
 	err = -EINVAL;
 	exp.h.expiry_time = get_expiry(&mesg);
@@ -613,8 +606,6 @@ out4:
 	nfsd4_fslocs_free(&exp.ex_fslocs);
 	kfree(exp.ex_uuid);
 out3:
-	kfree(exp.ex_pathname);
-out2:
 	path_put(&exp.ex_path);
 out1:
 	auth_domain_put(dom);
@@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_client = item->ex_client;
 	new->ex_path.dentry = dget(item->ex_path.dentry);
 	new->ex_path.mnt = mntget(item->ex_path.mnt);
-	new->ex_pathname = NULL;
 	new->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = 0;
@@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_fsid = item->ex_fsid;
 	new->ex_uuid = item->ex_uuid;
 	item->ex_uuid = NULL;
-	new->ex_pathname = item->ex_pathname;
-	item->ex_pathname = NULL;
 	new->ex_fslocs.locations = item->ex_fslocs.locations;
 	item->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
@@ -1010,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
 	return exp;
 }
 
-static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
+struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp)
 {
 	u32 fsidv[2];
 
@@ -1030,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	struct svc_export *exp;
 	__be32 rv;
 
-	exp = find_fsidzero_export(rqstp);
+	exp = rqst_find_fsidzero_export(rqstp);
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
 	rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 02eb4edf0ece..7748d6a18d97 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,8 @@
 
 #define NFSDDBG_FACILITY		NFSDDBG_PROC
 
+static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
 
@@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
 	__be32 *p;
 
 	encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
-	encode_stateid4(xdr, &dp->dl_stateid);
+	encode_stateid4(xdr, &dp->dl_stid.sc_stateid);
 
 	p = xdr_reserve_space(xdr, 4);
 	*p++ = xdr_zero;			/* truncate */
@@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
 	 */
 	status = 0;
 out:
+	if (status)
+		nfsd4_mark_cb_fault(cb->cb_clp, status);
 	return status;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
 	warn_no_callback_path(clp, reason);
 }
 
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+{
+	clp->cl_cb_state = NFSD4_CB_FAULT;
+	warn_no_callback_path(clp, reason);
+}
+
 static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
@@ -787,7 +797,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_client;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
 	u32 minorversion = clp->cl_minorversion;
 
 	cb->cb_minorversion = minorversion;
@@ -809,7 +819,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_client;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
 
 	dprintk("%s: minorversion=%d\n", __func__,
 		clp->cl_minorversion);
@@ -832,7 +842,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_client;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
 	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 
 	nfsd4_cb_done(task, calldata);
@@ -1006,7 +1016,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfsd4_callback *cb = &dp->dl_recall;
-	struct nfs4_client *clp = dp->dl_client;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
 
 	dp->dl_retries = 1;
 	cb->cb_op = dp;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e80777666618..fa383361bc61 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -35,6 +35,7 @@
 #include <linux/file.h>
 #include <linux/slab.h>
 
+#include "idmap.h"
 #include "cache.h"
 #include "xdr4.h"
 #include "vfs.h"
@@ -156,6 +157,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
 	    !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
 		return nfserr_inval;
 
+	accmode |= NFSD_MAY_READ_IF_EXEC;
+
 	if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
 		accmode |= NFSD_MAY_READ;
 	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
@@ -168,12 +171,29 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
 	return status;
 }
 
+static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
+{
+	umode_t mode = fh->fh_dentry->d_inode->i_mode;
+
+	if (S_ISREG(mode))
+		return nfs_ok;
+	if (S_ISDIR(mode))
+		return nfserr_isdir;
+	/*
+	 * Using err_symlink as our catch-all case may look odd; but
+	 * there's no other obvious error for this case in 4.0, and we
+	 * happen to know that it will cause the linux v4 client to do
+	 * the right thing on attempts to open something other than a
+	 * regular file.
+	 */
+	return nfserr_symlink;
+}
+
 static __be32
 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
 	struct svc_fh resfh;
 	__be32 status;
-	int created = 0;
 
 	fh_init(&resfh, NFS4_FHSIZE);
 	open->op_truncate = 0;
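nfsd_check_obj_isreg() maps an inode mode to an NFSv4 error up front, rather than letting a later open fail in a less useful way. The same mode classification can be tried from userspace with the standard S_ISREG/S_ISDIR macros:

	#include <stdio.h>
	#include <sys/stat.h>

	/* Rough userspace analogue: classify a path the way the server would. */
	static const char *classify(const char *path)
	{
		struct stat st;

		if (stat(path, &st) != 0)
			return "stat failed";
		if (S_ISREG(st.st_mode))
			return "ok (regular file)";
		if (S_ISDIR(st.st_mode))
			return "isdir";
		return "symlink/other (catch-all error)";
	}

	int main(int argc, char **argv)
	{
		const char *path = argc > 1 ? argv[1] : "/";
		printf("%s: %s\n", path, classify(path));
		return 0;
	}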
@@ -202,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 					open->op_fname.len, &open->op_iattr,
 					&resfh, open->op_createmode,
 					(u32 *)open->op_verf.data,
-					&open->op_truncate, &created);
+					&open->op_truncate, &open->op_created);
 
 		/*
 		 * Following rfc 3530 14.2.16, use the returned bitmask
@@ -216,6 +236,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 		status = nfsd_lookup(rqstp, current_fh,
 				     open->op_fname.data, open->op_fname.len, &resfh);
 		fh_unlock(current_fh);
+		if (status)
+			goto out;
+		status = nfsd_check_obj_isreg(&resfh);
 	}
 	if (status)
 		goto out;
@@ -227,9 +250,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 	fh_dup2(current_fh, &resfh);
 
 	/* set reply cache */
-	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
 			&resfh.fh_handle);
-	if (!created)
+	if (!open->op_created)
 		status = do_open_permission(rqstp, current_fh, open,
 					    NFSD_MAY_NOP);
 
@@ -254,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
 
 	/* set replay cache */
-	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
 			&current_fh->fh_handle);
 
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
@@ -283,14 +306,18 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 status;
 	struct nfsd4_compoundres *resp;
 
-	dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
+	dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
 		(int)open->op_fname.len, open->op_fname.data,
-		open->op_stateowner);
+		open->op_openowner);
 
 	/* This check required by spec. */
 	if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
 		return nfserr_inval;
 
+	/* We don't yet support WANT bits: */
+	open->op_share_access &= NFS4_SHARE_ACCESS_MASK;
+
+	open->op_created = 0;
 	/*
 	 * RFC5661 18.51.3
 	 * Before RECLAIM_COMPLETE done, server should deny new lock
@@ -309,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	resp = rqstp->rq_resp;
 	status = nfsd4_process_open1(&resp->cstate, open);
 	if (status == nfserr_replay_me) {
-		struct nfs4_replay *rp = &open->op_stateowner->so_replay;
+		struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
 		fh_put(&cstate->current_fh);
 		fh_copy_shallow(&cstate->current_fh.fh_handle,
 				&rp->rp_openfh);
@@ -339,32 +366,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
 		case NFS4_OPEN_CLAIM_NULL:
-			/*
-			 * (1) set CURRENT_FH to the file being opened,
-			 * creating it if necessary, (2) set open->op_cinfo,
-			 * (3) set open->op_truncate if the file is to be
-			 * truncated after opening, (4) do permission checking.
-			 */
 			status = do_open_lookup(rqstp, &cstate->current_fh,
 						open);
 			if (status)
 				goto out;
 			break;
 		case NFS4_OPEN_CLAIM_PREVIOUS:
-			open->op_stateowner->so_confirmed = 1;
-			/*
-			 * The CURRENT_FH is already set to the file being
-			 * opened. (1) set open->op_cinfo, (2) set
-			 * open->op_truncate if the file is to be truncated
-			 * after opening, (3) do permission checking.
-			 */
+			open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+		case NFS4_OPEN_CLAIM_FH:
+		case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
 			status = do_open_fhandle(rqstp, &cstate->current_fh,
 						 open);
 			if (status)
 				goto out;
 			break;
+		case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
 		case NFS4_OPEN_CLAIM_DELEGATE_PREV:
-			open->op_stateowner->so_confirmed = 1;
+			open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
 			dprintk("NFSD: unsupported OPEN claim type %d\n",
 				open->op_claim_type);
 			status = nfserr_notsupp;
@@ -381,12 +399,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
 	 */
 	status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
+	WARN_ON(status && open->op_created);
 out:
-	if (open->op_stateowner) {
-		nfs4_get_stateowner(open->op_stateowner);
-		cstate->replay_owner = open->op_stateowner;
-	}
-	nfs4_unlock_state();
+	nfsd4_cleanup_open_state(open, status);
+	if (open->op_openowner)
+		cstate->replay_owner = &open->op_openowner->oo_owner;
+	else
+		nfs4_unlock_state();
 	return status;
 }
 
@@ -467,17 +486,12 @@ static __be32
 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	     struct nfsd4_commit *commit)
 {
-	__be32 status;
-
 	u32 *p = (u32 *)commit->co_verf.data;
 	*p++ = nfssvc_boot.tv_sec;
 	*p++ = nfssvc_boot.tv_usec;
 
-	status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
-			     commit->co_count);
-	if (status == nfserr_symlink)
-		status = nfserr_inval;
-	return status;
+	return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+			   commit->co_count);
 }
 
 static __be32
@@ -492,8 +506,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
 			   NFSD_MAY_CREATE);
-	if (status == nfserr_symlink)
-		status = nfserr_notdir;
 	if (status)
 		return status;
 
@@ -691,7 +703,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
 	readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
-	if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
+	if ((cookie == 1) || (cookie == 2) ||
 	    (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
 		return nfserr_bad_cookie;
 
@@ -719,8 +731,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return nfserr_grace;
 	status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
 			     remove->rm_name, remove->rm_namelen);
-	if (status == nfserr_symlink)
-		return nfserr_notdir;
 	if (!status) {
 		fh_unlock(&cstate->current_fh);
 		set_change_info(&remove->rm_cinfo, &cstate->current_fh);
@@ -751,8 +761,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) &&
 	     S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode)))
 		status = nfserr_exist;
-	else if (status == nfserr_symlink)
-		status = nfserr_notdir;
 
 	if (!status) {
 		set_change_info(&rename->rn_sinfo, &cstate->current_fh);
@@ -892,8 +900,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	write->wr_bytes_written = cnt;
 
-	if (status == nfserr_symlink)
-		status = nfserr_inval;
 	return status;
 }
 
@@ -930,7 +936,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	count = 4 + (verify->ve_attrlen >> 2);
 	buf = kmalloc(count << 2, GFP_KERNEL);
 	if (!buf)
-		return nfserr_resource;
+		return nfserr_jukebox;
 
 	status = nfsd4_encode_fattr(&cstate->current_fh,
 				    cstate->current_fh.fh_export,
@@ -994,6 +1000,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
 
 typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
 			      void *);
+typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op);
+
 enum nfsd4_op_flags {
 	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
 	ALLOWED_ON_ABSENT_FS = 1 << 1,	/* ops processed on absent fs */
@@ -1001,13 +1009,15 @@ enum nfsd4_op_flags {
 	/* For rfc 5661 section 2.6.3.1.1: */
 	OP_HANDLES_WRONGSEC = 1 << 3,
 	OP_IS_PUTFH_LIKE = 1 << 4,
-};
-
-struct nfsd4_operation {
-	nfsd4op_func op_func;
-	u32 op_flags;
-	char *op_name;
 	/*
+	 * These are the ops whose result size we estimate before
+	 * encoding, to avoid performing an op then not being able to
+	 * respond or cache a response. This includes writes and setattrs
+	 * as well as the operations usually called "nonidempotent":
+	 */
+	OP_MODIFIES_SOMETHING = 1 << 5,
+	/*
+	 * Cache compounds containing these ops in the xid-based drc:
 	 * We use the DRC for compounds containing non-idempotent
 	 * operations, *except* those that are 4.1-specific (since
 	 * sessions provide their own EOS), and except for stateful
@@ -1015,7 +1025,15 @@ struct nfsd4_operation {
 	 * (since sequence numbers provide EOS for open, lock, etc in
 	 * the v4.0 case).
 	 */
-	bool op_cacheresult;
+	OP_CACHEME = 1 << 6,
+};
+
+struct nfsd4_operation {
+	nfsd4op_func op_func;
+	u32 op_flags;
+	char *op_name;
+	/* Try to get response size before operation */
+	nfsd4op_rsize op_rsize_bop;
 };
 
 static struct nfsd4_operation nfsd4_ops[];
@@ -1062,7 +1080,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
 
 bool nfsd4_cache_this_op(struct nfsd4_op *op)
 {
-	return OPDESC(op)->op_cacheresult;
+	return OPDESC(op)->op_flags & OP_CACHEME;
 }
 
 static bool need_wrongsec_check(struct svc_rqst *rqstp)
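Folding op_cacheresult into the op_flags word lets every per-op boolean share one bitmask, so the table lookup becomes a plain AND. A compact, compilable sketch of the pattern (the flag names mirror the patch; the table contents are invented):

	#include <stdio.h>

	enum op_flags {
		OP_MODIFIES_SOMETHING = 1 << 5,
		OP_CACHEME = 1 << 6,
	};

	struct operation {
		const char *name;
		unsigned flags;
	};

	static const struct operation ops[] = {
		{ "OP_CREATE", OP_MODIFIES_SOMETHING | OP_CACHEME },
		{ "OP_CLOSE",  OP_MODIFIES_SOMETHING },
	};

	static int cache_this_op(const struct operation *op)
	{
		return op->flags & OP_CACHEME;	/* ~ nfsd4_cache_this_op() */
	}

	int main(void)
	{
		for (unsigned i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
			printf("%s: cache=%d\n", ops[i].name, !!cache_this_op(&ops[i]));
		return 0;
	}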
@@ -1110,6 +1128,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	struct nfsd4_operation *opdesc;
 	struct nfsd4_compound_state *cstate = &resp->cstate;
 	int		slack_bytes;
+	u32		plen = 0;
 	__be32		status;
 
 	resp->xbuf = &rqstp->rq_res;
@@ -1188,6 +1207,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 			goto encode_op;
 		}
 
+		/* If op is non-idempotent */
+		if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
+			plen = opdesc->op_rsize_bop(rqstp, op);
+			op->status = nfsd4_check_resp_size(resp, plen);
+		}
+
+		if (op->status)
+			goto encode_op;
+
 		if (opdesc->op_func)
 			op->status = opdesc->op_func(rqstp, cstate, &op->u);
 		else
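The compound loop now refuses to start a state-changing op whose worst-case reply might not fit in the remaining reply buffer; failing cleanly up front beats modifying state and then being unable to encode or cache the result. Stripped of the XDR details, the guard reduces to a budget check, sketched here with hypothetical sizes:

	#include <stdio.h>

	/* Hypothetical per-op worst-case reply size, in bytes. */
	static unsigned rsize_bound(int op) { return op == 0 ? 128 : 4096; }

	static int check_resp_size(unsigned used, unsigned cap, unsigned need)
	{
		return used + need <= cap ? 0 : -1;	/* ~ nfsd4_check_resp_size() */
	}

	int main(void)
	{
		unsigned cap = 4096, used = 512;

		for (int op = 0; op < 2; op++) {
			unsigned plen = rsize_bound(op);
			if (check_resp_size(used, cap, plen) != 0) {
				printf("op %d rejected: needs %u, %u left\n",
				       op, plen, cap - used);
				continue;	/* the op never executes */
			}
			used += plen;	/* worst case reserved before running the op */
			printf("op %d allowed, reserved %u\n", op, plen);
		}
		return 0;
	}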
@@ -1217,7 +1245,7 @@ encode_op:
1217 be32_to_cpu(status)); 1245 be32_to_cpu(status));
1218 1246
1219 if (cstate->replay_owner) { 1247 if (cstate->replay_owner) {
1220 nfs4_put_stateowner(cstate->replay_owner); 1248 nfs4_unlock_state();
1221 cstate->replay_owner = NULL; 1249 cstate->replay_owner = NULL;
1222 } 1250 }
1223 /* XXX Ugh, we need to get rid of this kind of special case: */ 1251 /* XXX Ugh, we need to get rid of this kind of special case: */
@@ -1238,6 +1266,144 @@ out:
1238 return status; 1266 return status;
1239} 1267}
1240 1268
1269#define op_encode_hdr_size (2)
1270#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
1271#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
1272#define op_encode_change_info_maxsz (5)
1273#define nfs4_fattr_bitmap_maxsz (4)
1274
1275#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
1276#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz)
1277
1278#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
1279
1280#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz)
1281#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \
1282 op_encode_ace_maxsz)
1283
1284#define op_encode_channel_attrs_maxsz (6 + 1 + 1)
1285
1286static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1287{
1288 return (op_encode_hdr_size) * sizeof(__be32);
1289}
1290
1291static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1292{
1293 return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
1294}
1295
+static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz
+		+ nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz)
+		* sizeof(__be32);
+}
+
+static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
+		* sizeof(__be32);
+}
+
+static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_stateid_maxsz
+		+ op_encode_change_info_maxsz + 1
+		+ nfs4_fattr_bitmap_maxsz
+		+ op_encode_delegation_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount = 0, rlen = 0;
+
+	maxcount = svc_max_payload(rqstp);
+	rlen = op->u.read.rd_length;
+
+	if (rlen > maxcount)
+		rlen = maxcount;
+
+	return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 rlen = op->u.readdir.rd_maxcount;
+
+	if (rlen > PAGE_SIZE)
+		rlen = PAGE_SIZE;
+
+	return (op_encode_hdr_size + op_encode_verifier_maxsz)
+		* sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz)
+		* sizeof(__be32);
+}
+
+static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz
+		+ op_encode_change_info_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */
+		1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */
+		2 + /* eir_server_owner.so_minor_id */
+		/* eir_server_owner.so_major_id<> */
+		XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +
+		/* eir_server_scope<> */
+		XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +
+		1 + /* eir_server_impl_id array length */
+		0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */
+		2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */
+		2 + /* csr_sequence, csr_flags */
+		op_encode_channel_attrs_maxsz +
+		op_encode_channel_attrs_maxsz) * sizeof(__be32);
+}
+
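All of these estimators follow one pattern: count the worst-case reply in 4-byte XDR words and multiply by sizeof(__be32) to get bytes; READ and READDIR then add the variable payload, clamped to svc_max_payload() and PAGE_SIZE respectively. A self-contained sketch of the arithmetic, with assumed constants (illustrative only; the real values are defined earlier in fs/nfsd/nfs4proc.c):

	/* Illustrative sketch only -- not part of this patch. */
	#include <stdint.h>
	#include <stdio.h>

	#define OP_ENCODE_HDR_SIZE	2	/* assumed: opcode + status words */
	#define LOCK_DENIED_MAXSZ	10	/* assumed worst-case LOCK4denied */

	int main(void)
	{
		/* Word count times the XDR word size, as in nfsd4_lock_rsize(): */
		uint32_t bytes = (OP_ENCODE_HDR_SIZE + LOCK_DENIED_MAXSZ)
					* sizeof(uint32_t);

		printf("worst-case encoded LOCK reply: %u bytes\n",
		       (unsigned)bytes);
		return 0;
	}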
 static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
@@ -1245,20 +1411,27 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_CLOSE] = {
 		.op_func = (nfsd4op_func)nfsd4_close,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_CLOSE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
 	},
 	[OP_COMMIT] = {
 		.op_func = (nfsd4op_func)nfsd4_commit,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_COMMIT",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize,
 	},
 	[OP_CREATE] = {
 		.op_func = (nfsd4op_func)nfsd4_create,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_CREATE",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize,
 	},
 	[OP_DELEGRETURN] = {
 		.op_func = (nfsd4op_func)nfsd4_delegreturn,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_DELEGRETURN",
+		.op_rsize_bop = nfsd4_only_status_rsize,
 	},
 	[OP_GETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_getattr,
@@ -1271,12 +1444,16 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_LINK] = {
 		.op_func = (nfsd4op_func)nfsd4_link,
+		.op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING
+				| OP_CACHEME,
 		.op_name = "OP_LINK",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize,
 	},
 	[OP_LOCK] = {
 		.op_func = (nfsd4op_func)nfsd4_lock,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_LOCK",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
 	},
 	[OP_LOCKT] = {
 		.op_func = (nfsd4op_func)nfsd4_lockt,
@@ -1284,7 +1461,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_LOCKU] = {
 		.op_func = (nfsd4op_func)nfsd4_locku,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_LOCKU",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
 	},
 	[OP_LOOKUP] = {
 		.op_func = (nfsd4op_func)nfsd4_lookup,
@@ -1302,42 +1481,54 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_OPEN] = {
 		.op_func = (nfsd4op_func)nfsd4_open,
-		.op_flags = OP_HANDLES_WRONGSEC,
+		.op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_OPEN",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize,
 	},
 	[OP_OPEN_CONFIRM] = {
 		.op_func = (nfsd4op_func)nfsd4_open_confirm,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_OPEN_CONFIRM",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
 	},
 	[OP_OPEN_DOWNGRADE] = {
 		.op_func = (nfsd4op_func)nfsd4_open_downgrade,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_OPEN_DOWNGRADE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
 	},
 	[OP_PUTFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
-				| OP_IS_PUTFH_LIKE,
+				| OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_PUTFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_PUTPUBFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putrootfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
-				| OP_IS_PUTFH_LIKE,
+				| OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_PUTPUBFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_PUTROOTFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putrootfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
-				| OP_IS_PUTFH_LIKE,
+				| OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_PUTROOTFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_READ] = {
 		.op_func = (nfsd4op_func)nfsd4_read,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_READ",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
 	},
 	[OP_READDIR] = {
 		.op_func = (nfsd4op_func)nfsd4_readdir,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_READDIR",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize,
 	},
 	[OP_READLINK] = {
 		.op_func = (nfsd4op_func)nfsd4_readlink,
@@ -1345,29 +1536,36 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_REMOVE] = {
 		.op_func = (nfsd4op_func)nfsd4_remove,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_REMOVE",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize,
 	},
 	[OP_RENAME] = {
-		.op_name = "OP_RENAME",
 		.op_func = (nfsd4op_func)nfsd4_rename,
-		.op_cacheresult = true,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_RENAME",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize,
 	},
 	[OP_RENEW] = {
 		.op_func = (nfsd4op_func)nfsd4_renew,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_RENEW",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_RESTOREFH] = {
 		.op_func = (nfsd4op_func)nfsd4_restorefh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
-				| OP_IS_PUTFH_LIKE,
+				| OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_RESTOREFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_SAVEFH] = {
 		.op_func = (nfsd4op_func)nfsd4_savefh,
-		.op_flags = OP_HANDLES_WRONGSEC,
+		.op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_SAVEFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_SECINFO] = {
 		.op_func = (nfsd4op_func)nfsd4_secinfo,
@@ -1377,19 +1575,22 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_SETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_setattr,
 		.op_name = "OP_SETATTR",
-		.op_cacheresult = true,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize,
 	},
 	[OP_SETCLIENTID] = {
 		.op_func = (nfsd4op_func)nfsd4_setclientid,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_SETCLIENTID",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize,
 	},
 	[OP_SETCLIENTID_CONFIRM] = {
 		.op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_SETCLIENTID_CONFIRM",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_VERIFY] = {
 		.op_func = (nfsd4op_func)nfsd4_verify,
@@ -1397,35 +1598,46 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_WRITE] = {
 		.op_func = (nfsd4op_func)nfsd4_write,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_WRITE",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
 	},
 	[OP_RELEASE_LOCKOWNER] = {
 		.op_func = (nfsd4op_func)nfsd4_release_lockowner,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_RELEASE_LOCKOWNER",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 
 	/* NFSv4.1 operations */
 	[OP_EXCHANGE_ID] = {
 		.op_func = (nfsd4op_func)nfsd4_exchange_id,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_EXCHANGE_ID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
 	},
 	[OP_BIND_CONN_TO_SESSION] = {
 		.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_BIND_CONN_TO_SESSION",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize,
 	},
 	[OP_CREATE_SESSION] = {
 		.op_func = (nfsd4op_func)nfsd4_create_session,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_CREATE_SESSION",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize,
 	},
 	[OP_DESTROY_SESSION] = {
 		.op_func = (nfsd4op_func)nfsd4_destroy_session,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_DESTROY_SESSION",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_SEQUENCE] = {
 		.op_func = (nfsd4op_func)nfsd4_sequence,
@@ -1433,14 +1645,17 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_name = "OP_SEQUENCE",
 	},
 	[OP_DESTROY_CLIENTID] = {
-		.op_func = NULL,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_func = (nfsd4op_func)nfsd4_destroy_clientid,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_DESTROY_CLIENTID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_RECLAIM_COMPLETE] = {
 		.op_func = (nfsd4op_func)nfsd4_reclaim_complete,
-		.op_flags = ALLOWED_WITHOUT_FH,
+		.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_RECLAIM_COMPLETE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_SECINFO_NO_NAME] = {
 		.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
@@ -1454,8 +1669,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_FREE_STATEID] = {
 		.op_func = (nfsd4op_func)nfsd4_free_stateid,
-		.op_flags = ALLOWED_WITHOUT_FH,
+		.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_FREE_STATEID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 };
 
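The table now pairs every state-modifying operation with a worst-case reply estimate. The consumer of op_rsize_bop is not part of this hunk; roughly, the COMPOUND processor can refuse to start an operation tagged OP_MODIFIES_SOMETHING when its worst-case reply would no longer fit in the reply buffer, instead of executing it and then failing to encode the result. A hedged sketch of that idea (hypothetical helper, not the in-tree code, which lives around nfsd4_proc_compound()):

	/* Hypothetical sketch; the real check differs in detail. */
	static bool nfsd4_reply_fits(u32 space_left, struct svc_rqst *rqstp,
				     struct nfsd4_op *op,
				     struct nfsd4_operation *opdesc)
	{
		if (!(opdesc->op_flags & OP_MODIFIES_SOMETHING))
			return true;	/* repeatable op: no hard guarantee needed */
		if (!opdesc->op_rsize_bop)
			return true;	/* no estimate available */
		return opdesc->op_rsize_bop(rqstp, op) <= space_left;
	}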
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 29d77f60585b..ed083b9a731b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -45,6 +45,7 @@
 
 /* Globals */
 static struct file *rec_file;
+static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 
 static int
 nfs4_save_creds(const struct cred **original_creds)
@@ -88,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
 	struct xdr_netobj cksum;
 	struct hash_desc desc;
 	struct scatterlist sg;
-	__be32 status = nfserr_resource;
+	__be32 status = nfserr_jukebox;
 
 	dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
 			clname->len, clname->data);
@@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	if (!rec_file || clp->cl_firststate)
 		return 0;
 
+	clp->cl_firststate = 1;
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
@@ -143,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		goto out_unlock;
 	}
 	status = -EEXIST;
-	if (dentry->d_inode) {
-		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
+	if (dentry->d_inode)
 		goto out_put;
-	}
 	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out_put;
@@ -156,12 +156,14 @@ out_put:
 	dput(dentry);
 out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
-	if (status == 0) {
-		clp->cl_firststate = 1;
+	if (status == 0)
 		vfs_fsync(rec_file, 0);
-	}
+	else
+		printk(KERN_ERR "NFSD: failed to write recovery record"
+				" (err %d); please check that %s exists"
+				" and is writeable\n", status,
+				user_recovery_dirname);
 	nfs4_reset_creds(original_cred);
-	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
 	return status;
 }
 
167 169
@@ -354,13 +356,13 @@ nfsd4_recdir_load(void) {
  */
 
 void
-nfsd4_init_recdir(char *rec_dirname)
+nfsd4_init_recdir(void)
 {
 	const struct cred *original_cred;
 	int status;
 
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
-			rec_dirname);
+			user_recovery_dirname);
 
 	BUG_ON(rec_file);
 
@@ -372,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname)
 		return;
 	}
 
-	rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+	rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
 	if (IS_ERR(rec_file)) {
 		printk("NFSD: unable to find recovery directory %s\n",
-				rec_dirname);
+				user_recovery_dirname);
 		rec_file = NULL;
 	}
 
@@ -390,3 +392,30 @@ nfsd4_shutdown_recdir(void)
 	fput(rec_file);
 	rec_file = NULL;
 }
+
+/*
+ * Change the NFSv4 recovery directory to recdir.
+ */
+int
+nfs4_reset_recoverydir(char *recdir)
+{
+	int status;
+	struct path path;
+
+	status = kern_path(recdir, LOOKUP_FOLLOW, &path);
+	if (status)
+		return status;
+	status = -ENOTDIR;
+	if (S_ISDIR(path.dentry->d_inode->i_mode)) {
+		strcpy(user_recovery_dirname, recdir);
+		status = 0;
+	}
+	path_put(&path);
+	return status;
+}
+
+char *
+nfs4_recoverydir(void)
+{
+	return user_recovery_dirname;
+}
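With the pathname now cached in user_recovery_dirname, nfs4_reset_recoverydir() lets an administrative interface repoint the recovery directory before the state system starts. Note that it strcpy()s into a PATH_MAX buffer, so callers must bound the input first; a sketch of the kind of control-file handler that would call it (hypothetical wrapper; the in-tree caller is the nfsctl write handler and differs in detail):

	/* Hypothetical wrapper, shown only to illustrate the length check
	 * the caller must provide before nfs4_reset_recoverydir() copies
	 * the string. */
	static int set_recoverydir(char *buf)
	{
		if (strlen(buf) >= PATH_MAX)
			return -EINVAL;
		return nfs4_reset_recoverydir(buf);
	}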
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3787ec117400..47e94e33a975 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,9 +49,6 @@
 time_t nfsd4_lease = 90;     /* default lease time */
 time_t nfsd4_grace = 90;
 static time_t boot_time;
-static u32 current_ownerid = 1;
-static u32 current_fileid = 1;
-static u32 current_delegid = 1;
 static stateid_t zerostateid;             /* bits all 0 */
 static stateid_t onestateid;              /* bits all 1 */
 static u64 current_sessionid = 1;
@@ -60,13 +57,7 @@ static u64 current_sessionid = 1;
 #define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
 
 /* forward declarations */
-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
-static struct nfs4_stateid * search_for_stateid(stateid_t *stid);
-static struct nfs4_delegation * search_for_delegation(stateid_t *stid);
-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
-static void nfs4_set_recdir(char *recdir);
-static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner);
+static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
 
 /* Locking: */
 
@@ -80,7 +71,8 @@ static DEFINE_MUTEX(client_mutex);
  */
 static DEFINE_SPINLOCK(recall_lock);
 
-static struct kmem_cache *stateowner_slab = NULL;
+static struct kmem_cache *openowner_slab = NULL;
+static struct kmem_cache *lockowner_slab = NULL;
 static struct kmem_cache *file_slab = NULL;
 static struct kmem_cache *stateid_slab = NULL;
 static struct kmem_cache *deleg_slab = NULL;
@@ -112,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes)
 
 static struct list_head del_recall_lru;
 
+static void nfsd4_free_file(struct nfs4_file *f)
+{
+	kmem_cache_free(file_slab, f);
+}
+
 static inline void
 put_nfs4_file(struct nfs4_file *fi)
 {
@@ -119,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi)
 		list_del(&fi->fi_hash);
 		spin_unlock(&recall_lock);
 		iput(fi->fi_inode);
-		kmem_cache_free(file_slab, fi);
+		nfsd4_free_file(fi);
 	}
 }
 
@@ -136,35 +133,33 @@ unsigned int max_delegations;
  * Open owner state (share locks)
  */
 
-/* hash tables for nfs4_stateowner */
-#define OWNER_HASH_BITS              8
-#define OWNER_HASH_SIZE             (1 << OWNER_HASH_BITS)
-#define OWNER_HASH_MASK             (OWNER_HASH_SIZE - 1)
+/* hash tables for open owners */
+#define OPEN_OWNER_HASH_BITS              8
+#define OPEN_OWNER_HASH_SIZE             (1 << OPEN_OWNER_HASH_BITS)
+#define OPEN_OWNER_HASH_MASK             (OPEN_OWNER_HASH_SIZE - 1)
 
-#define ownerid_hashval(id) \
-		((id) & OWNER_HASH_MASK)
-#define ownerstr_hashval(clientid, ownername) \
-		(((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
+static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
+{
+	unsigned int ret;
 
-static struct list_head	ownerid_hashtbl[OWNER_HASH_SIZE];
-static struct list_head	ownerstr_hashtbl[OWNER_HASH_SIZE];
+	ret = opaque_hashval(ownername->data, ownername->len);
+	ret += clientid;
+	return ret & OPEN_OWNER_HASH_MASK;
+}
+
+static struct list_head	open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE];
 
 /* hash table for nfs4_file */
 #define FILE_HASH_BITS                   8
 #define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
 
-/* hash table for (open)nfs4_stateid */
-#define STATEID_HASH_BITS              10
-#define STATEID_HASH_SIZE              (1 << STATEID_HASH_BITS)
-#define STATEID_HASH_MASK              (STATEID_HASH_SIZE - 1)
-
-#define file_hashval(x) \
-	hash_ptr(x, FILE_HASH_BITS)
-#define stateid_hashval(owner_id, file_id)  \
-	(((owner_id) + (file_id)) & STATEID_HASH_MASK)
+static unsigned int file_hashval(struct inode *ino)
+{
+	/* XXX: why are we hashing on inode pointer, anyway? */
+	return hash_ptr(ino, FILE_HASH_BITS);
+}
 
 static struct list_head file_hashtbl[FILE_HASH_SIZE];
-static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
 
 static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
 {
@@ -192,8 +187,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
 static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
 {
 	if (atomic_dec_and_test(&fp->fi_access[oflag])) {
-		nfs4_file_put_fd(fp, O_RDWR);
 		nfs4_file_put_fd(fp, oflag);
+		/*
+		 * It's also safe to get rid of the RDWR open *if*
+		 * we no longer have need of the other kind of access
+		 * or if we already have the other kind of open:
+		 */
+		if (fp->fi_fds[1 - oflag]
+			|| atomic_read(&fp->fi_access[1 - oflag]) == 0)
+			nfs4_file_put_fd(fp, O_RDWR);
 	}
 }
 
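A worked example of the new test (illustrative note, not part of the patch): suppose the last O_WRONLY reference is dropped, so oflag is O_WRONLY and 1 - oflag is O_RDONLY. The shared O_RDWR file can also be released if either fi_fds[O_RDONLY] is non-NULL (readers have their own descriptor) or fi_access[O_RDONLY] is zero (nobody needs read access at all); in both cases nothing relies on the O_RDWR descriptor any longer.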
@@ -206,8 +208,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
 		__nfs4_file_put_access(fp, oflag);
 }
 
+static inline int get_new_stid(struct nfs4_stid *stid)
+{
+	static int min_stateid = 0;
+	struct idr *stateids = &stid->sc_client->cl_stateids;
+	int new_stid;
+	int error;
+
+	error = idr_get_new_above(stateids, stid, min_stateid, &new_stid);
+	/*
+	 * Note: the necessary preallocation was done in
+	 * nfs4_alloc_stateid().  The idr code caps the number of
+	 * preallocations that can exist at a time, but the state lock
+	 * prevents anyone from using ours before we get here:
+	 */
+	BUG_ON(error);
+	/*
+	 * It shouldn't be a problem to reuse an opaque stateid value.
+	 * I don't think it is for 4.1.  But with 4.0 I worry that, for
+	 * example, a stray write retransmission could be accepted by
+	 * the server when it should have been rejected.  Therefore,
+	 * adopt a trick from the sctp code to attempt to maximize the
+	 * amount of time until an id is reused, by ensuring they always
+	 * "increase" (mod INT_MAX):
+	 */
+	min_stateid = new_stid + 1;
+	if (min_stateid == INT_MAX)
+		min_stateid = 0;
+	return new_stid;
+}
+
+static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type)
+{
+	stateid_t *s = &stid->sc_stateid;
+	int new_id;
+
+	stid->sc_type = type;
+	stid->sc_client = cl;
+	s->si_opaque.so_clid = cl->cl_clientid;
+	new_id = get_new_stid(stid);
+	s->si_opaque.so_id = (u32)new_id;
+	/* Will be incremented before return to client: */
+	s->si_generation = 0;
+}
+
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab)
+{
+	struct idr *stateids = &cl->cl_stateids;
+
+	if (!idr_pre_get(stateids, GFP_KERNEL))
+		return NULL;
+	/*
+	 * Note: if we fail here (or any time between now and the time
+	 * we actually get the new idr), we won't need to undo the idr
+	 * preallocation, since the idr code caps the number of
+	 * preallocated entries.
+	 */
+	return kmem_cache_alloc(slab, GFP_KERNEL);
+}
+
+static struct nfs4_ol_stateid *nfs4_alloc_stateid(struct nfs4_client *clp)
+{
+	return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
+}
+
 static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type)
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_file *fp = stp->st_file;
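The wraparound hint in get_new_stid() is independent of the idr machinery, so it can be shown in isolation; a minimal self-contained sketch of the same "always increase (mod INT_MAX)" trick:

	/* Illustrative sketch only -- not part of this patch. */
	#include <limits.h>

	static int min_stateid;	/* next search starts at least here */

	/* Record that 'id' was just handed out and advance the floor so a
	 * freed id is not reused until the id space wraps all the way
	 * around. */
	static void advance_stateid_floor(int id)
	{
		min_stateid = id + 1;
		if (min_stateid == INT_MAX)
			min_stateid = 0;
	}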
@@ -224,21 +291,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 		return NULL;
 	if (num_delegations > max_delegations)
 		return NULL;
-	dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
+	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
 	if (dp == NULL)
 		return dp;
+	init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID);
+	/*
+	 * delegation seqid's are never incremented.  The 4.1 special
+	 * meaning of seqid 0 isn't meaningful, really, but let's avoid
+	 * 0 anyway just for consistency and use 1:
+	 */
+	dp->dl_stid.sc_stateid.si_generation = 1;
 	num_delegations++;
 	INIT_LIST_HEAD(&dp->dl_perfile);
 	INIT_LIST_HEAD(&dp->dl_perclnt);
 	INIT_LIST_HEAD(&dp->dl_recall_lru);
-	dp->dl_client = clp;
 	get_nfs4_file(fp);
 	dp->dl_file = fp;
 	dp->dl_type = type;
-	dp->dl_stateid.si_boot = boot_time;
-	dp->dl_stateid.si_stateownerid = current_delegid++;
-	dp->dl_stateid.si_fileid = 0;
-	dp->dl_stateid.si_generation = 0;
 	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
@@ -267,10 +336,18 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 	}
 }
 
+static void unhash_stid(struct nfs4_stid *s)
+{
+	struct idr *stateids = &s->sc_client->cl_stateids;
+
+	idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+}
+
 /* Called under the state lock. */
 static void
 unhash_delegation(struct nfs4_delegation *dp)
 {
+	unhash_stid(&dp->dl_stid);
 	list_del_init(&dp->dl_perclnt);
 	spin_lock(&recall_lock);
 	list_del_init(&dp->dl_perfile);
@@ -292,10 +369,16 @@ static DEFINE_SPINLOCK(client_lock);
 #define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
 #define CLIENT_HASH_MASK                (CLIENT_HASH_SIZE - 1)
 
-#define clientid_hashval(id) \
-	((id) & CLIENT_HASH_MASK)
-#define clientstr_hashval(name) \
-	(opaque_hashval((name), 8) & CLIENT_HASH_MASK)
+static unsigned int clientid_hashval(u32 id)
+{
+	return id & CLIENT_HASH_MASK;
+}
+
+static unsigned int clientstr_hashval(const char *name)
+{
+	return opaque_hashval(name, 8) & CLIENT_HASH_MASK;
+}
+
 /*
  * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
  * used in reboot/reset lease grace period processing
@@ -362,7 +445,7 @@ set_deny(unsigned int *deny, unsigned long bmap) {
 }
 
 static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
+test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
 	unsigned int access, deny;
 
 	set_access(&access, stp->st_access_bmap);
@@ -385,14 +468,13 @@ static int nfs4_access_to_omode(u32 access)
 	BUG();
 }
 
-static void unhash_generic_stateid(struct nfs4_stateid *stp)
+static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
 {
-	list_del(&stp->st_hash);
 	list_del(&stp->st_perfile);
 	list_del(&stp->st_perstateowner);
 }
 
-static void free_generic_stateid(struct nfs4_stateid *stp)
+static void close_generic_stateid(struct nfs4_ol_stateid *stp)
 {
 	int i;
 
@@ -401,84 +483,106 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
 		if (test_bit(i, &stp->st_access_bmap))
 			nfs4_file_put_access(stp->st_file,
 					nfs4_access_to_omode(i));
+		__clear_bit(i, &stp->st_access_bmap);
 		}
 	}
 	put_nfs4_file(stp->st_file);
+	stp->st_file = NULL;
+}
+
+static void free_generic_stateid(struct nfs4_ol_stateid *stp)
+{
 	kmem_cache_free(stateid_slab, stp);
 }
 
-static void release_lock_stateid(struct nfs4_stateid *stp)
+static void release_lock_stateid(struct nfs4_ol_stateid *stp)
 {
 	struct file *file;
 
 	unhash_generic_stateid(stp);
+	unhash_stid(&stp->st_stid);
 	file = find_any_file(stp->st_file);
 	if (file)
-		locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
+		locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
+	close_generic_stateid(stp);
 	free_generic_stateid(stp);
 }
 
-static void unhash_lockowner(struct nfs4_stateowner *sop)
+static void unhash_lockowner(struct nfs4_lockowner *lo)
 {
-	struct nfs4_stateid *stp;
+	struct nfs4_ol_stateid *stp;
 
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	list_del(&sop->so_perstateid);
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_first_entry(&sop->so_stateids,
-				struct nfs4_stateid, st_perstateowner);
+	list_del(&lo->lo_owner.so_strhash);
+	list_del(&lo->lo_perstateid);
+	while (!list_empty(&lo->lo_owner.so_stateids)) {
+		stp = list_first_entry(&lo->lo_owner.so_stateids,
+				struct nfs4_ol_stateid, st_perstateowner);
 		release_lock_stateid(stp);
 	}
 }
 
-static void release_lockowner(struct nfs4_stateowner *sop)
+static void release_lockowner(struct nfs4_lockowner *lo)
 {
-	unhash_lockowner(sop);
-	nfs4_put_stateowner(sop);
+	unhash_lockowner(lo);
+	nfs4_free_lockowner(lo);
 }
 
 static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
+release_stateid_lockowners(struct nfs4_ol_stateid *open_stp)
 {
-	struct nfs4_stateowner *lock_sop;
+	struct nfs4_lockowner *lo;
 
 	while (!list_empty(&open_stp->st_lockowners)) {
-		lock_sop = list_entry(open_stp->st_lockowners.next,
-				struct nfs4_stateowner, so_perstateid);
-		/* list_del(&open_stp->st_lockowners);  */
-		BUG_ON(lock_sop->so_is_open_owner);
-		release_lockowner(lock_sop);
+		lo = list_entry(open_stp->st_lockowners.next,
+				struct nfs4_lockowner, lo_perstateid);
+		release_lockowner(lo);
 	}
 }
 
-static void release_open_stateid(struct nfs4_stateid *stp)
+static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
 {
 	unhash_generic_stateid(stp);
 	release_stateid_lockowners(stp);
+	close_generic_stateid(stp);
+}
+
+static void release_open_stateid(struct nfs4_ol_stateid *stp)
+{
+	unhash_open_stateid(stp);
+	unhash_stid(&stp->st_stid);
 	free_generic_stateid(stp);
 }
 
-static void unhash_openowner(struct nfs4_stateowner *sop)
+static void unhash_openowner(struct nfs4_openowner *oo)
 {
-	struct nfs4_stateid *stp;
+	struct nfs4_ol_stateid *stp;
 
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	list_del(&sop->so_perclient);
-	list_del(&sop->so_perstateid); /* XXX: necessary? */
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_first_entry(&sop->so_stateids,
-				struct nfs4_stateid, st_perstateowner);
+	list_del(&oo->oo_owner.so_strhash);
+	list_del(&oo->oo_perclient);
+	while (!list_empty(&oo->oo_owner.so_stateids)) {
+		stp = list_first_entry(&oo->oo_owner.so_stateids,
+				struct nfs4_ol_stateid, st_perstateowner);
 		release_open_stateid(stp);
 	}
 }
 
-static void release_openowner(struct nfs4_stateowner *sop)
+static void release_last_closed_stateid(struct nfs4_openowner *oo)
 {
-	unhash_openowner(sop);
-	list_del(&sop->so_close_lru);
-	nfs4_put_stateowner(sop);
+	struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
+
+	if (s) {
+		unhash_stid(&s->st_stid);
+		free_generic_stateid(s);
+		oo->oo_last_closed_stid = NULL;
+	}
+}
+
+static void release_openowner(struct nfs4_openowner *oo)
+{
+	unhash_openowner(oo);
+	list_del(&oo->oo_close_lru);
+	release_last_closed_stateid(oo);
+	nfs4_free_openowner(oo);
 }
 
 #define SESSION_HASH_SIZE	512
@@ -843,9 +947,6 @@ renew_client_locked(struct nfs4_client *clp)
 		return;
 	}
 
-	/*
-	 * Move client to the end to the LRU list.
-	 */
 	dprintk("renewing client (clientid %08x/%08x)\n",
 			clp->cl_clientid.cl_boot,
 			clp->cl_clientid.cl_id);
@@ -943,7 +1044,7 @@ unhash_client_locked(struct nfs4_client *clp)
 static void
 expire_client(struct nfs4_client *clp)
 {
-	struct nfs4_stateowner *sop;
+	struct nfs4_openowner *oo;
 	struct nfs4_delegation *dp;
 	struct list_head reaplist;
 
@@ -961,8 +1062,8 @@ expire_client(struct nfs4_client *clp)
 		unhash_delegation(dp);
 	}
 	while (!list_empty(&clp->cl_openowners)) {
-		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
-		release_openowner(sop);
+		oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
+		release_openowner(oo);
 	}
 	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
@@ -1038,6 +1139,23 @@ static void gen_confirm(struct nfs4_client *clp)
 		*p++ = i++;
 }
 
+static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
+{
+	return idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+}
+
+static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+{
+	struct nfs4_stid *s;
+
+	s = find_stateid(cl, t);
+	if (!s)
+		return NULL;
+	if (typemask & s->sc_type)
+		return s;
+	return NULL;
+}
+
 static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 		struct svc_rqst *rqstp, nfs4_verifier *verf)
 {
@@ -1060,6 +1178,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 		}
 	}
 
+	idr_init(&clp->cl_stateids);
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
 	atomic_set(&clp->cl_refcount, 0);
 	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
@@ -1083,17 +1202,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	return clp;
 }
 
-static int check_name(struct xdr_netobj name)
-{
-	if (name.len == 0)
-		return 0;
-	if (name.len > NFS4_OPAQUE_LIMIT) {
-		dprintk("NFSD: check_name: name too long(%d)!\n", name.len);
-		return 0;
-	}
-	return 1;
-}
-
 static void
 add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
 {
@@ -1125,8 +1233,10 @@ find_confirmed_client(clientid_t *clid)
 	unsigned int idhashval = clientid_hashval(clid->cl_id);
 
 	list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
-		if (same_clid(&clp->cl_clientid, clid))
+		if (same_clid(&clp->cl_clientid, clid)) {
+			renew_client(clp);
 			return clp;
+		}
 	}
 	return NULL;
 }
@@ -1173,20 +1283,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
 	return NULL;
 }
 
-static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
-{
-	switch (family) {
-	case AF_INET:
-		((struct sockaddr_in *)sa)->sin_family = AF_INET;
-		((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
-		return;
-	case AF_INET6:
-		((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
-		((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
-		return;
-	}
-}
-
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
 {
@@ -1218,7 +1314,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
 
 	conn->cb_prog = se->se_callback_prog;
 	conn->cb_ident = se->se_callback_ident;
-	rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
+	memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen);
 	return;
 out_err:
 	conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1350,7 +1446,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
 		addr_str, exid->flags, exid->spa_how);
 
-	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+	if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
 		return nfserr_inval;
 
 	/* Currently only support SP4_NONE */
@@ -1849,8 +1945,16 @@ out:
 
 		nfsd4_get_session(cstate->session);
 		atomic_inc(&clp->cl_refcount);
-		if (clp->cl_cb_state == NFSD4_CB_DOWN)
-			seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
+		switch (clp->cl_cb_state) {
+		case NFSD4_CB_DOWN:
+			seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+			break;
+		case NFSD4_CB_FAULT:
+			seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+			break;
+		default:
+			seq->status_flags = 0;
+		}
 	}
 	kfree(conn);
 	spin_unlock(&client_lock);
@@ -1858,6 +1962,50 @@ out:
 	return status;
 }
 
+static inline bool has_resources(struct nfs4_client *clp)
+{
+	return !list_empty(&clp->cl_openowners)
+		|| !list_empty(&clp->cl_delegations)
+		|| !list_empty(&clp->cl_sessions);
+}
+
+__be32
+nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
+{
+	struct nfs4_client *conf, *unconf, *clp;
+	__be32 status = 0;
+
+	nfs4_lock_state();
+	unconf = find_unconfirmed_client(&dc->clientid);
+	conf = find_confirmed_client(&dc->clientid);
+
+	if (conf) {
+		clp = conf;
+
+		if (!is_client_expired(conf) && has_resources(conf)) {
+			status = nfserr_clientid_busy;
+			goto out;
+		}
+
+		/* rfc5661 18.50.3 */
+		if (cstate->session && conf == cstate->session->se_client) {
+			status = nfserr_clientid_busy;
+			goto out;
+		}
+	} else if (unconf)
+		clp = unconf;
+	else {
+		status = nfserr_stale_clientid;
+		goto out;
+	}
+
+	expire_client(clp);
+out:
+	nfs4_unlock_state();
+	dprintk("%s return %d\n", __func__, ntohl(status));
+	return status;
+}
+
 __be32
 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
 {
@@ -1900,19 +2048,13 @@ __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
-	struct xdr_netobj	clname = {
-		.len = setclid->se_namelen,
-		.data = setclid->se_name,
-	};
+	struct xdr_netobj	clname = setclid->se_name;
 	nfs4_verifier		clverifier = setclid->se_verf;
 	unsigned int		strhashval;
 	struct nfs4_client	*conf, *unconf, *new;
 	__be32			status;
 	char			dname[HEXDIR_LEN];
 
-	if (!check_name(clname))
-		return nfserr_inval;
-
 	status = nfs4_make_rec_clidname(dname, &clname);
 	if (status)
 		return status;
@@ -1946,7 +2088,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
 	 */
 	unconf = find_unconfirmed_client_by_str(dname, strhashval);
-	status = nfserr_resource;
+	status = nfserr_jukebox;
 	if (!conf) {
 		/*
 		 * RFC 3530 14.2.33 CASE 4:
@@ -2116,31 +2258,28 @@ out:
 	return status;
 }
 
+static struct nfs4_file *nfsd4_alloc_file(void)
+{
+	return kmem_cache_alloc(file_slab, GFP_KERNEL);
+}
+
 /* OPEN Share state helper functions */
-static inline struct nfs4_file *
-alloc_init_file(struct inode *ino)
+static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
 {
-	struct nfs4_file *fp;
 	unsigned int hashval = file_hashval(ino);
 
-	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
-	if (fp) {
-		atomic_set(&fp->fi_ref, 1);
-		INIT_LIST_HEAD(&fp->fi_hash);
-		INIT_LIST_HEAD(&fp->fi_stateids);
-		INIT_LIST_HEAD(&fp->fi_delegations);
-		fp->fi_inode = igrab(ino);
-		fp->fi_id = current_fileid++;
-		fp->fi_had_conflict = false;
-		fp->fi_lease = NULL;
-		memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
-		memset(fp->fi_access, 0, sizeof(fp->fi_access));
-		spin_lock(&recall_lock);
-		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
-		spin_unlock(&recall_lock);
-		return fp;
-	}
-	return NULL;
+	atomic_set(&fp->fi_ref, 1);
+	INIT_LIST_HEAD(&fp->fi_hash);
+	INIT_LIST_HEAD(&fp->fi_stateids);
+	INIT_LIST_HEAD(&fp->fi_delegations);
+	fp->fi_inode = igrab(ino);
+	fp->fi_had_conflict = false;
+	fp->fi_lease = NULL;
+	memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+	memset(fp->fi_access, 0, sizeof(fp->fi_access));
+	spin_lock(&recall_lock);
+	list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+	spin_unlock(&recall_lock);
 }
 
 static void
@@ -2155,7 +2294,8 @@ nfsd4_free_slab(struct kmem_cache **slab)
 void
 nfsd4_free_slabs(void)
 {
-	nfsd4_free_slab(&stateowner_slab);
+	nfsd4_free_slab(&openowner_slab);
+	nfsd4_free_slab(&lockowner_slab);
 	nfsd4_free_slab(&file_slab);
 	nfsd4_free_slab(&stateid_slab);
 	nfsd4_free_slab(&deleg_slab);
@@ -2164,16 +2304,20 @@ nfsd4_free_slabs(void)
 static int
 nfsd4_init_slabs(void)
 {
-	stateowner_slab = kmem_cache_create("nfsd4_stateowners",
-			sizeof(struct nfs4_stateowner), 0, 0, NULL);
-	if (stateowner_slab == NULL)
+	openowner_slab = kmem_cache_create("nfsd4_openowners",
+			sizeof(struct nfs4_openowner), 0, 0, NULL);
+	if (openowner_slab == NULL)
+		goto out_nomem;
+	lockowner_slab = kmem_cache_create("nfsd4_lockowners",
+			sizeof(struct nfs4_lockowner), 0, 0, NULL);
+	if (lockowner_slab == NULL)
 		goto out_nomem;
 	file_slab = kmem_cache_create("nfsd4_files",
 			sizeof(struct nfs4_file), 0, 0, NULL);
 	if (file_slab == NULL)
 		goto out_nomem;
 	stateid_slab = kmem_cache_create("nfsd4_stateids",
-			sizeof(struct nfs4_stateid), 0, 0, NULL);
+			sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
 	if (stateid_slab == NULL)
 		goto out_nomem;
 	deleg_slab = kmem_cache_create("nfsd4_delegations",
@@ -2187,97 +2331,94 @@ out_nomem:
 	return -ENOMEM;
 }
 
-void
-nfs4_free_stateowner(struct kref *kref)
+void nfs4_free_openowner(struct nfs4_openowner *oo)
 {
-	struct nfs4_stateowner *sop =
-		container_of(kref, struct nfs4_stateowner, so_ref);
-	kfree(sop->so_owner.data);
-	kmem_cache_free(stateowner_slab, sop);
+	kfree(oo->oo_owner.so_owner.data);
+	kmem_cache_free(openowner_slab, oo);
 }
 
-static inline struct nfs4_stateowner *
-alloc_stateowner(struct xdr_netobj *owner)
+void nfs4_free_lockowner(struct nfs4_lockowner *lo)
 {
-	struct nfs4_stateowner *sop;
+	kfree(lo->lo_owner.so_owner.data);
+	kmem_cache_free(lockowner_slab, lo);
+}
 
-	if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) {
-		if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) {
-			memcpy(sop->so_owner.data, owner->data, owner->len);
-			sop->so_owner.len = owner->len;
-			kref_init(&sop->so_ref);
-			return sop;
-		}
-		kmem_cache_free(stateowner_slab, sop);
-	}
-	return NULL;
+static void init_nfs4_replay(struct nfs4_replay *rp)
+{
+	rp->rp_status = nfserr_serverfault;
+	rp->rp_buflen = 0;
+	rp->rp_buf = rp->rp_ibuf;
 }
 
-static struct nfs4_stateowner *
-alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
+static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
+{
 	struct nfs4_stateowner *sop;
-	struct nfs4_replay *rp;
-	unsigned int idhashval;
 
-	if (!(sop = alloc_stateowner(&open->op_owner)))
+	sop = kmem_cache_alloc(slab, GFP_KERNEL);
+	if (!sop)
+		return NULL;
+
+	sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL);
+	if (!sop->so_owner.data) {
+		kmem_cache_free(slab, sop);
 		return NULL;
-	idhashval = ownerid_hashval(current_ownerid);
-	INIT_LIST_HEAD(&sop->so_idhash);
-	INIT_LIST_HEAD(&sop->so_strhash);
-	INIT_LIST_HEAD(&sop->so_perclient);
+	}
+	sop->so_owner.len = owner->len;
+
 	INIT_LIST_HEAD(&sop->so_stateids);
-	INIT_LIST_HEAD(&sop->so_perstateid);  /* not used */
-	INIT_LIST_HEAD(&sop->so_close_lru);
-	sop->so_time = 0;
-	list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]);
-	list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]);
-	list_add(&sop->so_perclient, &clp->cl_openowners);
-	sop->so_is_open_owner = 1;
-	sop->so_id = current_ownerid++;
 	sop->so_client = clp;
-	sop->so_seqid = open->op_seqid;
-	sop->so_confirmed = 0;
-	rp = &sop->so_replay;
-	rp->rp_status = nfserr_serverfault;
-	rp->rp_buflen = 0;
-	rp->rp_buf = rp->rp_ibuf;
+	init_nfs4_replay(&sop->so_replay);
 	return sop;
 }
 
-static inline void
-init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
-	struct nfs4_stateowner *sop = open->op_stateowner;
-	unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id);
+static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
+{
+	list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]);
+	list_add(&oo->oo_perclient, &clp->cl_openowners);
+}
 
-	INIT_LIST_HEAD(&stp->st_hash);
-	INIT_LIST_HEAD(&stp->st_perstateowner);
+static struct nfs4_openowner *
+alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
+	struct nfs4_openowner *oo;
+
+	oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
+	if (!oo)
+		return NULL;
+	oo->oo_owner.so_is_open_owner = 1;
+	oo->oo_owner.so_seqid = open->op_seqid;
+	oo->oo_flags = NFS4_OO_NEW;
+	oo->oo_time = 0;
+	oo->oo_last_closed_stid = NULL;
+	INIT_LIST_HEAD(&oo->oo_close_lru);
+	hash_openowner(oo, clp, strhashval);
+	return oo;
+}
+
+static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+	struct nfs4_openowner *oo = open->op_openowner;
+	struct nfs4_client *clp = oo->oo_owner.so_client;
+
+	init_stid(&stp->st_stid, clp, NFS4_OPEN_STID);
 	INIT_LIST_HEAD(&stp->st_lockowners);
-	INIT_LIST_HEAD(&stp->st_perfile);
-	list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
-	list_add(&stp->st_perstateowner, &sop->so_stateids);
+	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
 	list_add(&stp->st_perfile, &fp->fi_stateids);
-	stp->st_stateowner = sop;
+	stp->st_stateowner = &oo->oo_owner;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = boot_time;
-	stp->st_stateid.si_stateownerid = sop->so_id;
-	stp->st_stateid.si_fileid = fp->fi_id;
-	stp->st_stateid.si_generation = 0;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
-	__set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
-		  &stp->st_access_bmap);
+	__set_bit(open->op_share_access, &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 	stp->st_openstp = NULL;
 }
 
 static void
-move_to_close_lru(struct nfs4_stateowner *sop)
+move_to_close_lru(struct nfs4_openowner *oo)
 {
-	dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
+	dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
 
-	list_move_tail(&sop->so_close_lru, &close_lru);
-	sop->so_time = get_seconds();
+	list_move_tail(&oo->oo_close_lru, &close_lru);
+	oo->oo_time = get_seconds();
 }
 
 static int
@@ -2289,14 +2430,18 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
 		(sop->so_client->cl_clientid.cl_id == clid->cl_id);
 }
 
-static struct nfs4_stateowner *
+static struct nfs4_openowner *
 find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
 {
-	struct nfs4_stateowner *so = NULL;
+	struct nfs4_stateowner *so;
+	struct nfs4_openowner *oo;
 
-	list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
-		if (same_owner_str(so, &open->op_owner, &open->op_clientid))
-			return so;
+	list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) {
+		if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
+			oo = openowner(so);
+			renew_client(oo->oo_owner.so_client);
+			return oo;
+		}
 	}
 	return NULL;
 }
@@ -2320,31 +2465,6 @@ find_file(struct inode *ino)
 	return NULL;
 }
 
-static inline int access_valid(u32 x, u32 minorversion)
-{
-	if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
-		return 0;
-	if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
-		return 0;
-	x &= ~NFS4_SHARE_ACCESS_MASK;
-	if (minorversion && x) {
-		if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
-			return 0;
-		if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
-			return 0;
-		x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
-	}
-	if (x)
-		return 0;
-	return 1;
-}
-
-static inline int deny_valid(u32 x)
-{
-	/* Note: unlike access bits, deny bits may be zero. */
-	return x <= NFS4_SHARE_DENY_BOTH;
-}
-
 /*
  * Called to check deny when READ with all zero stateid or
  * WRITE with all zero or all one stateid
@@ -2354,7 +2474,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 {
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_file *fp;
-	struct nfs4_stateid *stp;
+	struct nfs4_ol_stateid *stp;
 	__be32 ret;
 
 	dprintk("NFSD: nfs4_share_conflict\n");
@@ -2429,6 +2549,16 @@ static const struct lock_manager_operations nfsd_lease_mng_ops = {
 	.lm_change = nfsd_change_deleg_cb,
 };
 
+static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid)
+{
+	if (nfsd4_has_session(cstate))
+		return nfs_ok;
+	if (seqid == so->so_seqid - 1)
+		return nfserr_replay_me;
+	if (seqid == so->so_seqid)
+		return nfs_ok;
+	return nfserr_bad_seqid;
+}
 
 __be32
 nfsd4_process_open1(struct nfsd4_compound_state *cstate,
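The nfsd4_check_seqid() helper introduced above centralizes the NFSv4.0 replay-window rule that the old open path spelled out inline: a seqid equal to the owner's current so_seqid is the next operation in sequence, one less than it is a retransmission to be answered from the replay cache, and anything else is nfserr_bad_seqid (sessions skip the check entirely). A standalone sketch of the same three-way decision, with made-up values for illustration (not part of the patch):

    /* Standalone sketch of the replay-window rule, illustrative only. */
    #include <stdio.h>

    enum seq_result { SEQ_IN_ORDER, SEQ_REPLAY, SEQ_BAD };

    static enum seq_result classify_seqid(unsigned int so_seqid, unsigned int seqid)
    {
            if (seqid == so_seqid)
                    return SEQ_IN_ORDER;    /* next operation in sequence */
            if (seqid == so_seqid - 1)
                    return SEQ_REPLAY;      /* retransmission of the last op */
            return SEQ_BAD;                 /* anything else: protocol error */
    }

    int main(void)
    {
            /* With so_seqid == 7: 7 is in order, 6 is a replay, 5 is bad. */
            printf("%d %d %d\n", classify_seqid(7, 7), classify_seqid(7, 6),
                   classify_seqid(7, 5));
            return 0;
    }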
@@ -2437,57 +2567,49 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
 	clientid_t *clientid = &open->op_clientid;
 	struct nfs4_client *clp = NULL;
 	unsigned int strhashval;
-	struct nfs4_stateowner *sop = NULL;
+	struct nfs4_openowner *oo = NULL;
+	__be32 status;
 
-	if (!check_name(open->op_owner))
-		return nfserr_inval;
-
 	if (STALE_CLIENTID(&open->op_clientid))
 		return nfserr_stale_clientid;
+	/*
+	 * In case we need it later, after we've already created the
+	 * file and don't want to risk a further failure:
+	 */
+	open->op_file = nfsd4_alloc_file();
+	if (open->op_file == NULL)
+		return nfserr_jukebox;
 
-	strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner);
-	sop = find_openstateowner_str(strhashval, open);
-	open->op_stateowner = sop;
-	if (!sop) {
-		/* Make sure the client's lease hasn't expired. */
+	strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner);
+	oo = find_openstateowner_str(strhashval, open);
+	open->op_openowner = oo;
+	if (!oo) {
 		clp = find_confirmed_client(clientid);
 		if (clp == NULL)
 			return nfserr_expired;
-		goto renew;
+		goto new_owner;
 	}
-	/* When sessions are used, skip open sequenceid processing */
-	if (nfsd4_has_session(cstate))
-		goto renew;
-	if (!sop->so_confirmed) {
+	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
 		/* Replace unconfirmed owners without checking for replay. */
-		clp = sop->so_client;
-		release_openowner(sop);
-		open->op_stateowner = NULL;
-		goto renew;
-	}
-	if (open->op_seqid == sop->so_seqid - 1) {
-		if (sop->so_replay.rp_buflen)
-			return nfserr_replay_me;
-		/* The original OPEN failed so spectacularly
-		 * that we don't even have replay data saved!
-		 * Therefore, we have no choice but to continue
-		 * processing this OPEN; presumably, we'll
-		 * fail again for the same reason.
-		 */
-		dprintk("nfsd4_process_open1: replay with no replay cache\n");
-		goto renew;
-	}
-	if (open->op_seqid != sop->so_seqid)
-		return nfserr_bad_seqid;
-renew:
-	if (open->op_stateowner == NULL) {
-		sop = alloc_init_open_stateowner(strhashval, clp, open);
-		if (sop == NULL)
-			return nfserr_resource;
-		open->op_stateowner = sop;
+		clp = oo->oo_owner.so_client;
+		release_openowner(oo);
+		open->op_openowner = NULL;
+		goto new_owner;
 	}
-	list_del_init(&sop->so_close_lru);
-	renew_client(sop->so_client);
+	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
+	if (status)
+		return status;
+	clp = oo->oo_owner.so_client;
+	goto alloc_stateid;
+new_owner:
+	oo = alloc_init_open_stateowner(strhashval, clp, open);
+	if (oo == NULL)
+		return nfserr_jukebox;
+	open->op_openowner = oo;
+alloc_stateid:
+	open->op_stp = nfs4_alloc_stateid(clp);
+	if (!open->op_stp)
+		return nfserr_jukebox;
 	return nfs_ok;
 }
 
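nfsd4_process_open1() now pre-allocates open->op_file and open->op_stp before any work that is hard to undo, per the comment above: once the file has been created, a later allocation failure could no longer be reported cleanly. Whatever the open path does not consume is released in nfsd4_cleanup_open_state(), added further down. A minimal sketch of that allocate-early/free-late contract, using hypothetical names (not nfsd code):

    /* Sketch of the allocate-early pattern; names are hypothetical. */
    #include <stdlib.h>

    struct open_prealloc {
            void *file;     /* stands in for open->op_file */
            void *stid;     /* stands in for open->op_stp */
    };

    static int open_prepare(struct open_prealloc *a)
    {
            /* Take every allocation that could fail before any visible effect. */
            a->file = malloc(64);
            a->stid = malloc(64);
            if (!a->file || !a->stid)
                    return -1;      /* caller still runs open_cleanup() */
            return 0;
    }

    static void open_cleanup(struct open_prealloc *a)
    {
            /* The main path NULLs out whatever it consumed; free the rest. */
            free(a->file);
            free(a->stid);
            a->file = a->stid = NULL;
    }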
@@ -2500,36 +2622,37 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
 	return nfs_ok;
 }
 
-static struct nfs4_delegation *
-find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
+static int share_access_to_flags(u32 share_access)
 {
-	struct nfs4_delegation *dp;
+	share_access &= ~NFS4_SHARE_WANT_MASK;
 
-	spin_lock(&recall_lock);
-	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
-		if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
-			spin_unlock(&recall_lock);
-			return dp;
-		}
-	spin_unlock(&recall_lock);
-	return NULL;
+	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
 }
 
-static int share_access_to_flags(u32 share_access)
+static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
 {
-	share_access &= ~NFS4_SHARE_WANT_MASK;
+	struct nfs4_stid *ret;
 
-	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+	ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID);
+	if (!ret)
+		return NULL;
+	return delegstateid(ret);
+}
+
+static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
+{
+	return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+		open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH;
 }
 
 static __be32
-nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
+nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open,
 		struct nfs4_delegation **dp)
 {
 	int flags;
 	__be32 status = nfserr_bad_stateid;
 
-	*dp = find_delegation_file(fp, &open->op_delegate_stateid);
+	*dp = find_deleg_stateid(cl, &open->op_delegate_stateid);
 	if (*dp == NULL)
 		goto out;
 	flags = share_access_to_flags(open->op_share_access);
@@ -2537,41 +2660,37 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 	if (status)
 		*dp = NULL;
 out:
-	if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR)
+	if (!nfsd4_is_deleg_cur(open))
 		return nfs_ok;
 	if (status)
 		return status;
-	open->op_stateowner->so_confirmed = 1;
+	open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
 	return nfs_ok;
 }
 
 static __be32
-nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp)
+nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp)
 {
-	struct nfs4_stateid *local;
-	__be32 status = nfserr_share_denied;
-	struct nfs4_stateowner *sop = open->op_stateowner;
+	struct nfs4_ol_stateid *local;
+	struct nfs4_openowner *oo = open->op_openowner;
 
 	list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
 		/* ignore lock owners */
 		if (local->st_stateowner->so_is_open_owner == 0)
 			continue;
 		/* remember if we have seen this open owner */
-		if (local->st_stateowner == sop)
+		if (local->st_stateowner == &oo->oo_owner)
 			*stpp = local;
 		/* check for conflicting share reservations */
 		if (!test_share(local, open))
-			goto out;
+			return nfserr_share_denied;
 	}
-	status = 0;
-out:
-	return status;
+	return nfs_ok;
 }
 
-static inline struct nfs4_stateid *
-nfs4_alloc_stateid(void)
+static void nfs4_free_stateid(struct nfs4_ol_stateid *s)
 {
-	return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
+	kmem_cache_free(stateid_slab, s);
 }
 
 static inline int nfs4_access_to_access(u32 nfs4_access)
@@ -2592,12 +2711,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
 	int oflag = nfs4_access_to_omode(open->op_share_access);
 	int access = nfs4_access_to_access(open->op_share_access);
 
-	/* CLAIM_DELEGATE_CUR is used in response to a broken lease;
-	 * allowing it to break the lease and return EAGAIN leaves the
-	 * client unable to make progress in returning the delegation */
-	if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
-		access |= NFSD_MAY_NOT_BREAK_LEASE;
-
 	if (!fp->fi_fds[oflag]) {
 		status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
 			&fp->fi_fds[oflag]);
@@ -2609,27 +2722,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
 	return nfs_ok;
 }
 
-static __be32
-nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
-		struct nfs4_file *fp, struct svc_fh *cur_fh,
-		struct nfsd4_open *open)
-{
-	struct nfs4_stateid *stp;
-	__be32 status;
-
-	stp = nfs4_alloc_stateid();
-	if (stp == NULL)
-		return nfserr_resource;
-
-	status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
-	if (status) {
-		kmem_cache_free(stateid_slab, stp);
-		return status;
-	}
-	*stpp = stp;
-	return 0;
-}
-
 static inline __be32
 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 	       struct nfsd4_open *open)
@@ -2646,9 +2738,9 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 }
 
 static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
 {
-	u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+	u32 op_share_access = open->op_share_access;
 	bool new_access;
 	__be32 status;
 
@@ -2677,8 +2769,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
 static void
 nfs4_set_claim_prev(struct nfsd4_open *open)
 {
-	open->op_stateowner->so_confirmed = 1;
-	open->op_stateowner->so_client->cl_firststate = 1;
+	open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+	open->op_openowner->oo_owner.so_client->cl_firststate = 1;
 }
 
 /* Should we give out recallable state?: */
@@ -2721,7 +2813,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
 	if (!fl)
 		return -ENOMEM;
 	fl->fl_file = find_readable_file(fp);
-	list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
+	list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
 	status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
 	if (status) {
 		list_del_init(&dp->dl_perclnt);
@@ -2750,7 +2842,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
 	atomic_inc(&fp->fi_delegees);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	spin_unlock(&recall_lock);
-	list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
+	list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
 	return 0;
 }
 
@@ -2758,14 +2850,14 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
  * Attempt to hand out a delegation.
  */
 static void
-nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp)
+nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp)
 {
 	struct nfs4_delegation *dp;
-	struct nfs4_stateowner *sop = stp->st_stateowner;
+	struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
 	int cb_up;
 	int status, flag = 0;
 
-	cb_up = nfsd4_cb_channel_good(sop->so_client);
+	cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
 	flag = NFS4_OPEN_DELEGATE_NONE;
 	open->op_recall = 0;
 	switch (open->op_claim_type) {
@@ -2781,7 +2873,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 			 * had the chance to reclaim theirs.... */
 			if (locks_in_grace())
 				goto out;
-			if (!cb_up || !sop->so_confirmed)
+			if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
 				goto out;
 			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
 				flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2792,17 +2884,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 			goto out;
 	}
 
-	dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
+	dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
 	if (dp == NULL)
 		goto out_no_deleg;
 	status = nfs4_set_delegation(dp, flag);
 	if (status)
 		goto out_free;
 
-	memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
+	memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
 
 	dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
-		STATEID_VAL(&dp->dl_stateid));
+		STATEID_VAL(&dp->dl_stid.sc_stateid));
 out:
 	if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
 			&& flag == NFS4_OPEN_DELEGATE_NONE
@@ -2824,16 +2916,13 @@ __be32
 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
 	struct nfs4_file *fp = NULL;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
-	struct nfs4_stateid *stp = NULL;
+	struct nfs4_ol_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	__be32 status;
 
-	status = nfserr_inval;
-	if (!access_valid(open->op_share_access, resp->cstate.minorversion)
-			|| !deny_valid(open->op_share_deny))
-		goto out;
 	/*
 	 * Lookup file; if found, lookup stateid and check open request,
 	 * and check for delegations in the process of being recalled.
@@ -2843,17 +2932,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	if (fp) {
 		if ((status = nfs4_check_open(fp, open, &stp)))
 			goto out;
-		status = nfs4_check_deleg(fp, open, &dp);
+		status = nfs4_check_deleg(cl, fp, open, &dp);
 		if (status)
 			goto out;
 	} else {
 		status = nfserr_bad_stateid;
-		if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
-			goto out;
-		status = nfserr_resource;
-		fp = alloc_init_file(ino);
-		if (fp == NULL)
+		if (nfsd4_is_deleg_cur(open))
 			goto out;
+		status = nfserr_jukebox;
+		fp = open->op_file;
+		open->op_file = NULL;
+		nfsd4_init_file(fp, ino);
 	}
 
 	/*
@@ -2865,24 +2954,24 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
 		if (status)
 			goto out;
-		update_stateid(&stp->st_stateid);
 	} else {
-		status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
+		status = nfs4_get_vfs_file(rqstp, fp, current_fh, open);
 		if (status)
 			goto out;
-		init_stateid(stp, fp, open);
+		stp = open->op_stp;
+		open->op_stp = NULL;
+		init_open_stateid(stp, fp, open);
 		status = nfsd4_truncate(rqstp, current_fh, open);
 		if (status) {
 			release_open_stateid(stp);
 			goto out;
 		}
-		if (nfsd4_has_session(&resp->cstate))
-			update_stateid(&stp->st_stateid);
 	}
-	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
 	if (nfsd4_has_session(&resp->cstate))
-		open->op_stateowner->so_confirmed = 1;
+		open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
 
 	/*
 	 * Attempt to hand out a delegation. No error return, because the
@@ -2893,7 +2982,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	status = nfs_ok;
 
 	dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
-		STATEID_VAL(&stp->st_stateid));
+		STATEID_VAL(&stp->st_stid.sc_stateid));
 out:
 	if (fp)
 		put_nfs4_file(fp);
@@ -2903,13 +2992,34 @@ out:
 	 * To finish the open response, we just need to set the rflags.
 	 */
 	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
-	if (!open->op_stateowner->so_confirmed &&
+	if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
 	    !nfsd4_has_session(&resp->cstate))
 		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
 
 	return status;
 }
 
+void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
+{
+	if (open->op_openowner) {
+		struct nfs4_openowner *oo = open->op_openowner;
+
+		if (!list_empty(&oo->oo_owner.so_stateids))
+			list_del_init(&oo->oo_close_lru);
+		if (oo->oo_flags & NFS4_OO_NEW) {
+			if (status) {
+				release_openowner(oo);
+				open->op_openowner = NULL;
+			} else
+				oo->oo_flags &= ~NFS4_OO_NEW;
+		}
+	}
+	if (open->op_file)
+		nfsd4_free_file(open->op_file);
+	if (open->op_stp)
+		nfs4_free_stateid(open->op_stp);
+}
+
 __be32
 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    clientid_t *clid)
@@ -2930,7 +3040,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		dprintk("nfsd4_renew: clientid not found!\n");
 		goto out;
 	}
-	renew_client(clp);
 	status = nfserr_cb_path_down;
 	if (!list_empty(&clp->cl_delegations)
 			&& clp->cl_cb_state != NFSD4_CB_UP)
@@ -2962,7 +3071,7 @@ static time_t
 nfs4_laundromat(void)
 {
 	struct nfs4_client *clp;
-	struct nfs4_stateowner *sop;
+	struct nfs4_openowner *oo;
 	struct nfs4_delegation *dp;
 	struct list_head *pos, *next, reaplist;
 	time_t cutoff = get_seconds() - nfsd4_lease;
@@ -3019,16 +3128,14 @@ nfs4_laundromat(void)
 	}
 	test_val = nfsd4_lease;
 	list_for_each_safe(pos, next, &close_lru) {
-		sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
-		if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
-			u = sop->so_time - cutoff;
+		oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
+		if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
+			u = oo->oo_time - cutoff;
 			if (test_val > u)
 				test_val = u;
 			break;
 		}
-		dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
-			sop->so_id);
-		release_openowner(sop);
+		release_openowner(oo);
 	}
 	if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
 		clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -3050,30 +3157,17 @@ laundromat_main(struct work_struct *not_used)
 	queue_delayed_work(laundry_wq, &laundromat_work, t*HZ);
 }
 
-static struct nfs4_stateowner *
-search_close_lru(u32 st_id, int flags)
+static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
 {
-	struct nfs4_stateowner *local = NULL;
-
-	if (flags & CLOSE_STATE) {
-		list_for_each_entry(local, &close_lru, so_close_lru) {
-			if (local->so_id == st_id)
-				return local;
-		}
-	}
-	return NULL;
-}
-
-static inline int
-nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
-{
-	return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
+	if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode)
+		return nfserr_bad_stateid;
+	return nfs_ok;
 }
 
 static int
 STALE_STATEID(stateid_t *stateid)
 {
-	if (stateid->si_boot == boot_time)
+	if (stateid->si_opaque.so_clid.cl_boot == boot_time)
 		return 0;
 	dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
 		STATEID_VAL(stateid));
@@ -3096,7 +3190,7 @@ access_permit_write(unsigned long access_bmap)
 }
 
 static
-__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
+__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
 {
 	__be32 status = nfserr_openmode;
 
@@ -3139,68 +3233,80 @@ grace_disallows_io(struct inode *inode)
 	return locks_in_grace() && mandatory_lock(inode);
 }
 
-static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
+/* Returns true iff a is later than b: */
+static bool stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+	return (s32)a->si_generation - (s32)b->si_generation > 0;
+}
+
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
 {
 	/*
 	 * When sessions are used the stateid generation number is ignored
 	 * when it is zero.
 	 */
-	if ((flags & HAS_SESSION) && in->si_generation == 0)
-		goto out;
+	if (has_session && in->si_generation == 0)
+		return nfs_ok;
+
+	if (in->si_generation == ref->si_generation)
+		return nfs_ok;
 
 	/* If the client sends us a stateid from the future, it's buggy: */
-	if (in->si_generation > ref->si_generation)
+	if (stateid_generation_after(in, ref))
 		return nfserr_bad_stateid;
 	/*
-	 * The following, however, can happen.  For example, if the
-	 * client sends an open and some IO at the same time, the open
-	 * may bump si_generation while the IO is still in flight.
-	 * Thanks to hard links and renames, the client never knows what
-	 * file an open will affect.  So it could avoid that situation
-	 * only by serializing all opens and IO from the same open
-	 * owner.  To recover from the old_stateid error, the client
-	 * will just have to retry the IO:
+	 * However, we could see a stateid from the past, even from a
+	 * non-buggy client.  For example, if the client sends a lock
+	 * while some IO is outstanding, the lock may bump si_generation
+	 * while the IO is still in flight.  The client could avoid that
+	 * situation by waiting for responses on all the IO requests,
+	 * but better performance may result in retrying IO that
+	 * receives an old_stateid error if requests are rarely
+	 * reordered in flight:
 	 */
-	if (in->si_generation < ref->si_generation)
-		return nfserr_old_stateid;
-out:
-	return nfs_ok;
+	return nfserr_old_stateid;
 }
 
-static int is_delegation_stateid(stateid_t *stateid)
+__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 {
-	return stateid->si_fileid == 0;
-}
+	struct nfs4_stid *s;
+	struct nfs4_ol_stateid *ols;
+	__be32 status;
 
-static int is_open_stateid(struct nfs4_stateid *stateid)
-{
-	return stateid->st_openstp == NULL;
+	if (STALE_STATEID(stateid))
+		return nfserr_stale_stateid;
+
+	s = find_stateid(cl, stateid);
+	if (!s)
+		return nfserr_stale_stateid;
+	status = check_stateid_generation(stateid, &s->sc_stateid, 1);
+	if (status)
+		return status;
+	if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID)))
+		return nfs_ok;
+	ols = openlockstateid(s);
+	if (ols->st_stateowner->so_is_open_owner
+	    && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+		return nfserr_bad_stateid;
+	return nfs_ok;
 }
 
-__be32 nfs4_validate_stateid(stateid_t *stateid, int flags)
+static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s)
 {
-	struct nfs4_stateid *stp = NULL;
-	__be32 status = nfserr_stale_stateid;
+	struct nfs4_client *cl;
 
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		return nfserr_bad_stateid;
 	if (STALE_STATEID(stateid))
-		goto out;
-
-	status = nfserr_expired;
-	stp = search_for_stateid(stateid);
-	if (!stp)
-		goto out;
-	status = nfserr_bad_stateid;
-
-	if (!stp->st_stateowner->so_confirmed)
-		goto out;
-
-	status = check_stateid_generation(stateid, &stp->st_stateid, flags);
-	if (status)
-		goto out;
+		return nfserr_stale_stateid;
+	cl = find_confirmed_client(&stateid->si_opaque.so_clid);
+	if (!cl)
+		return nfserr_expired;
+	*s = find_stateid_by_type(cl, stateid, typemask);
+	if (!*s)
+		return nfserr_bad_stateid;
+	return nfs_ok;
 
-	status = nfs_ok;
-out:
-	return status;
 }
 
 /*
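stateid_generation_after() compares generation numbers modulo 2^32, the same serial-number-arithmetic trick used for TCP sequence numbers: the difference is evaluated in 32 bits and its sign is tested, so "later" still comes out right after si_generation wraps past 0xffffffff. A worked example (illustrative only; written subtract-then-cast, which yields the same result as the patch's cast-then-subtract):

    #include <stdio.h>
    #include <stdint.h>

    /* Same rule as stateid_generation_after(), outside the kernel. */
    static int generation_after(uint32_t a, uint32_t b)
    {
            return (int32_t)(a - b) > 0;
    }

    int main(void)
    {
            printf("%d\n", generation_after(5, 3));           /* 1: plainly later     */
            printf("%d\n", generation_after(1, 0xffffffffu)); /* 1: later across wrap */
            printf("%d\n", generation_after(0xffffffffu, 1)); /* 0: earlier           */
            return 0;
    }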
@@ -3210,7 +3316,8 @@ __be32
 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 			   stateid_t *stateid, int flags, struct file **filpp)
 {
-	struct nfs4_stateid *stp = NULL;
+	struct nfs4_stid *s;
+	struct nfs4_ol_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	struct svc_fh *current_fh = &cstate->current_fh;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
@@ -3222,60 +3329,47 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	if (grace_disallows_io(ino))
 		return nfserr_grace;
 
-	if (nfsd4_has_session(cstate))
-		flags |= HAS_SESSION;
-
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
 		return check_special_stateids(current_fh, stateid, flags);
 
-	status = nfserr_stale_stateid;
-	if (STALE_STATEID(stateid))
+	status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s);
+	if (status)
+		return status;
+	status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
+	if (status)
 		goto out;
-
-	/*
-	 * We assume that any stateid that has the current boot time,
-	 * but that we can't find, is expired:
-	 */
-	status = nfserr_expired;
-	if (is_delegation_stateid(stateid)) {
-		dp = find_delegation_stateid(ino, stateid);
-		if (!dp)
-			goto out;
-		status = check_stateid_generation(stateid, &dp->dl_stateid,
-						  flags);
-		if (status)
-			goto out;
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
+		dp = delegstateid(s);
 		status = nfs4_check_delegmode(dp, flags);
 		if (status)
 			goto out;
-		renew_client(dp->dl_client);
 		if (filpp) {
 			*filpp = dp->dl_file->fi_deleg_file;
 			BUG_ON(!*filpp);
 		}
-	} else { /* open or lock stateid */
-		stp = find_stateid(stateid, flags);
-		if (!stp)
-			goto out;
-		status = nfserr_bad_stateid;
-		if (nfs4_check_fh(current_fh, stp))
-			goto out;
-		if (!stp->st_stateowner->so_confirmed)
-			goto out;
-		status = check_stateid_generation(stateid, &stp->st_stateid,
-						  flags);
+		break;
+	case NFS4_OPEN_STID:
+	case NFS4_LOCK_STID:
+		stp = openlockstateid(s);
+		status = nfs4_check_fh(current_fh, stp);
 		if (status)
 			goto out;
+		if (stp->st_stateowner->so_is_open_owner
+		    && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+			goto out;
 		status = nfs4_check_openmode(stp, flags);
 		if (status)
 			goto out;
-		renew_client(stp->st_stateowner->so_client);
 		if (filpp) {
 			if (flags & RD_STATE)
 				*filpp = find_readable_file(stp->st_file);
 			else
 				*filpp = find_writeable_file(stp->st_file);
 		}
+		break;
+	default:
+		return nfserr_bad_stateid;
 	}
 	status = nfs_ok;
 out:
@@ -3283,18 +3377,9 @@ out:
 }
 
 static __be32
-nfsd4_free_delegation_stateid(stateid_t *stateid)
-{
-	struct nfs4_delegation *dp = search_for_delegation(stateid);
-	if (dp)
-		return nfserr_locks_held;
-	return nfserr_bad_stateid;
-}
-
-static __be32
-nfsd4_free_lock_stateid(struct nfs4_stateid *stp)
+nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
 {
-	if (check_for_locks(stp->st_file, stp->st_stateowner))
+	if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner)))
 		return nfserr_locks_held;
 	release_lock_stateid(stp);
 	return nfs_ok;
@@ -3307,51 +3392,40 @@ __be32
 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		   struct nfsd4_test_stateid *test_stateid)
 {
-	test_stateid->ts_has_session = nfsd4_has_session(cstate);
+	/* real work is done during encoding */
 	return nfs_ok;
 }
 
-/*
- * Free a state id
- */
 __be32
 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		   struct nfsd4_free_stateid *free_stateid)
 {
 	stateid_t *stateid = &free_stateid->fr_stateid;
-	struct nfs4_stateid *stp;
-	__be32 ret;
+	struct nfs4_stid *s;
+	struct nfs4_client *cl = cstate->session->se_client;
+	__be32 ret = nfserr_bad_stateid;
 
 	nfs4_lock_state();
-	if (is_delegation_stateid(stateid)) {
-		ret = nfsd4_free_delegation_stateid(stateid);
-		goto out;
-	}
-
-	stp = search_for_stateid(stateid);
-	if (!stp) {
-		ret = nfserr_bad_stateid;
+	s = find_stateid(cl, stateid);
+	if (!s)
 		goto out;
-	}
-	if (stateid->si_generation != 0) {
-		if (stateid->si_generation < stp->st_stateid.si_generation) {
-			ret = nfserr_old_stateid;
-			goto out;
-		}
-		if (stateid->si_generation > stp->st_stateid.si_generation) {
-			ret = nfserr_bad_stateid;
-			goto out;
-		}
-	}
-
-	if (is_open_stateid(stp)) {
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
 		ret = nfserr_locks_held;
 		goto out;
-	} else {
-		ret = nfsd4_free_lock_stateid(stp);
-		goto out;
+	case NFS4_OPEN_STID:
+	case NFS4_LOCK_STID:
+		ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+		if (ret)
+			goto out;
+		if (s->sc_type == NFS4_LOCK_STID)
+			ret = nfsd4_free_lock_stateid(openlockstateid(s));
+		else
+			ret = nfserr_locks_held;
+		break;
+	default:
+		ret = nfserr_bad_stateid;
 	}
-
 out:
 	nfs4_unlock_state();
 	return ret;
@@ -3364,124 +3438,64 @@ setlkflg (int type)
 		RD_STATE : WR_STATE;
 }
 
+static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	struct nfs4_stateowner *sop = stp->st_stateowner;
+	__be32 status;
+
+	status = nfsd4_check_seqid(cstate, sop, seqid);
+	if (status)
+		return status;
+	if (stp->st_stid.sc_type == NFS4_CLOSED_STID)
+		/*
+		 * "Closed" stateid's exist *only* to return
+		 * nfserr_replay_me from the previous step.
+		 */
+		return nfserr_bad_stateid;
+	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
+	if (status)
+		return status;
+	return nfs4_check_fh(current_fh, stp);
+}
+
 /*
  * Checks for sequence id mutating operations.
  */
 static __be32
 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
-			 stateid_t *stateid, int flags,
-			 struct nfs4_stateowner **sopp,
-			 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+			 stateid_t *stateid, char typemask,
+			 struct nfs4_ol_stateid **stpp)
 {
-	struct nfs4_stateid *stp;
-	struct nfs4_stateowner *sop;
-	struct svc_fh *current_fh = &cstate->current_fh;
 	__be32 status;
+	struct nfs4_stid *s;
 
 	dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
 		seqid, STATEID_VAL(stateid));
 
 	*stpp = NULL;
-	*sopp = NULL;
-
-	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
-		dprintk("NFSD: preprocess_seqid_op: magic stateid!\n");
-		return nfserr_bad_stateid;
-	}
-
-	if (STALE_STATEID(stateid))
-		return nfserr_stale_stateid;
-
-	if (nfsd4_has_session(cstate))
-		flags |= HAS_SESSION;
-
-	/*
-	 * We return BAD_STATEID if filehandle doesn't match stateid,
-	 * the confirmed flag is incorrecly set, or the generation
-	 * number is incorrect.
-	 */
-	stp = find_stateid(stateid, flags);
-	if (stp == NULL) {
-		/*
-		 * Also, we should make sure this isn't just the result of
-		 * a replayed close:
-		 */
-		sop = search_close_lru(stateid->si_stateownerid, flags);
-		/* It's not stale; let's assume it's expired: */
-		if (sop == NULL)
-			return nfserr_expired;
-		*sopp = sop;
-		goto check_replay;
-	}
-
-	*stpp = stp;
-	*sopp = sop = stp->st_stateowner;
-
-	if (lock) {
-		clientid_t *lockclid = &lock->v.new.clientid;
-		struct nfs4_client *clp = sop->so_client;
-		int lkflg = 0;
-		__be32 status;
-
-		lkflg = setlkflg(lock->lk_type);
-
-		if (lock->lk_is_new) {
-			if (!sop->so_is_open_owner)
-				return nfserr_bad_stateid;
-			if (!(flags & HAS_SESSION) &&
-			    !same_clid(&clp->cl_clientid, lockclid))
-				return nfserr_bad_stateid;
-			/* stp is the open stateid */
-			status = nfs4_check_openmode(stp, lkflg);
-			if (status)
-				return status;
-		} else {
-			/* stp is the lock stateid */
-			status = nfs4_check_openmode(stp->st_openstp, lkflg);
-			if (status)
-				return status;
-		}
-	}
+	status = nfsd4_lookup_stateid(stateid, typemask, &s);
+	if (status)
+		return status;
+	*stpp = openlockstateid(s);
+	cstate->replay_owner = (*stpp)->st_stateowner;
 
-	if (nfs4_check_fh(current_fh, stp)) {
-		dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n");
-		return nfserr_bad_stateid;
-	}
-
-	/*
-	 * We now validate the seqid and stateid generation numbers.
-	 * For the moment, we ignore the possibility of
-	 * generation number wraparound.
-	 */
-	if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
-		goto check_replay;
-
-	if (sop->so_confirmed && flags & CONFIRM) {
-		dprintk("NFSD: preprocess_seqid_op: expected"
-				" unconfirmed stateowner!\n");
-		return nfserr_bad_stateid;
-	}
-	if (!sop->so_confirmed && !(flags & CONFIRM)) {
-		dprintk("NFSD: preprocess_seqid_op: stateowner not"
-				" confirmed yet!\n");
-		return nfserr_bad_stateid;
-	}
-	status = check_stateid_generation(stateid, &stp->st_stateid, flags);
+	return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
+}
+
+static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp)
+{
+	__be32 status;
+	struct nfs4_openowner *oo;
+
+	status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
+						NFS4_OPEN_STID, stpp);
 	if (status)
 		return status;
-	renew_client(sop->so_client);
+	oo = openowner((*stpp)->st_stateowner);
+	if (!(oo->oo_flags & NFS4_OO_CONFIRMED))
+		return nfserr_bad_stateid;
 	return nfs_ok;
-
-check_replay:
-	if (seqid == sop->so_seqid - 1) {
-		dprintk("NFSD: preprocess_seqid_op: retransmission?\n");
-		/* indicate replay to calling function */
-		return nfserr_replay_me;
-	}
-	dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n",
-		sop->so_seqid, seqid);
-	*sopp = NULL;
-	return nfserr_bad_seqid;
 }
 
 __be32
@@ -3489,8 +3503,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		   struct nfsd4_open_confirm *oc)
 {
 	__be32 status;
-	struct nfs4_stateowner *sop;
-	struct nfs4_stateid *stp;
+	struct nfs4_openowner *oo;
+	struct nfs4_ol_stateid *stp;
 
 	dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
 			(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3502,38 +3516,52 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(cstate,
-					oc->oc_seqid, &oc->oc_req_stateid,
-					CONFIRM | OPEN_STATE,
-					&oc->oc_stateowner, &stp, NULL)))
+	status = nfs4_preprocess_seqid_op(cstate,
+					oc->oc_seqid, &oc->oc_req_stateid,
+					NFS4_OPEN_STID, &stp);
+	if (status)
 		goto out;
-
-	sop = oc->oc_stateowner;
-	sop->so_confirmed = 1;
-	update_stateid(&stp->st_stateid);
-	memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t));
+	oo = openowner(stp->st_stateowner);
+	status = nfserr_bad_stateid;
+	if (oo->oo_flags & NFS4_OO_CONFIRMED)
+		goto out;
+	oo->oo_flags |= NFS4_OO_CONFIRMED;
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
-		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid));
+		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
 
-	nfsd4_create_clid_dir(sop->so_client);
+	nfsd4_create_clid_dir(oo->oo_owner.so_client);
+	status = nfs_ok;
 out:
-	if (oc->oc_stateowner) {
-		nfs4_get_stateowner(oc->oc_stateowner);
-		cstate->replay_owner = oc->oc_stateowner;
-	}
-	nfs4_unlock_state();
+	if (!cstate->replay_owner)
+		nfs4_unlock_state();
 	return status;
 }
 
-static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access)
+static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
 {
-	int i;
+	if (!test_bit(access, &stp->st_access_bmap))
+		return;
+	nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
+	__clear_bit(access, &stp->st_access_bmap);
+}
 
-	for (i = 1; i < 4; i++) {
-		if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) {
-			nfs4_file_put_access(stp->st_file, i);
-			__clear_bit(i, &stp->st_access_bmap);
-		}
+static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
+{
+	switch (to_access) {
+	case NFS4_SHARE_ACCESS_READ:
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE);
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+		break;
+	case NFS4_SHARE_ACCESS_WRITE:
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ);
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+		break;
+	case NFS4_SHARE_ACCESS_BOTH:
+		break;
+	default:
+		BUG();
 	}
 }
 
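nfs4_stateid_downgrade() replaces the old loop over raw bit indices with explicit share-access cases: downgrading to READ releases the WRITE and BOTH accesses (and vice versa), each released bit also dropping its file reference via nfs4_file_put_access(). A compact sketch of the bitmap bookkeeping, with hypothetical constants (not nfsd code):

    #include <stdio.h>

    enum { ACCESS_READ = 1, ACCESS_WRITE = 2, ACCESS_BOTH = 3 };

    static unsigned long bmap = (1UL << ACCESS_READ) | (1UL << ACCESS_BOTH);

    static void downgrade_bit(unsigned int access)
    {
            if (!(bmap & (1UL << access)))
                    return;
            /* The real helper also drops a file reference here. */
            bmap &= ~(1UL << access);
    }

    int main(void)
    {
            /* OPEN_DOWNGRADE to READ: clear WRITE and BOTH, keep READ. */
            downgrade_bit(ACCESS_WRITE);
            downgrade_bit(ACCESS_BOTH);
            printf("bmap = %#lx\n", bmap);  /* 0x2: only ACCESS_READ left */
            return 0;
    }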
@@ -3553,24 +3581,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		     struct nfsd4_open_downgrade *od)
 {
 	__be32 status;
-	struct nfs4_stateid *stp;
+	struct nfs4_ol_stateid *stp;
 
 	dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n",
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
 
-	if (!access_valid(od->od_share_access, cstate->minorversion)
-			|| !deny_valid(od->od_share_deny))
-		return nfserr_inval;
+	/* We don't yet support WANT bits: */
+	od->od_share_access &= NFS4_SHARE_ACCESS_MASK;
 
 	nfs4_lock_state();
-	if ((status = nfs4_preprocess_seqid_op(cstate,
-					od->od_seqid,
-					&od->od_stateid,
-					OPEN_STATE,
-					&od->od_stateowner, &stp, NULL)))
+	status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
+					&od->od_stateid, &stp);
+	if (status)
 		goto out;
-
 	status = nfserr_inval;
 	if (!test_bit(od->od_share_access, &stp->st_access_bmap)) {
 		dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n",
@@ -3582,22 +3606,45 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 			stp->st_deny_bmap, od->od_share_deny);
 		goto out;
 	}
-	nfs4_file_downgrade(stp, od->od_share_access);
+	nfs4_stateid_downgrade(stp, od->od_share_access);
 
 	reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
 
-	update_stateid(&stp->st_stateid);
-	memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t));
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	status = nfs_ok;
 out:
-	if (od->od_stateowner) {
-		nfs4_get_stateowner(od->od_stateowner);
-		cstate->replay_owner = od->od_stateowner;
-	}
-	nfs4_unlock_state();
+	if (!cstate->replay_owner)
+		nfs4_unlock_state();
 	return status;
 }
 
+void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so)
+{
+	struct nfs4_openowner *oo;
+	struct nfs4_ol_stateid *s;
+
+	if (!so->so_is_open_owner)
+		return;
+	oo = openowner(so);
+	s = oo->oo_last_closed_stid;
+	if (!s)
+		return;
+	if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) {
+		/* Release the last_closed_stid on the next seqid bump: */
+		oo->oo_flags |= NFS4_OO_PURGE_CLOSE;
+		return;
+	}
+	oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
+	release_last_closed_stateid(oo);
+}
+
+static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+{
+	unhash_open_stateid(s);
+	s->st_stid.sc_type = NFS4_CLOSED_STID;
+}
+
 /*
  * nfs4_unlock_state() called after encode
  */
@@ -3606,39 +3653,37 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_close *close)
 {
 	__be32 status;
-	struct nfs4_stateid *stp;
+	struct nfs4_openowner *oo;
+	struct nfs4_ol_stateid *stp;
 
 	dprintk("NFSD: nfsd4_close on file %.*s\n",
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
 
 	nfs4_lock_state();
-	/* check close_lru for replay */
-	if ((status = nfs4_preprocess_seqid_op(cstate,
-					close->cl_seqid,
-					&close->cl_stateid,
-					OPEN_STATE | CLOSE_STATE,
-					&close->cl_stateowner, &stp, NULL)))
+	status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
+					&close->cl_stateid,
+					NFS4_OPEN_STID|NFS4_CLOSED_STID,
+					&stp);
+	if (status)
 		goto out;
+	oo = openowner(stp->st_stateowner);
 	status = nfs_ok;
-	update_stateid(&stp->st_stateid);
-	memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
-	/* release_stateid() calls nfsd_close() if needed */
-	release_open_stateid(stp);
+	nfsd4_close_open_stateid(stp);
+	oo->oo_last_closed_stid = stp;
 
 	/* place unused nfs4_stateowners on so_close_lru list to be
 	 * released by the laundromat service after the lease period
 	 * to enable us to handle CLOSE replay
 	 */
-	if (list_empty(&close->cl_stateowner->so_stateids))
-		move_to_close_lru(close->cl_stateowner);
+	if (list_empty(&oo->oo_owner.so_stateids))
+		move_to_close_lru(oo);
 out:
-	if (close->cl_stateowner) {
-		nfs4_get_stateowner(close->cl_stateowner);
-		cstate->replay_owner = close->cl_stateowner;
-	}
-	nfs4_unlock_state();
+	if (!cstate->replay_owner)
+		nfs4_unlock_state();
 	return status;
 }
 
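Instead of destroying the open stateid outright, CLOSE now marks it NFS4_CLOSED_STID and parks it in oo_last_closed_stid, so a retransmitted CLOSE can still look the stateid up (note the NFS4_OPEN_STID|NFS4_CLOSED_STID typemask above) and be answered as a replay, while owners with no remaining stateids move to close_lru for the laundromat to reap after a lease period. A small sketch of how the typemask keeps closed stateids visible only to the CLOSE path, with hypothetical types (not nfsd code):

    #include <stdio.h>

    enum { OPEN_STID = 1, LOCK_STID = 2, CLOSED_STID = 4 };

    struct stid { int sc_type; };

    static struct stid *lookup_by_type(struct stid *s, int typemask)
    {
            return (s->sc_type & typemask) ? s : NULL;
    }

    int main(void)
    {
            struct stid just_closed = { CLOSED_STID };

            /* I/O paths pass OPEN_STID|LOCK_STID and never see a closed stateid. */
            printf("%s\n", lookup_by_type(&just_closed, OPEN_STID | LOCK_STID)
                   ? "found" : "not found");
            /* CLOSE passes OPEN_STID|CLOSED_STID, so a retransmission still
             * matches and can be answered from the replay machinery. */
            printf("%s\n", lookup_by_type(&just_closed, OPEN_STID | CLOSED_STID)
                   ? "found" : "not found");
            return 0;
    }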
@@ -3648,34 +3693,22 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	struct nfs4_delegation *dp;
 	stateid_t *stateid = &dr->dr_stateid;
+	struct nfs4_stid *s;
 	struct inode *inode;
 	__be32 status;
-	int flags = 0;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
 		return status;
 	inode = cstate->current_fh.fh_dentry->d_inode;
 
-	if (nfsd4_has_session(cstate))
-		flags |= HAS_SESSION;
 	nfs4_lock_state();
-	status = nfserr_bad_stateid;
-	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
-		goto out;
-	status = nfserr_stale_stateid;
-	if (STALE_STATEID(stateid))
-		goto out;
-	status = nfserr_bad_stateid;
-	if (!is_delegation_stateid(stateid))
-		goto out;
-	status = nfserr_expired;
-	dp = find_delegation_stateid(inode, stateid);
-	if (!dp)
+	status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s);
+	if (status)
 		goto out;
-	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+	dp = delegstateid(s);
+	status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
 	if (status)
 		goto out;
-	renew_client(dp->dl_client);
 
 	unhash_delegation(dp);
 out:
@@ -3713,9 +3746,6 @@ last_byte_offset(u64 start, u64 len)
 	return end > start ? end - 1: NFS4_MAX_UINT64;
 }
 
-#define lockownerid_hashval(id) \
-	((id) & LOCK_HASH_MASK)
-
 static inline unsigned int
 lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
 		struct xdr_netobj *ownername)
@@ -3725,101 +3755,7 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
 		& LOCK_HASH_MASK;
 }
 
-static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE];
 static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
-static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
-
-static int
-same_stateid(stateid_t *id_one, stateid_t *id_two)
-{
-	if (id_one->si_stateownerid != id_two->si_stateownerid)
-		return 0;
-	return id_one->si_fileid == id_two->si_fileid;
-}
-
-static struct nfs4_stateid *
-find_stateid(stateid_t *stid, int flags)
-{
-	struct nfs4_stateid *local;
-	u32 st_id = stid->si_stateownerid;
-	u32 f_id = stid->si_fileid;
-	unsigned int hashval;
-
-	dprintk("NFSD: find_stateid flags 0x%x\n",flags);
-	if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
-		hashval = stateid_hashval(st_id, f_id);
-		list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
-			if ((local->st_stateid.si_stateownerid == st_id) &&
-			    (local->st_stateid.si_fileid == f_id))
-				return local;
-		}
-	}
-
-	if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
-		hashval = stateid_hashval(st_id, f_id);
-		list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
-			if ((local->st_stateid.si_stateownerid == st_id) &&
-			    (local->st_stateid.si_fileid == f_id))
-				return local;
-		}
-	}
-	return NULL;
-}
-
-static struct nfs4_stateid *
-search_for_stateid(stateid_t *stid)
-{
-	struct nfs4_stateid *local;
-	unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid);
-
-	list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
-		if (same_stateid(&local->st_stateid, stid))
-			return local;
-	}
-
-	list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
-		if (same_stateid(&local->st_stateid, stid))
-			return local;
-	}
-	return NULL;
-}
-
-static struct nfs4_delegation *
-search_for_delegation(stateid_t *stid)
-{
-	struct nfs4_file *fp;
-	struct nfs4_delegation *dp;
-	struct list_head *pos;
-	int i;
-
-	for (i = 0; i < FILE_HASH_SIZE; i++) {
-		list_for_each_entry(fp, &file_hashtbl[i], fi_hash) {
-			list_for_each(pos, &fp->fi_delegations) {
-				dp = list_entry(pos, struct nfs4_delegation, dl_perfile);
-				if (same_stateid(&dp->dl_stateid, stid))
-					return dp;
-			}
-		}
-	}
-	return NULL;
-}
-
-static struct nfs4_delegation *
-find_delegation_stateid(struct inode *ino, stateid_t *stid)
-{
-	struct nfs4_file *fp;
-	struct nfs4_delegation *dl;
-
-	dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__,
-		STATEID_VAL(stid));
-
-	fp = find_file(ino);
-	if (!fp)
-		return NULL;
-	dl = find_delegation_file(fp, stid);
-	put_nfs4_file(fp);
-	return dl;
-}
 
 /*
  * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3846,15 +3782,21 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = {
3846static inline void 3782static inline void
3847nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) 3783nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
3848{ 3784{
3849 struct nfs4_stateowner *sop; 3785 struct nfs4_lockowner *lo;
3850 3786
3851 if (fl->fl_lmops == &nfsd_posix_mng_ops) { 3787 if (fl->fl_lmops == &nfsd_posix_mng_ops) {
3852 sop = (struct nfs4_stateowner *) fl->fl_owner; 3788 lo = (struct nfs4_lockowner *) fl->fl_owner;
3853 kref_get(&sop->so_ref); 3789 deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data,
3854 deny->ld_sop = sop; 3790 lo->lo_owner.so_owner.len, GFP_KERNEL);
3855 deny->ld_clientid = sop->so_client->cl_clientid; 3791 if (!deny->ld_owner.data)
3792 /* We just don't care that much */
3793 goto nevermind;
3794 deny->ld_owner.len = lo->lo_owner.so_owner.len;
3795 deny->ld_clientid = lo->lo_owner.so_client->cl_clientid;
3856 } else { 3796 } else {
3857 deny->ld_sop = NULL; 3797nevermind:
3798 deny->ld_owner.len = 0;
3799 deny->ld_owner.data = NULL;
3858 deny->ld_clientid.cl_boot = 0; 3800 deny->ld_clientid.cl_boot = 0;
3859 deny->ld_clientid.cl_id = 0; 3801 deny->ld_clientid.cl_id = 0;
3860 } 3802 }
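
[editor's note] Instead of pinning the conflicting stateowner with a kref until encode time, the hunk above deep-copies the owner name and tolerates allocation failure by reporting no owner at all. A hedged userspace stand-in for the kmemdup() pattern:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct netobj { unsigned int len; char *data; };

    /* Copy the conflicting owner's name; on allocation failure report
     * "owner unknown" (len 0), which the encoder handles gracefully. */
    static void fill_denied_owner(struct netobj *dst, const struct netobj *src)
    {
            dst->data = malloc(src->len);
            if (!dst->data) {           /* "we just don't care that much" */
                    dst->len = 0;
                    return;
            }
            memcpy(dst->data, src->data, src->len);
            dst->len = src->len;
    }

    int main(void)
    {
            struct netobj owner = { 5, (char *)"lock1" };
            struct netobj denied;

            fill_denied_owner(&denied, &owner);
            printf("%.*s\n", (int)denied.len, denied.len ? denied.data : "");
            free(denied.data);
            return 0;
    }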
@@ -3867,8 +3809,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
3867 deny->ld_type = NFS4_WRITE_LT; 3809 deny->ld_type = NFS4_WRITE_LT;
3868} 3810}
3869 3811
3870static struct nfs4_stateowner * 3812static struct nfs4_lockowner *
3871find_lockstateowner_str(struct inode *inode, clientid_t *clid, 3813find_lockowner_str(struct inode *inode, clientid_t *clid,
3872 struct xdr_netobj *owner) 3814 struct xdr_netobj *owner)
3873{ 3815{
3874 unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); 3816 unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner);
@@ -3876,11 +3818,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
3876 3818
3877 list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { 3819 list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) {
3878 if (same_owner_str(op, owner, clid)) 3820 if (same_owner_str(op, owner, clid))
3879 return op; 3821 return lockowner(op);
3880 } 3822 }
3881 return NULL; 3823 return NULL;
3882} 3824}
3883 3825
3826static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
3827{
3828 list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]);
3829 list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
3830}
3831
3884/* 3832/*
3885 * Alloc a lock owner structure. 3833 * Alloc a lock owner structure.
3886 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 3834 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
@@ -3889,67 +3837,40 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
3889 * strhashval = lock_ownerstr_hashval 3837 * strhashval = lock_ownerstr_hashval
3890 */ 3838 */
3891 3839
3892static struct nfs4_stateowner * 3840static struct nfs4_lockowner *
3893alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { 3841alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) {
3894 struct nfs4_stateowner *sop; 3842 struct nfs4_lockowner *lo;
3895 struct nfs4_replay *rp;
3896 unsigned int idhashval;
3897 3843
3898 if (!(sop = alloc_stateowner(&lock->lk_new_owner))) 3844 lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
3845 if (!lo)
3899 return NULL; 3846 return NULL;
3900 idhashval = lockownerid_hashval(current_ownerid); 3847 INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
3901 INIT_LIST_HEAD(&sop->so_idhash); 3848 lo->lo_owner.so_is_open_owner = 0;
3902 INIT_LIST_HEAD(&sop->so_strhash);
3903 INIT_LIST_HEAD(&sop->so_perclient);
3904 INIT_LIST_HEAD(&sop->so_stateids);
3905 INIT_LIST_HEAD(&sop->so_perstateid);
3906 INIT_LIST_HEAD(&sop->so_close_lru); /* not used */
3907 sop->so_time = 0;
3908 list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]);
3909 list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]);
3910 list_add(&sop->so_perstateid, &open_stp->st_lockowners);
3911 sop->so_is_open_owner = 0;
3912 sop->so_id = current_ownerid++;
3913 sop->so_client = clp;
3914 /* It is the openowner seqid that will be incremented in encode in the 3849 /* It is the openowner seqid that will be incremented in encode in the
3915 * case of new lockowners; so increment the lock seqid manually: */ 3850 * case of new lockowners; so increment the lock seqid manually: */
3916 sop->so_seqid = lock->lk_new_lock_seqid + 1; 3851 lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1;
3917 sop->so_confirmed = 1; 3852 hash_lockowner(lo, strhashval, clp, open_stp);
3918 rp = &sop->so_replay; 3853 return lo;
3919 rp->rp_status = nfserr_serverfault;
3920 rp->rp_buflen = 0;
3921 rp->rp_buf = rp->rp_ibuf;
3922 return sop;
3923} 3854}
3924 3855
3925static struct nfs4_stateid * 3856static struct nfs4_ol_stateid *
3926alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) 3857alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp)
3927{ 3858{
3928 struct nfs4_stateid *stp; 3859 struct nfs4_ol_stateid *stp;
3929 unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); 3860 struct nfs4_client *clp = lo->lo_owner.so_client;
3930 3861
3931 stp = nfs4_alloc_stateid(); 3862 stp = nfs4_alloc_stateid(clp);
3932 if (stp == NULL) 3863 if (stp == NULL)
3933 goto out; 3864 return NULL;
3934 INIT_LIST_HEAD(&stp->st_hash); 3865 init_stid(&stp->st_stid, clp, NFS4_LOCK_STID);
3935 INIT_LIST_HEAD(&stp->st_perfile);
3936 INIT_LIST_HEAD(&stp->st_perstateowner);
3937 INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
3938 list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
3939 list_add(&stp->st_perfile, &fp->fi_stateids); 3866 list_add(&stp->st_perfile, &fp->fi_stateids);
3940 list_add(&stp->st_perstateowner, &sop->so_stateids); 3867 list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
3941 stp->st_stateowner = sop; 3868 stp->st_stateowner = &lo->lo_owner;
3942 get_nfs4_file(fp); 3869 get_nfs4_file(fp);
3943 stp->st_file = fp; 3870 stp->st_file = fp;
3944 stp->st_stateid.si_boot = boot_time;
3945 stp->st_stateid.si_stateownerid = sop->so_id;
3946 stp->st_stateid.si_fileid = fp->fi_id;
3947 stp->st_stateid.si_generation = 0;
3948 stp->st_access_bmap = 0; 3871 stp->st_access_bmap = 0;
3949 stp->st_deny_bmap = open_stp->st_deny_bmap; 3872 stp->st_deny_bmap = open_stp->st_deny_bmap;
3950 stp->st_openstp = open_stp; 3873 stp->st_openstp = open_stp;
3951
3952out:
3953 return stp; 3874 return stp;
3954} 3875}
3955 3876
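
[editor's note] Lock stateids now share the common nfs4_stid header initialized by init_stid() instead of filling si_boot/si_stateownerid/si_fileid by hand. A small sketch of the embedded-header initializer; the names and the sequential id counter are illustrative (the kernel draws ids from a per-client idr):

    #include <stdio.h>

    enum { T_OPEN = 1, T_LOCK = 2, T_DELEG = 4 };

    struct stid_hdr { unsigned char type; unsigned int generation, id; void *client; };
    struct lock_stateid { struct stid_hdr hdr; unsigned int access_bmap; };

    static unsigned int next_id = 1;

    static void init_stid(struct stid_hdr *s, void *client, unsigned char type)
    {
            s->type = type;
            s->generation = 0;          /* bumped by update_stateid() on use */
            s->id = next_id++;
            s->client = client;
    }

    int main(void)
    {
            struct lock_stateid ls;
            int client;

            init_stid(&ls.hdr, &client, T_LOCK);
            printf("type=%u id=%u\n", ls.hdr.type, ls.hdr.id);
            return 0;
    }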
@@ -3960,7 +3881,7 @@ check_lock_length(u64 offset, u64 length)
3960 LOFF_OVERFLOW(offset, length))); 3881 LOFF_OVERFLOW(offset, length)));
3961} 3882}
3962 3883
3963static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) 3884static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
3964{ 3885{
3965 struct nfs4_file *fp = lock_stp->st_file; 3886 struct nfs4_file *fp = lock_stp->st_file;
3966 int oflag = nfs4_access_to_omode(access); 3887 int oflag = nfs4_access_to_omode(access);
@@ -3978,15 +3899,16 @@ __be32
3978nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3899nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3979 struct nfsd4_lock *lock) 3900 struct nfsd4_lock *lock)
3980{ 3901{
3981 struct nfs4_stateowner *open_sop = NULL; 3902 struct nfs4_openowner *open_sop = NULL;
3982 struct nfs4_stateowner *lock_sop = NULL; 3903 struct nfs4_lockowner *lock_sop = NULL;
3983 struct nfs4_stateid *lock_stp; 3904 struct nfs4_ol_stateid *lock_stp;
3984 struct nfs4_file *fp; 3905 struct nfs4_file *fp;
3985 struct file *filp = NULL; 3906 struct file *filp = NULL;
3986 struct file_lock file_lock; 3907 struct file_lock file_lock;
3987 struct file_lock conflock; 3908 struct file_lock conflock;
3988 __be32 status = 0; 3909 __be32 status = 0;
3989 unsigned int strhashval; 3910 unsigned int strhashval;
3911 int lkflg;
3990 int err; 3912 int err;
3991 3913
3992 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 3914 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -4010,7 +3932,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4010 * Use open owner and open stateid to create lock owner and 3932 * Use open owner and open stateid to create lock owner and
4011 * lock stateid. 3933 * lock stateid.
4012 */ 3934 */
4013 struct nfs4_stateid *open_stp = NULL; 3935 struct nfs4_ol_stateid *open_stp = NULL;
4014 3936
4015 status = nfserr_stale_clientid; 3937 status = nfserr_stale_clientid;
4016 if (!nfsd4_has_session(cstate) && 3938 if (!nfsd4_has_session(cstate) &&
@@ -4018,26 +3940,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4018 goto out; 3940 goto out;
4019 3941
4020 /* validate and update open stateid and open seqid */ 3942 /* validate and update open stateid and open seqid */
4021 status = nfs4_preprocess_seqid_op(cstate, 3943 status = nfs4_preprocess_confirmed_seqid_op(cstate,
4022 lock->lk_new_open_seqid, 3944 lock->lk_new_open_seqid,
4023 &lock->lk_new_open_stateid, 3945 &lock->lk_new_open_stateid,
4024 OPEN_STATE, 3946 &open_stp);
4025 &lock->lk_replay_owner, &open_stp,
4026 lock);
4027 if (status) 3947 if (status)
4028 goto out; 3948 goto out;
4029 open_sop = lock->lk_replay_owner; 3949 open_sop = openowner(open_stp->st_stateowner);
3950 status = nfserr_bad_stateid;
3951 if (!nfsd4_has_session(cstate) &&
3952 !same_clid(&open_sop->oo_owner.so_client->cl_clientid,
3953 &lock->v.new.clientid))
3954 goto out;
4030 /* create lockowner and lock stateid */ 3955 /* create lockowner and lock stateid */
4031 fp = open_stp->st_file; 3956 fp = open_stp->st_file;
4032 strhashval = lock_ownerstr_hashval(fp->fi_inode, 3957 strhashval = lock_ownerstr_hashval(fp->fi_inode,
4033 open_sop->so_client->cl_clientid.cl_id, 3958 open_sop->oo_owner.so_client->cl_clientid.cl_id,
4034 &lock->v.new.owner); 3959 &lock->v.new.owner);
4035 /* XXX: Do we need to check for duplicate stateowners on 3960 /* XXX: Do we need to check for duplicate stateowners on
4036 * the same file, or should they just be allowed (and 3961 * the same file, or should they just be allowed (and
4037 * create new stateids)? */ 3962 * create new stateids)? */
4038 status = nfserr_resource; 3963 status = nfserr_jukebox;
4039 lock_sop = alloc_init_lock_stateowner(strhashval, 3964 lock_sop = alloc_init_lock_stateowner(strhashval,
4040 open_sop->so_client, open_stp, lock); 3965 open_sop->oo_owner.so_client, open_stp, lock);
4041 if (lock_sop == NULL) 3966 if (lock_sop == NULL)
4042 goto out; 3967 goto out;
4043 lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); 3968 lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
@@ -4046,16 +3971,20 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4046 } else { 3971 } else {
4047 /* lock (lock owner + lock stateid) already exists */ 3972 /* lock (lock owner + lock stateid) already exists */
4048 status = nfs4_preprocess_seqid_op(cstate, 3973 status = nfs4_preprocess_seqid_op(cstate,
4049 lock->lk_old_lock_seqid, 3974 lock->lk_old_lock_seqid,
4050 &lock->lk_old_lock_stateid, 3975 &lock->lk_old_lock_stateid,
4051 LOCK_STATE, 3976 NFS4_LOCK_STID, &lock_stp);
4052 &lock->lk_replay_owner, &lock_stp, lock);
4053 if (status) 3977 if (status)
4054 goto out; 3978 goto out;
4055 lock_sop = lock->lk_replay_owner; 3979 lock_sop = lockowner(lock_stp->st_stateowner);
4056 fp = lock_stp->st_file; 3980 fp = lock_stp->st_file;
4057 } 3981 }
4058 /* lock->lk_replay_owner and lock_stp have been created or found */ 3982 /* lock_sop and lock_stp have been created or found */
3983
3984 lkflg = setlkflg(lock->lk_type);
3985 status = nfs4_check_openmode(lock_stp, lkflg);
3986 if (status)
3987 goto out;
4059 3988
4060 status = nfserr_grace; 3989 status = nfserr_grace;
4061 if (locks_in_grace() && !lock->lk_reclaim) 3990 if (locks_in_grace() && !lock->lk_reclaim)
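
[editor's note] The new lkflg/nfs4_check_openmode step above rejects locks that exceed what the backing open allows — e.g. a write lock against a read-only open. A runnable restatement under simplified constants:

    #include <stdio.h>

    #define ACCESS_READ  1
    #define ACCESS_WRITE 2

    enum lock_type { READ_LT, WRITE_LT };

    static int setlkflg(enum lock_type t)       /* lock type -> needed access */
    {
            return t == WRITE_LT ? ACCESS_WRITE : ACCESS_READ;
    }

    static int check_openmode(unsigned int access_bmap, int needed)
    {
            return (access_bmap & needed) ? 0 : -1;   /* -1 ~ nfserr_openmode */
    }

    int main(void)
    {
            unsigned int read_only_open = ACCESS_READ;

            printf("%d\n", check_openmode(read_only_open, setlkflg(READ_LT)));  /* 0 */
            printf("%d\n", check_openmode(read_only_open, setlkflg(WRITE_LT))); /* -1 */
            return 0;
    }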
@@ -4106,8 +4035,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4106 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); 4035 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
4107 switch (-err) { 4036 switch (-err) {
4108 case 0: /* success! */ 4037 case 0: /* success! */
4109 update_stateid(&lock_stp->st_stateid); 4038 update_stateid(&lock_stp->st_stid.sc_stateid);
4110 memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, 4039 memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid,
4111 sizeof(stateid_t)); 4040 sizeof(stateid_t));
4112 status = 0; 4041 status = 0;
4113 break; 4042 break;
@@ -4119,19 +4048,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4119 case (EDEADLK): 4048 case (EDEADLK):
4120 status = nfserr_deadlock; 4049 status = nfserr_deadlock;
4121 break; 4050 break;
4122 default: 4051 default:
4123 dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); 4052 dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err);
4124 status = nfserr_resource; 4053 status = nfserrno(err);
4125 break; 4054 break;
4126 } 4055 }
4127out: 4056out:
4128 if (status && lock->lk_is_new && lock_sop) 4057 if (status && lock->lk_is_new && lock_sop)
4129 release_lockowner(lock_sop); 4058 release_lockowner(lock_sop);
4130 if (lock->lk_replay_owner) { 4059 if (!cstate->replay_owner)
4131 nfs4_get_stateowner(lock->lk_replay_owner); 4060 nfs4_unlock_state();
4132 cstate->replay_owner = lock->lk_replay_owner;
4133 }
4134 nfs4_unlock_state();
4135 return status; 4061 return status;
4136} 4062}
4137 4063
@@ -4163,6 +4089,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4163{ 4089{
4164 struct inode *inode; 4090 struct inode *inode;
4165 struct file_lock file_lock; 4091 struct file_lock file_lock;
4092 struct nfs4_lockowner *lo;
4166 int error; 4093 int error;
4167 __be32 status; 4094 __be32 status;
4168 4095
@@ -4172,19 +4099,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4172 if (check_lock_length(lockt->lt_offset, lockt->lt_length)) 4099 if (check_lock_length(lockt->lt_offset, lockt->lt_length))
4173 return nfserr_inval; 4100 return nfserr_inval;
4174 4101
4175 lockt->lt_stateowner = NULL;
4176 nfs4_lock_state(); 4102 nfs4_lock_state();
4177 4103
4178 status = nfserr_stale_clientid; 4104 status = nfserr_stale_clientid;
4179 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) 4105 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
4180 goto out; 4106 goto out;
4181 4107
4182 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { 4108 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
4183 dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n");
4184 if (status == nfserr_symlink)
4185 status = nfserr_inval;
4186 goto out; 4109 goto out;
4187 }
4188 4110
4189 inode = cstate->current_fh.fh_dentry->d_inode; 4111 inode = cstate->current_fh.fh_dentry->d_inode;
4190 locks_init_lock(&file_lock); 4112 locks_init_lock(&file_lock);
@@ -4203,10 +4125,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4203 goto out; 4125 goto out;
4204 } 4126 }
4205 4127
4206 lockt->lt_stateowner = find_lockstateowner_str(inode, 4128 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
4207 &lockt->lt_clientid, &lockt->lt_owner); 4129 if (lo)
4208 if (lockt->lt_stateowner) 4130 file_lock.fl_owner = (fl_owner_t)lo;
4209 file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
4210 file_lock.fl_pid = current->tgid; 4131 file_lock.fl_pid = current->tgid;
4211 file_lock.fl_flags = FL_POSIX; 4132 file_lock.fl_flags = FL_POSIX;
4212 4133
@@ -4234,7 +4155,7 @@ __be32
4234nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 4155nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4235 struct nfsd4_locku *locku) 4156 struct nfsd4_locku *locku)
4236{ 4157{
4237 struct nfs4_stateid *stp; 4158 struct nfs4_ol_stateid *stp;
4238 struct file *filp = NULL; 4159 struct file *filp = NULL;
4239 struct file_lock file_lock; 4160 struct file_lock file_lock;
4240 __be32 status; 4161 __be32 status;
@@ -4249,13 +4170,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4249 4170
4250 nfs4_lock_state(); 4171 nfs4_lock_state();
4251 4172
4252 if ((status = nfs4_preprocess_seqid_op(cstate, 4173 status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
4253 locku->lu_seqid, 4174 &locku->lu_stateid, NFS4_LOCK_STID, &stp);
4254 &locku->lu_stateid, 4175 if (status)
4255 LOCK_STATE,
4256 &locku->lu_stateowner, &stp, NULL)))
4257 goto out; 4176 goto out;
4258
4259 filp = find_any_file(stp->st_file); 4177 filp = find_any_file(stp->st_file);
4260 if (!filp) { 4178 if (!filp) {
4261 status = nfserr_lock_range; 4179 status = nfserr_lock_range;
@@ -4264,7 +4182,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4264 BUG_ON(!filp); 4182 BUG_ON(!filp);
4265 locks_init_lock(&file_lock); 4183 locks_init_lock(&file_lock);
4266 file_lock.fl_type = F_UNLCK; 4184 file_lock.fl_type = F_UNLCK;
4267 file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; 4185 file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
4268 file_lock.fl_pid = current->tgid; 4186 file_lock.fl_pid = current->tgid;
4269 file_lock.fl_file = filp; 4187 file_lock.fl_file = filp;
4270 file_lock.fl_flags = FL_POSIX; 4188 file_lock.fl_flags = FL_POSIX;
@@ -4285,15 +4203,12 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4285 /* 4203 /*
4286 * OK, unlock succeeded; the only thing left to do is update the stateid. 4204 * OK, unlock succeeded; the only thing left to do is update the stateid.
4287 */ 4205 */
4288 update_stateid(&stp->st_stateid); 4206 update_stateid(&stp->st_stid.sc_stateid);
4289 memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); 4207 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4290 4208
4291out: 4209out:
4292 if (locku->lu_stateowner) { 4210 if (!cstate->replay_owner)
4293 nfs4_get_stateowner(locku->lu_stateowner); 4211 nfs4_unlock_state();
4294 cstate->replay_owner = locku->lu_stateowner;
4295 }
4296 nfs4_unlock_state();
4297 return status; 4212 return status;
4298 4213
4299out_nfserr: 4214out_nfserr:
@@ -4307,7 +4222,7 @@ out_nfserr:
4307 * 0: no locks held by lockowner 4222 * 0: no locks held by lockowner
4308 */ 4223 */
4309static int 4224static int
4310check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) 4225check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4311{ 4226{
4312 struct file_lock **flpp; 4227 struct file_lock **flpp;
4313 struct inode *inode = filp->fi_inode; 4228 struct inode *inode = filp->fi_inode;
@@ -4332,7 +4247,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4332{ 4247{
4333 clientid_t *clid = &rlockowner->rl_clientid; 4248 clientid_t *clid = &rlockowner->rl_clientid;
4334 struct nfs4_stateowner *sop; 4249 struct nfs4_stateowner *sop;
4335 struct nfs4_stateid *stp; 4250 struct nfs4_lockowner *lo;
4251 struct nfs4_ol_stateid *stp;
4336 struct xdr_netobj *owner = &rlockowner->rl_owner; 4252 struct xdr_netobj *owner = &rlockowner->rl_owner;
4337 struct list_head matches; 4253 struct list_head matches;
4338 int i; 4254 int i;
@@ -4356,16 +4272,15 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4356 * data structures. */ 4272 * data structures. */
4357 INIT_LIST_HEAD(&matches); 4273 INIT_LIST_HEAD(&matches);
4358 for (i = 0; i < LOCK_HASH_SIZE; i++) { 4274 for (i = 0; i < LOCK_HASH_SIZE; i++) {
4359 list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { 4275 list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) {
4360 if (!same_owner_str(sop, owner, clid)) 4276 if (!same_owner_str(sop, owner, clid))
4361 continue; 4277 continue;
4362 list_for_each_entry(stp, &sop->so_stateids, 4278 list_for_each_entry(stp, &sop->so_stateids,
4363 st_perstateowner) { 4279 st_perstateowner) {
4364 if (check_for_locks(stp->st_file, sop)) 4280 lo = lockowner(sop);
4281 if (check_for_locks(stp->st_file, lo))
4365 goto out; 4282 goto out;
4366 /* Note: so_perclient unused for lockowners, 4283 list_add(&lo->lo_list, &matches);
4367 * so it's OK to fool with here. */
4368 list_add(&sop->so_perclient, &matches);
4369 } 4284 }
4370 } 4285 }
4371 } 4286 }
@@ -4374,12 +4289,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4374 * have been checked. */ 4289 * have been checked. */
4375 status = nfs_ok; 4290 status = nfs_ok;
4376 while (!list_empty(&matches)) { 4291 while (!list_empty(&matches)) {
4377 sop = list_entry(matches.next, struct nfs4_stateowner, 4292 lo = list_entry(matches.next, struct nfs4_lockowner,
4378 so_perclient); 4293 lo_list);
4379 /* unhash_stateowner deletes so_perclient only 4294 /* unhash_stateowner deletes so_perclient only
4380 * for openowners. */ 4295 * for openowners. */
4381 list_del(&sop->so_perclient); 4296 list_del(&lo->lo_list);
4382 release_lockowner(sop); 4297 release_lockowner(lo);
4383 } 4298 }
4384out: 4299out:
4385 nfs4_unlock_state(); 4300 nfs4_unlock_state();
@@ -4501,16 +4416,10 @@ nfs4_state_init(void)
4501 for (i = 0; i < FILE_HASH_SIZE; i++) { 4416 for (i = 0; i < FILE_HASH_SIZE; i++) {
4502 INIT_LIST_HEAD(&file_hashtbl[i]); 4417 INIT_LIST_HEAD(&file_hashtbl[i]);
4503 } 4418 }
4504 for (i = 0; i < OWNER_HASH_SIZE; i++) { 4419 for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) {
4505 INIT_LIST_HEAD(&ownerstr_hashtbl[i]); 4420 INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]);
4506 INIT_LIST_HEAD(&ownerid_hashtbl[i]);
4507 }
4508 for (i = 0; i < STATEID_HASH_SIZE; i++) {
4509 INIT_LIST_HEAD(&stateid_hashtbl[i]);
4510 INIT_LIST_HEAD(&lockstateid_hashtbl[i]);
4511 } 4421 }
4512 for (i = 0; i < LOCK_HASH_SIZE; i++) { 4422 for (i = 0; i < LOCK_HASH_SIZE; i++) {
4513 INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]);
4514 INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); 4423 INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
4515 } 4424 }
4516 memset(&onestateid, ~0, sizeof(stateid_t)); 4425 memset(&onestateid, ~0, sizeof(stateid_t));
@@ -4527,7 +4436,7 @@ nfsd4_load_reboot_recovery_data(void)
4527 int status; 4436 int status;
4528 4437
4529 nfs4_lock_state(); 4438 nfs4_lock_state();
4530 nfsd4_init_recdir(user_recovery_dirname); 4439 nfsd4_init_recdir();
4531 status = nfsd4_recdir_load(); 4440 status = nfsd4_recdir_load();
4532 nfs4_unlock_state(); 4441 nfs4_unlock_state();
4533 if (status) 4442 if (status)
@@ -4636,40 +4545,3 @@ nfs4_state_shutdown(void)
4636 nfs4_unlock_state(); 4545 nfs4_unlock_state();
4637 nfsd4_destroy_callback_queue(); 4546 nfsd4_destroy_callback_queue();
4638} 4547}
4639
4640/*
4641 * user_recovery_dirname is protected by the nfsd_mutex since it's only
4642 * accessed when nfsd is starting.
4643 */
4644static void
4645nfs4_set_recdir(char *recdir)
4646{
4647 strcpy(user_recovery_dirname, recdir);
4648}
4649
4650/*
4651 * Change the NFSv4 recovery directory to recdir.
4652 */
4653int
4654nfs4_reset_recoverydir(char *recdir)
4655{
4656 int status;
4657 struct path path;
4658
4659 status = kern_path(recdir, LOOKUP_FOLLOW, &path);
4660 if (status)
4661 return status;
4662 status = -ENOTDIR;
4663 if (S_ISDIR(path.dentry->d_inode->i_mode)) {
4664 nfs4_set_recdir(recdir);
4665 status = 0;
4666 }
4667 path_put(&path);
4668 return status;
4669}
4670
4671char *
4672nfs4_recoverydir(void)
4673{
4674 return user_recovery_dirname;
4675}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c8bf405d19de..66d095d7955e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -456,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
456{ 456{
457 DECODE_HEAD; 457 DECODE_HEAD;
458 458
459 close->cl_stateowner = NULL;
460 READ_BUF(4); 459 READ_BUF(4);
461 READ32(close->cl_seqid); 460 READ32(close->cl_seqid);
462 return nfsd4_decode_stateid(argp, &close->cl_stateid); 461 return nfsd4_decode_stateid(argp, &close->cl_stateid);
@@ -551,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
551{ 550{
552 DECODE_HEAD; 551 DECODE_HEAD;
553 552
554 lock->lk_replay_owner = NULL;
555 /* 553 /*
556 * type, reclaim(boolean), offset, length, new_lock_owner(boolean) 554 * type, reclaim(boolean), offset, length, new_lock_owner(boolean)
557 */ 555 */
@@ -611,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
611{ 609{
612 DECODE_HEAD; 610 DECODE_HEAD;
613 611
614 locku->lu_stateowner = NULL;
615 READ_BUF(8); 612 READ_BUF(8);
616 READ32(locku->lu_type); 613 READ32(locku->lu_type);
617 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) 614 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
@@ -642,6 +639,83 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
642 DECODE_TAIL; 639 DECODE_TAIL;
643} 640}
644 641
642static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x)
643{
644 __be32 *p;
645 u32 w;
646
647 READ_BUF(4);
648 READ32(w);
649 *x = w;
650 switch (w & NFS4_SHARE_ACCESS_MASK) {
651 case NFS4_SHARE_ACCESS_READ:
652 case NFS4_SHARE_ACCESS_WRITE:
653 case NFS4_SHARE_ACCESS_BOTH:
654 break;
655 default:
656 return nfserr_bad_xdr;
657 }
 658 w &= ~NFS4_SHARE_ACCESS_MASK;
659 if (!w)
660 return nfs_ok;
661 if (!argp->minorversion)
662 return nfserr_bad_xdr;
663 switch (w & NFS4_SHARE_WANT_MASK) {
664 case NFS4_SHARE_WANT_NO_PREFERENCE:
665 case NFS4_SHARE_WANT_READ_DELEG:
666 case NFS4_SHARE_WANT_WRITE_DELEG:
667 case NFS4_SHARE_WANT_ANY_DELEG:
668 case NFS4_SHARE_WANT_NO_DELEG:
669 case NFS4_SHARE_WANT_CANCEL:
670 break;
671 default:
672 return nfserr_bad_xdr;
673 }
674 w &= ~NFS4_SHARE_WANT_MASK;
675 if (!w)
676 return nfs_ok;
677 switch (w) {
678 case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL:
679 case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED:
680 case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL |
681 NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED):
682 return nfs_ok;
683 }
684xdr_error:
685 return nfserr_bad_xdr;
686}
687
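[editor's note] nfsd4_decode_share_access() validates in layers: the access bits first, then (4.1 only) the want bits, then the deleg-signaling flags. A compilable sketch of the same layering; the mask values are assumed stand-ins for the NFS4_SHARE_* constants, and the final signal-flag cases are collapsed into a single leftover check:

    #include <stdio.h>

    #define SHARE_ACCESS_MASK 0x0003    /* READ=1, WRITE=2, BOTH=3 */
    #define SHARE_WANT_MASK   0xff00    /* assumed placement of the want bits */

    static int decode_share_access(unsigned int w, int minorversion)
    {
            switch (w & SHARE_ACCESS_MASK) {
            case 1: case 2: case 3:
                    break;
            default:
                    return -1;          /* nfserr_bad_xdr */
            }
            w &= ~SHARE_ACCESS_MASK;    /* bitwise NOT: clear only the access bits */
            if (!w)
                    return 0;           /* a plain v4.0-style request */
            if (!minorversion)
                    return -1;          /* extra bits are invalid in 4.0 */
            w &= ~SHARE_WANT_MASK;
            return w ? -1 : 0;          /* signal flags collapsed in this sketch */
    }

    int main(void)
    {
            printf("%d\n", decode_share_access(0x2, 0));          /* 0 */
            printf("%d\n", decode_share_access(0x2 | 0x100, 0));  /* -1: want bit in 4.0 */
            return 0;
    }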
688static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
689{
690 __be32 *p;
691
692 READ_BUF(4);
693 READ32(*x);
 694 /* Note: unlike access bits, deny bits may be zero. */
695 if (*x & ~NFS4_SHARE_DENY_BOTH)
696 return nfserr_bad_xdr;
697 return nfs_ok;
698xdr_error:
699 return nfserr_bad_xdr;
700}
701
702static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
703{
704 __be32 *p;
705
706 READ_BUF(4);
707 READ32(o->len);
708
709 if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
710 return nfserr_bad_xdr;
711
712 READ_BUF(o->len);
713 SAVEMEM(o->data, o->len);
714 return nfs_ok;
715xdr_error:
716 return nfserr_bad_xdr;
717}
718
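[editor's note] The new nfsd4_decode_opaque() is the bounded read reused for owner and client names below. A userspace equivalent with an explicit cursor in place of READ_BUF/SAVEMEM (the real NFS4_OPAQUE_LIMIT value is not assumed here):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>   /* ntohl */

    #define OPAQUE_LIMIT 1024

    struct cursor { const unsigned char *p, *end; };

    static int decode_opaque(struct cursor *c, const unsigned char **data,
                             unsigned int *len)
    {
            uint32_t n;

            if (c->end - c->p < 4)
                    return -1;               /* short buffer */
            memcpy(&n, c->p, 4);
            n = ntohl(n);
            c->p += 4;
            if (n == 0 || n > OPAQUE_LIMIT)
                    return -1;               /* nfserr_bad_xdr */
            if ((uint32_t)(c->end - c->p) < n)
                    return -1;
            *data = c->p;
            *len = n;
            c->p += n;
            return 0;
    }

    int main(void)
    {
            unsigned char buf[8] = { 0, 0, 0, 4, 'a', 'b', 'c', 'd' };
            struct cursor c = { buf, buf + sizeof(buf) };
            const unsigned char *d;
            unsigned int len;

            if (decode_opaque(&c, &d, &len) == 0)
                    printf("%.*s\n", (int)len, d);
            return 0;
    }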
645static __be32 719static __be32
646nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) 720nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
647{ 721{
@@ -649,19 +723,23 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
649 723
650 memset(open->op_bmval, 0, sizeof(open->op_bmval)); 724 memset(open->op_bmval, 0, sizeof(open->op_bmval));
651 open->op_iattr.ia_valid = 0; 725 open->op_iattr.ia_valid = 0;
652 open->op_stateowner = NULL; 726 open->op_openowner = NULL;
653 727
654 /* seqid, share_access, share_deny, clientid, ownerlen */ 728 /* seqid, share_access, share_deny, clientid, ownerlen */
655 READ_BUF(16 + sizeof(clientid_t)); 729 READ_BUF(4);
656 READ32(open->op_seqid); 730 READ32(open->op_seqid);
657 READ32(open->op_share_access); 731 status = nfsd4_decode_share_access(argp, &open->op_share_access);
658 READ32(open->op_share_deny); 732 if (status)
733 goto xdr_error;
734 status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
735 if (status)
736 goto xdr_error;
737 READ_BUF(sizeof(clientid_t));
659 COPYMEM(&open->op_clientid, sizeof(clientid_t)); 738 COPYMEM(&open->op_clientid, sizeof(clientid_t));
660 READ32(open->op_owner.len); 739 status = nfsd4_decode_opaque(argp, &open->op_owner);
661 740 if (status)
662 /* owner, open_flag */ 741 goto xdr_error;
663 READ_BUF(open->op_owner.len + 4); 742 READ_BUF(4);
664 SAVEMEM(open->op_owner.data, open->op_owner.len);
665 READ32(open->op_create); 743 READ32(open->op_create);
666 switch (open->op_create) { 744 switch (open->op_create) {
667 case NFS4_OPEN_NOCREATE: 745 case NFS4_OPEN_NOCREATE:
@@ -727,6 +805,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
727 if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) 805 if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
728 return status; 806 return status;
729 break; 807 break;
808 case NFS4_OPEN_CLAIM_FH:
809 case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
810 if (argp->minorversion < 1)
811 goto xdr_error;
812 /* void */
813 break;
814 case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
815 if (argp->minorversion < 1)
816 goto xdr_error;
817 status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
818 if (status)
819 return status;
820 break;
730 default: 821 default:
731 goto xdr_error; 822 goto xdr_error;
732 } 823 }
@@ -739,7 +830,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
739{ 830{
740 DECODE_HEAD; 831 DECODE_HEAD;
741 832
742 open_conf->oc_stateowner = NULL;
743 status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); 833 status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
744 if (status) 834 if (status)
745 return status; 835 return status;
@@ -754,15 +844,17 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
754{ 844{
755 DECODE_HEAD; 845 DECODE_HEAD;
756 846
757 open_down->od_stateowner = NULL;
758 status = nfsd4_decode_stateid(argp, &open_down->od_stateid); 847 status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
759 if (status) 848 if (status)
760 return status; 849 return status;
761 READ_BUF(12); 850 READ_BUF(4);
762 READ32(open_down->od_seqid); 851 READ32(open_down->od_seqid);
763 READ32(open_down->od_share_access); 852 status = nfsd4_decode_share_access(argp, &open_down->od_share_access);
764 READ32(open_down->od_share_deny); 853 if (status)
765 854 return status;
855 status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
856 if (status)
857 return status;
766 DECODE_TAIL; 858 DECODE_TAIL;
767} 859}
768 860
@@ -903,12 +995,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
903{ 995{
904 DECODE_HEAD; 996 DECODE_HEAD;
905 997
906 READ_BUF(12); 998 READ_BUF(8);
907 COPYMEM(setclientid->se_verf.data, 8); 999 COPYMEM(setclientid->se_verf.data, 8);
908 READ32(setclientid->se_namelen);
909 1000
910 READ_BUF(setclientid->se_namelen + 8); 1001 status = nfsd4_decode_opaque(argp, &setclientid->se_name);
911 SAVEMEM(setclientid->se_name, setclientid->se_namelen); 1002 if (status)
1003 return nfserr_bad_xdr;
1004 READ_BUF(8);
912 READ32(setclientid->se_callback_prog); 1005 READ32(setclientid->se_callback_prog);
913 READ32(setclientid->se_callback_netid_len); 1006 READ32(setclientid->se_callback_netid_len);
914 1007
@@ -1051,11 +1144,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1051 READ_BUF(NFS4_VERIFIER_SIZE); 1144 READ_BUF(NFS4_VERIFIER_SIZE);
1052 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); 1145 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1053 1146
1054 READ_BUF(4); 1147 status = nfsd4_decode_opaque(argp, &exid->clname);
1055 READ32(exid->clname.len); 1148 if (status)
1056 1149 return nfserr_bad_xdr;
1057 READ_BUF(exid->clname.len);
1058 SAVEMEM(exid->clname.data, exid->clname.len);
1059 1150
1060 READ_BUF(4); 1151 READ_BUF(4);
1061 READ32(exid->flags); 1152 READ32(exid->flags);
@@ -1326,6 +1417,16 @@ xdr_error:
1326 goto out; 1417 goto out;
1327} 1418}
1328 1419
1420static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc)
1421{
1422 DECODE_HEAD;
1423
1424 READ_BUF(8);
1425 COPYMEM(&dc->clientid, 8);
1426
1427 DECODE_TAIL;
1428}
1429
1329static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) 1430static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
1330{ 1431{
1331 DECODE_HEAD; 1432 DECODE_HEAD;
@@ -1447,7 +1548,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1447 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1548 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1448 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, 1549 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid,
1449 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1550 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1450 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, 1551 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
1451 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, 1552 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
1452}; 1553};
1453 1554
@@ -1630,15 +1731,20 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1630 * we know whether the error to be returned is a sequence id mutating error. 1731 * we know whether the error to be returned is a sequence id mutating error.
1631 */ 1732 */
1632 1733
1633#define ENCODE_SEQID_OP_TAIL(stateowner) do { \ 1734static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr)
1634 if (seqid_mutating_err(nfserr) && stateowner) { \ 1735{
1635 stateowner->so_seqid++; \ 1736 struct nfs4_stateowner *stateowner = resp->cstate.replay_owner;
1636 stateowner->so_replay.rp_status = nfserr; \ 1737
1637 stateowner->so_replay.rp_buflen = \ 1738 if (seqid_mutating_err(ntohl(nfserr)) && stateowner) {
1638 (((char *)(resp)->p - (char *)save)); \ 1739 stateowner->so_seqid++;
1639 memcpy(stateowner->so_replay.rp_buf, save, \ 1740 stateowner->so_replay.rp_status = nfserr;
1640 stateowner->so_replay.rp_buflen); \ 1741 stateowner->so_replay.rp_buflen =
1641 } } while (0); 1742 (char *)resp->p - (char *)save;
1743 memcpy(stateowner->so_replay.rp_buf, save,
1744 stateowner->so_replay.rp_buflen);
1745 nfsd4_purge_closed_stateid(stateowner);
1746 }
1747}
1642 1748
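[editor's note] The macro-turned-function caches the reply bytes of a failed seqid-mutating op so that an identical retry can be answered verbatim from the replay buffer. A minimal model of that snapshot; the buffer size and error value are illustrative:

    #include <stdio.h>
    #include <string.h>

    struct replay { int status; size_t buflen; char buf[128]; };
    struct owner  { unsigned int seqid; struct replay rp; };

    static void seqid_op_tail(struct owner *so, const char *save,
                              const char *cur, int nfserr)
    {
            if (!so || !nfserr)              /* stand-in for seqid_mutating_err() */
                    return;
            so->seqid++;                     /* a retry must use the next seqid */
            so->rp.status = nfserr;
            so->rp.buflen = (size_t)(cur - save);
            memcpy(so->rp.buf, save, so->rp.buflen);
    }

    int main(void)
    {
            struct owner o = { .seqid = 3 };
            char encoded[] = "OP|status|payload";

            seqid_op_tail(&o, encoded, encoded + sizeof(encoded) - 1, 10008);
            printf("seqid=%u cached=%zu bytes\n", o.seqid, o.rp.buflen);
            return 0;
    }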
1643/* Encode as an array of strings the string given with components 1749/* Encode as an array of strings the string given with components
1644 * separated @sep. 1750 * separated @sep.
@@ -1697,36 +1803,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
1697} 1803}
1698 1804
1699/* 1805/*
1700 * Return the path to an export point in the pseudo filesystem namespace 1806 * Encode a path in RFC3530 'pathname4' format
1701 * Returned string is safe to use as long as the caller holds a reference
1702 * to @exp.
1703 */ 1807 */
1704static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) 1808static __be32 nfsd4_encode_path(const struct path *root,
1809 const struct path *path, __be32 **pp, int *buflen)
1705{ 1810{
1706 struct svc_fh tmp_fh; 1811 struct path cur = {
1707 char *path = NULL, *rootpath; 1812 .mnt = path->mnt,
1708 size_t rootlen; 1813 .dentry = path->dentry,
1814 };
1815 __be32 *p = *pp;
1816 struct dentry **components = NULL;
1817 unsigned int ncomponents = 0;
1818 __be32 err = nfserr_jukebox;
1709 1819
1710 fh_init(&tmp_fh, NFS4_FHSIZE); 1820 dprintk("nfsd4_encode_components(");
1711 *stat = exp_pseudoroot(rqstp, &tmp_fh);
1712 if (*stat)
1713 return NULL;
1714 rootpath = tmp_fh.fh_export->ex_pathname;
1715 1821
1716 path = exp->ex_pathname; 1822 path_get(&cur);
1823 /* First walk the path up to the nfsd root, and store the
1824 * dentries/path components in an array.
1825 */
1826 for (;;) {
1827 if (cur.dentry == root->dentry && cur.mnt == root->mnt)
1828 break;
1829 if (cur.dentry == cur.mnt->mnt_root) {
1830 if (follow_up(&cur))
1831 continue;
1832 goto out_free;
1833 }
1834 if ((ncomponents & 15) == 0) {
1835 struct dentry **new;
1836 new = krealloc(components,
1837 sizeof(*new) * (ncomponents + 16),
1838 GFP_KERNEL);
1839 if (!new)
1840 goto out_free;
1841 components = new;
1842 }
1843 components[ncomponents++] = cur.dentry;
1844 cur.dentry = dget_parent(cur.dentry);
1845 }
1717 1846
1718 rootlen = strlen(rootpath); 1847 *buflen -= 4;
1719 if (strncmp(path, rootpath, rootlen)) { 1848 if (*buflen < 0)
1720 dprintk("nfsd: fs_locations failed;" 1849 goto out_free;
1721 "%s is not contained in %s\n", path, rootpath); 1850 WRITE32(ncomponents);
1722 *stat = nfserr_notsupp; 1851
1723 path = NULL; 1852 while (ncomponents) {
1724 goto out; 1853 struct dentry *dentry = components[ncomponents - 1];
1854 unsigned int len = dentry->d_name.len;
1855
1856 *buflen -= 4 + (XDR_QUADLEN(len) << 2);
1857 if (*buflen < 0)
1858 goto out_free;
1859 WRITE32(len);
1860 WRITEMEM(dentry->d_name.name, len);
1861 dprintk("/%s", dentry->d_name.name);
1862 dput(dentry);
1863 ncomponents--;
1725 } 1864 }
1726 path += rootlen; 1865
1727out: 1866 *pp = p;
1728 fh_put(&tmp_fh); 1867 err = 0;
1729 return path; 1868out_free:
1869 dprintk(")\n");
1870 while (ncomponents)
1871 dput(components[--ncomponents]);
1872 kfree(components);
1873 path_put(&cur);
1874 return err;
1875}
1876
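[editor's note] nfsd4_encode_path() gathers dentries from the target up to the export root, growing the component array in chunks of 16 (the krealloc() call), then emits them in reverse, root-first. The same strategy in plain C over a toy parent chain:

    #include <stdio.h>
    #include <stdlib.h>

    struct node { const char *name; struct node *parent; };

    static int encode_path(const struct node *root, const struct node *leaf)
    {
            const char **comp = NULL, **tmp;
            unsigned int n = 0;
            const struct node *cur = leaf;

            while (cur != root) {            /* collect leaf -> root */
                    if ((n & 15) == 0) {     /* grow in chunks of 16, as krealloc */
                            tmp = realloc(comp, sizeof(*comp) * (n + 16));
                            if (!tmp) {
                                    free(comp);
                                    return -1;
                            }
                            comp = tmp;
                    }
                    comp[n++] = cur->name;
                    cur = cur->parent;
            }
            while (n)                        /* emit root -> leaf */
                    printf("/%s", comp[--n]);
            printf("\n");
            free(comp);
            return 0;
    }

    int main(void)
    {
            struct node root = { "", NULL };
            struct node a = { "export", &root }, b = { "data", &a };

            return encode_path(&root, &b);   /* prints /export/data */
    }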
1877static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
1878 const struct path *path, __be32 **pp, int *buflen)
1879{
1880 struct svc_export *exp_ps;
1881 __be32 res;
1882
1883 exp_ps = rqst_find_fsidzero_export(rqstp);
1884 if (IS_ERR(exp_ps))
1885 return nfserrno(PTR_ERR(exp_ps));
1886 res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen);
1887 exp_put(exp_ps);
1888 return res;
1730} 1889}
1731 1890
1732/* 1891/*
@@ -1740,11 +1899,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
1740 int i; 1899 int i;
1741 __be32 *p = *pp; 1900 __be32 *p = *pp;
1742 struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; 1901 struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
1743 char *root = nfsd4_path(rqstp, exp, &status);
1744 1902
1745 if (status) 1903 status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen);
1746 return status;
1747 status = nfsd4_encode_components('/', root, &p, buflen);
1748 if (status) 1904 if (status)
1749 return status; 1905 return status;
1750 if ((*buflen -= 4) < 0) 1906 if ((*buflen -= 4) < 0)
@@ -1760,12 +1916,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
1760 return 0; 1916 return 0;
1761} 1917}
1762 1918
1763static u32 nfs4_ftypes[16] = { 1919static u32 nfs4_file_type(umode_t mode)
1764 NF4BAD, NF4FIFO, NF4CHR, NF4BAD, 1920{
1765 NF4DIR, NF4BAD, NF4BLK, NF4BAD, 1921 switch (mode & S_IFMT) {
1766 NF4REG, NF4BAD, NF4LNK, NF4BAD, 1922 case S_IFIFO: return NF4FIFO;
1767 NF4SOCK, NF4BAD, NF4LNK, NF4BAD, 1923 case S_IFCHR: return NF4CHR;
1768}; 1924 case S_IFDIR: return NF4DIR;
1925 case S_IFBLK: return NF4BLK;
1926 case S_IFLNK: return NF4LNK;
1927 case S_IFREG: return NF4REG;
1928 case S_IFSOCK: return NF4SOCK;
1929 default: return NF4BAD;
 1930 }
1931}
1769 1932
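[editor's note] The switch replaces indexing a 16-entry table by (mode & S_IFMT) >> 12, which silently depended on the S_IF* bit layout and carried duplicate slots. A quick compilable check of the mapping (the enum values here happen to follow RFC 3530 ordering but are illustrative):

    #include <stdio.h>
    #include <sys/stat.h>

    enum { NF4BAD, NF4REG, NF4DIR, NF4BLK, NF4CHR, NF4LNK, NF4SOCK, NF4FIFO };

    static unsigned int file_type(mode_t mode)
    {
            switch (mode & S_IFMT) {
            case S_IFIFO:  return NF4FIFO;
            case S_IFCHR:  return NF4CHR;
            case S_IFDIR:  return NF4DIR;
            case S_IFBLK:  return NF4BLK;
            case S_IFLNK:  return NF4LNK;
            case S_IFREG:  return NF4REG;
            case S_IFSOCK: return NF4SOCK;
            default:       return NF4BAD;
            }
    }

    int main(void)
    {
            printf("%u %u\n", file_type(S_IFREG | 0644), file_type(S_IFDIR | 0755));
            return 0;
    }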
1770static __be32 1933static __be32
1771nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, 1934nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
@@ -1954,7 +2117,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1954 if (bmval0 & FATTR4_WORD0_TYPE) { 2117 if (bmval0 & FATTR4_WORD0_TYPE) {
1955 if ((buflen -= 4) < 0) 2118 if ((buflen -= 4) < 0)
1956 goto out_resource; 2119 goto out_resource;
1957 dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; 2120 dummy = nfs4_file_type(stat.mode);
1958 if (dummy == NF4BAD) 2121 if (dummy == NF4BAD)
1959 goto out_serverfault; 2122 goto out_serverfault;
1960 WRITE32(dummy); 2123 WRITE32(dummy);
@@ -2488,7 +2651,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
2488 if (!nfserr) 2651 if (!nfserr)
2489 nfsd4_encode_stateid(resp, &close->cl_stateid); 2652 nfsd4_encode_stateid(resp, &close->cl_stateid);
2490 2653
2491 ENCODE_SEQID_OP_TAIL(close->cl_stateowner); 2654 encode_seqid_op_tail(resp, save, nfserr);
2492 return nfserr; 2655 return nfserr;
2493} 2656}
2494 2657
@@ -2564,17 +2727,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
2564static void 2727static void
2565nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) 2728nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
2566{ 2729{
2730 struct xdr_netobj *conf = &ld->ld_owner;
2567 __be32 *p; 2731 __be32 *p;
2568 2732
2569 RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); 2733 RESERVE_SPACE(32 + XDR_LEN(conf->len));
2570 WRITE64(ld->ld_start); 2734 WRITE64(ld->ld_start);
2571 WRITE64(ld->ld_length); 2735 WRITE64(ld->ld_length);
2572 WRITE32(ld->ld_type); 2736 WRITE32(ld->ld_type);
2573 if (ld->ld_sop) { 2737 if (conf->len) {
2574 WRITEMEM(&ld->ld_clientid, 8); 2738 WRITEMEM(&ld->ld_clientid, 8);
2575 WRITE32(ld->ld_sop->so_owner.len); 2739 WRITE32(conf->len);
2576 WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); 2740 WRITEMEM(conf->data, conf->len);
2577 kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); 2741 kfree(conf->data);
2578 } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ 2742 } else { /* non - nfsv4 lock in conflict, no clientid nor owner */
2579 WRITE64((u64)0); /* clientid */ 2743 WRITE64((u64)0); /* clientid */
2580 WRITE32(0); /* length of owner name */ 2744 WRITE32(0); /* length of owner name */
@@ -2592,7 +2756,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
2592 else if (nfserr == nfserr_denied) 2756 else if (nfserr == nfserr_denied)
2593 nfsd4_encode_lock_denied(resp, &lock->lk_denied); 2757 nfsd4_encode_lock_denied(resp, &lock->lk_denied);
2594 2758
2595 ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); 2759 encode_seqid_op_tail(resp, save, nfserr);
2596 return nfserr; 2760 return nfserr;
2597} 2761}
2598 2762
@@ -2612,7 +2776,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
2612 if (!nfserr) 2776 if (!nfserr)
2613 nfsd4_encode_stateid(resp, &locku->lu_stateid); 2777 nfsd4_encode_stateid(resp, &locku->lu_stateid);
2614 2778
2615 ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); 2779 encode_seqid_op_tail(resp, save, nfserr);
2616 return nfserr; 2780 return nfserr;
2617} 2781}
2618 2782
@@ -2693,7 +2857,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2693 } 2857 }
2694 /* XXX save filehandle here */ 2858 /* XXX save filehandle here */
2695out: 2859out:
2696 ENCODE_SEQID_OP_TAIL(open->op_stateowner); 2860 encode_seqid_op_tail(resp, save, nfserr);
2697 return nfserr; 2861 return nfserr;
2698} 2862}
2699 2863
@@ -2705,7 +2869,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
2705 if (!nfserr) 2869 if (!nfserr)
2706 nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); 2870 nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
2707 2871
2708 ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); 2872 encode_seqid_op_tail(resp, save, nfserr);
2709 return nfserr; 2873 return nfserr;
2710} 2874}
2711 2875
@@ -2717,7 +2881,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
2717 if (!nfserr) 2881 if (!nfserr)
2718 nfsd4_encode_stateid(resp, &od->od_stateid); 2882 nfsd4_encode_stateid(resp, &od->od_stateid);
2719 2883
2720 ENCODE_SEQID_OP_TAIL(od->od_stateowner); 2884 encode_seqid_op_tail(resp, save, nfserr);
2721 return nfserr; 2885 return nfserr;
2722} 2886}
2723 2887
@@ -2759,8 +2923,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2759 read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, 2923 read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
2760 &maxcount); 2924 &maxcount);
2761 2925
2762 if (nfserr == nfserr_symlink)
2763 nfserr = nfserr_inval;
2764 if (nfserr) 2926 if (nfserr)
2765 return nfserr; 2927 return nfserr;
2766 eof = (read->rd_offset + maxcount >= 2928 eof = (read->rd_offset + maxcount >=
@@ -2886,8 +3048,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
2886 readdir->common.err == nfserr_toosmall && 3048 readdir->common.err == nfserr_toosmall &&
2887 readdir->buffer == page) 3049 readdir->buffer == page)
2888 nfserr = nfserr_toosmall; 3050 nfserr = nfserr_toosmall;
2889 if (nfserr == nfserr_symlink)
2890 nfserr = nfserr_notdir;
2891 if (nfserr) 3051 if (nfserr)
2892 goto err_no_verf; 3052 goto err_no_verf;
2893 3053
@@ -3218,9 +3378,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3218 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); 3378 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3219 WRITE32(seq->seqid); 3379 WRITE32(seq->seqid);
3220 WRITE32(seq->slotid); 3380 WRITE32(seq->slotid);
3221 WRITE32(seq->maxslots); 3381 /* Note slotids are numbered from zero: */
3222 /* For now: target_maxslots = maxslots */ 3382 WRITE32(seq->maxslots - 1); /* sr_highest_slotid */
3223 WRITE32(seq->maxslots); 3383 WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */
3224 WRITE32(seq->status_flags); 3384 WRITE32(seq->status_flags);
3225 3385
3226 ADJUST_ARGS(); 3386 ADJUST_ARGS();
@@ -3233,6 +3393,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
3233 struct nfsd4_test_stateid *test_stateid) 3393 struct nfsd4_test_stateid *test_stateid)
3234{ 3394{
3235 struct nfsd4_compoundargs *argp; 3395 struct nfsd4_compoundargs *argp;
3396 struct nfs4_client *cl = resp->cstate.session->se_client;
3236 stateid_t si; 3397 stateid_t si;
3237 __be32 *p; 3398 __be32 *p;
3238 int i; 3399 int i;
@@ -3248,7 +3409,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
3248 nfs4_lock_state(); 3409 nfs4_lock_state();
3249 for (i = 0; i < test_stateid->ts_num_ids; i++) { 3410 for (i = 0; i < test_stateid->ts_num_ids; i++) {
3250 nfsd4_decode_stateid(argp, &si); 3411 nfsd4_decode_stateid(argp, &si);
3251 valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session); 3412 valid = nfs4_validate_stateid(cl, &si);
3252 RESERVE_SPACE(4); 3413 RESERVE_SPACE(4);
3253 *p++ = htonl(valid); 3414 *p++ = htonl(valid);
3254 resp->p = p; 3415 resp->p = p;
@@ -3334,34 +3495,29 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3334 3495
3335/* 3496/*
3336 * Calculate the total amount of memory that the compound response has taken 3497 * Calculate the total amount of memory that the compound response has taken
 3337 * after encoding the current operation. 3498 * after encoding the current operation, including pad.
 3338 * 3499 *
 3339 * pad: add on 8 bytes for the next operation's op_code and status so that 3500 * pad: if the operation is non-idempotent, pad was calculated by the
 3340 * there is room to cache a failure on the next operation. 3501 * op_rsize_bop() given in its nfsd4_operation entry; otherwise pad is zero.
3341 * 3502 *
3342 * Compare this length to the session se_fmaxresp_cached. 3503 * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached.
3343 * 3504 *
3344 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so 3505 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3345 * will be at least a page and will therefore hold the xdr_buf head. 3506 * will be at least a page and will therefore hold the xdr_buf head.
3346 */ 3507 */
3347static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) 3508int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
3348{ 3509{
3349 int status = 0;
3350 struct xdr_buf *xb = &resp->rqstp->rq_res; 3510 struct xdr_buf *xb = &resp->rqstp->rq_res;
3351 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3352 struct nfsd4_session *session = NULL; 3511 struct nfsd4_session *session = NULL;
3353 struct nfsd4_slot *slot = resp->cstate.slot; 3512 struct nfsd4_slot *slot = resp->cstate.slot;
3354 u32 length, tlen = 0, pad = 8; 3513 u32 length, tlen = 0;
3355 3514
3356 if (!nfsd4_has_session(&resp->cstate)) 3515 if (!nfsd4_has_session(&resp->cstate))
3357 return status; 3516 return 0;
3358 3517
3359 session = resp->cstate.session; 3518 session = resp->cstate.session;
3360 if (session == NULL || slot->sl_cachethis == 0) 3519 if (session == NULL)
3361 return status; 3520 return 0;
3362
3363 if (resp->opcnt >= args->opcnt)
3364 pad = 0; /* this is the last operation */
3365 3521
3366 if (xb->page_len == 0) { 3522 if (xb->page_len == 0) {
3367 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; 3523 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
@@ -3374,10 +3530,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3374 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, 3530 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3375 length, xb->page_len, tlen, pad); 3531 length, xb->page_len, tlen, pad);
3376 3532
3377 if (length <= session->se_fchannel.maxresp_cached) 3533 if (length > session->se_fchannel.maxresp_sz)
3378 return status; 3534 return nfserr_rep_too_big;
3379 else 3535
3536 if (slot->sl_cachethis == 1 &&
3537 length > session->se_fchannel.maxresp_cached)
3380 return nfserr_rep_too_big_to_cache; 3538 return nfserr_rep_too_big_to_cache;
3539
3540 return 0;
3381} 3541}
3382 3542
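[editor's note] The reworked check enforces two independent limits: every sessions reply must fit the channel's maxresp_sz, and a reply the client asked to be cached must additionally fit maxresp_cached. A sketch with stand-in error codes for nfserr_rep_too_big and nfserr_rep_too_big_to_cache:

    #include <stdio.h>

    enum { OK = 0, REP_TOO_BIG, REP_TOO_BIG_TO_CACHE };

    static int check_resp_size(unsigned int length, unsigned int maxresp_sz,
                               unsigned int maxresp_cached, int cachethis)
    {
            if (length > maxresp_sz)
                    return REP_TOO_BIG;
            if (cachethis && length > maxresp_cached)
                    return REP_TOO_BIG_TO_CACHE;
            return OK;
    }

    int main(void)
    {
            printf("%d\n", check_resp_size(900, 1024, 512, 0));   /* 0 */
            printf("%d\n", check_resp_size(900, 1024, 512, 1));   /* 2 */
            printf("%d\n", check_resp_size(2048, 1024, 512, 0));  /* 1 */
            return 0;
    }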
3383void 3543void
@@ -3397,8 +3557,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3397 !nfsd4_enc_ops[op->opnum]); 3557 !nfsd4_enc_ops[op->opnum]);
3398 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3558 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3399 /* nfsd4_check_drc_limit guarantees enough room for error status */ 3559 /* nfsd4_check_drc_limit guarantees enough room for error status */
3400 if (!op->status && nfsd4_check_drc_limit(resp)) 3560 if (!op->status)
3401 op->status = nfserr_rep_too_big_to_cache; 3561 op->status = nfsd4_check_resp_size(resp, 0);
3402status: 3562status:
3403 /* 3563 /*
3404 * Note: We write the status directly, instead of using WRITE32(), 3564 * Note: We write the status directly, instead of using WRITE32(),
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c7716143cbd1..db34a585e112 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -9,7 +9,6 @@
9#include <linux/ctype.h> 9#include <linux/ctype.h>
10 10
11#include <linux/sunrpc/svcsock.h> 11#include <linux/sunrpc/svcsock.h>
12#include <linux/nfsd/syscall.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/sunrpc/clnt.h> 13#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/gss_api.h> 14#include <linux/sunrpc/gss_api.h>
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 7ecfa2420307..58134a23fdfb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -11,13 +11,39 @@
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/mount.h> 12#include <linux/mount.h>
13 13
14#include <linux/nfs.h>
15#include <linux/nfs2.h>
16#include <linux/nfs3.h>
17#include <linux/nfs4.h>
18#include <linux/sunrpc/msg_prot.h>
19
14#include <linux/nfsd/debug.h> 20#include <linux/nfsd/debug.h>
15#include <linux/nfsd/export.h> 21#include <linux/nfsd/export.h>
16#include <linux/nfsd/stats.h> 22#include <linux/nfsd/stats.h>
23
17/* 24/*
18 * nfsd version 25 * nfsd version
19 */ 26 */
20#define NFSD_SUPPORTED_MINOR_VERSION 1 27#define NFSD_SUPPORTED_MINOR_VERSION 1
28/*
29 * Maximum blocksizes supported by daemon under various circumstances.
30 */
31#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD
32/* NFSv2 is limited by the protocol specification, see RFC 1094 */
33#define NFSSVC_MAXBLKSIZE_V2 (8*1024)
34
35
36/*
37 * Largest number of bytes we need to allocate for an NFS
38 * call or reply. Used to control buffer sizes. We use
39 * the length of v3 WRITE, READDIR and READDIR replies
40 * which are an RPC header, up to 26 XDR units of reply
41 * data, and some page data.
42 *
43 * Note that accuracy here doesn't matter too much as the
44 * size is rounded up to a page size when allocating space.
45 */
46#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE)
21 47
22struct readdir_cd { 48struct readdir_cd {
23 __be32 err; /* 0, nfserr, or nfserr_eof */ 49 __be32 err; /* 0, nfserr, or nfserr_eof */
@@ -335,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
335#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ 361#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
336 NFSD_WRITEABLE_ATTRS_WORD2 362 NFSD_WRITEABLE_ATTRS_WORD2
337 363
364extern int nfsd4_is_junction(struct dentry *dentry);
365#else
366static inline int nfsd4_is_junction(struct dentry *dentry)
367{
368 return 0;
369}
370
338#endif /* CONFIG_NFSD_V4 */ 371#endif /* CONFIG_NFSD_V4 */
339 372
340#endif /* LINUX_NFSD_NFSD_H */ 373#endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 90c6aa6d5e0f..c763de5c1157 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,28 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
59 * the write call). 59 * the write call).
60 */ 60 */
61static inline __be32 61static inline __be32
62nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) 62nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested)
63{ 63{
64 /* Type can be negative when creating hardlinks - not to a dir */ 64 mode &= S_IFMT;
65 if (type > 0 && (mode & S_IFMT) != type) { 65
66 if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) 66 if (requested == 0) /* the caller doesn't care */
67 return nfserr_symlink; 67 return nfs_ok;
68 else if (type == S_IFDIR) 68 if (mode == requested)
69 return nfserr_notdir; 69 return nfs_ok;
70 else if ((mode & S_IFMT) == S_IFDIR) 70 /*
71 return nfserr_isdir; 71 * v4 has an error more specific than err_notdir which we should
72 else 72 * return in preference to err_notdir:
73 return nfserr_inval; 73 */
74 } 74 if (rqstp->rq_vers == 4 && mode == S_IFLNK)
75 if (type < 0 && (mode & S_IFMT) == -type) { 75 return nfserr_symlink;
76 if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) 76 if (requested == S_IFDIR)
77 return nfserr_symlink; 77 return nfserr_notdir;
78 else if (type == -S_IFDIR) 78 if (mode == S_IFDIR)
79 return nfserr_isdir; 79 return nfserr_isdir;
80 else 80 return nfserr_inval;
81 return nfserr_notdir;
82 }
83 return 0;
84} 81}
85 82
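[editor's note] The simplified nfsd_mode_check() drops the old negative-type convention and encodes the error preference directly (v4's nfserr_symlink before notdir, as the new comment says). A runnable restatement with errno-style stand-ins for the nfserr_* codes:

    #include <stdio.h>
    #include <sys/stat.h>

    enum { NFS_OK = 0, ERR_SYMLINK, ERR_NOTDIR, ERR_ISDIR, ERR_INVAL };

    static int mode_check(int vers, mode_t mode, mode_t requested)
    {
            mode &= S_IFMT;
            if (requested == 0)              /* the caller doesn't care */
                    return NFS_OK;
            if (mode == requested)
                    return NFS_OK;
            if (vers == 4 && mode == S_IFLNK)
                    return ERR_SYMLINK;      /* preferred over notdir for v4 */
            if (requested == S_IFDIR)
                    return ERR_NOTDIR;
            if (mode == S_IFDIR)
                    return ERR_ISDIR;
            return ERR_INVAL;
    }

    int main(void)
    {
            printf("%d\n", mode_check(4, S_IFLNK | 0777, S_IFREG)); /* 1 */
            printf("%d\n", mode_check(3, S_IFREG | 0644, S_IFDIR)); /* 2 */
            return 0;
    }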
86static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, 83static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4eefaf1b42e8..a3cf38476a1b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
35#ifndef _NFSD4_STATE_H 35#ifndef _NFSD4_STATE_H
36#define _NFSD4_STATE_H 36#define _NFSD4_STATE_H
37 37
38#include <linux/idr.h>
38#include <linux/sunrpc/svc_xprt.h> 39#include <linux/sunrpc/svc_xprt.h>
39#include <linux/nfsd/nfsfh.h> 40#include <linux/nfsd/nfsfh.h>
40#include "nfsfh.h" 41#include "nfsfh.h"
@@ -45,24 +46,20 @@ typedef struct {
45} clientid_t; 46} clientid_t;
46 47
47typedef struct { 48typedef struct {
48 u32 so_boot; 49 clientid_t so_clid;
49 u32 so_stateownerid; 50 u32 so_id;
50 u32 so_fileid;
51} stateid_opaque_t; 51} stateid_opaque_t;
52 52
53typedef struct { 53typedef struct {
54 u32 si_generation; 54 u32 si_generation;
55 stateid_opaque_t si_opaque; 55 stateid_opaque_t si_opaque;
56} stateid_t; 56} stateid_t;
57#define si_boot si_opaque.so_boot
58#define si_stateownerid si_opaque.so_stateownerid
59#define si_fileid si_opaque.so_fileid
60 57
61#define STATEID_FMT "(%08x/%08x/%08x/%08x)" 58#define STATEID_FMT "(%08x/%08x/%08x/%08x)"
62#define STATEID_VAL(s) \ 59#define STATEID_VAL(s) \
63 (s)->si_boot, \ 60 (s)->si_opaque.so_clid.cl_boot, \
64 (s)->si_stateownerid, \ 61 (s)->si_opaque.so_clid.cl_id, \
65 (s)->si_fileid, \ 62 (s)->si_opaque.so_id, \
66 (s)->si_generation 63 (s)->si_generation
67 64
68struct nfsd4_callback { 65struct nfsd4_callback {
@@ -76,17 +73,27 @@ struct nfsd4_callback {
76 bool cb_done; 73 bool cb_done;
77}; 74};
78 75
76struct nfs4_stid {
77#define NFS4_OPEN_STID 1
78#define NFS4_LOCK_STID 2
79#define NFS4_DELEG_STID 4
80/* For an open stateid kept around *only* to process close replays: */
81#define NFS4_CLOSED_STID 8
82 unsigned char sc_type;
83 stateid_t sc_stateid;
84 struct nfs4_client *sc_client;
85};
86
79struct nfs4_delegation { 87struct nfs4_delegation {
88 struct nfs4_stid dl_stid; /* must be first field */
80 struct list_head dl_perfile; 89 struct list_head dl_perfile;
81 struct list_head dl_perclnt; 90 struct list_head dl_perclnt;
82 struct list_head dl_recall_lru; /* delegation recalled */ 91 struct list_head dl_recall_lru; /* delegation recalled */
83 atomic_t dl_count; /* ref count */ 92 atomic_t dl_count; /* ref count */
84 struct nfs4_client *dl_client;
85 struct nfs4_file *dl_file; 93 struct nfs4_file *dl_file;
86 u32 dl_type; 94 u32 dl_type;
87 time_t dl_time; 95 time_t dl_time;
88/* For recall: */ 96/* For recall: */
89 stateid_t dl_stateid;
90 struct knfsd_fh dl_fh; 97 struct knfsd_fh dl_fh;
91 int dl_retries; 98 int dl_retries;
92 struct nfsd4_callback dl_recall; 99 struct nfsd4_callback dl_recall;
@@ -104,6 +111,11 @@ struct nfs4_cb_conn {
104 struct svc_xprt *cb_xprt; /* minorversion 1 only */ 111 struct svc_xprt *cb_xprt; /* minorversion 1 only */
105}; 112};
106 113
114static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
115{
116 return container_of(s, struct nfs4_delegation, dl_stid);
117}
118
107/* Maximum number of slots per session. 160 is useful for long haul TCP */ 119/* Maximum number of slots per session. 160 is useful for long haul TCP */
108#define NFSD_MAX_SLOTS_PER_SESSION 160 120#define NFSD_MAX_SLOTS_PER_SESSION 160
109/* Maximum number of operations per session compound */ 121/* Maximum number of operations per session compound */
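
The new nfs4_stid base struct relies on the embedded-first-member pattern: generic code passes struct nfs4_stid pointers around, and type-specific code downcasts with container_of(), as delegstateid() does above. A self-contained user-space sketch of the pattern (container_of is re-derived from offsetof here, since this is not kernel code):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct nfs4_stid { unsigned char sc_type; };

struct nfs4_delegation {
        struct nfs4_stid dl_stid;       /* must be first field */
        int dl_retries;
};

static struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
{
        return container_of(s, struct nfs4_delegation, dl_stid);
}

int main(void)
{
        struct nfs4_delegation d = { { 4 }, 3 };
        struct nfs4_stid *base = &d.dl_stid;    /* upcast: generic handle */

        printf("%d\n", delegstateid(base)->dl_retries);  /* prints 3 */
        return 0;
}
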
@@ -220,6 +232,7 @@ struct nfs4_client {
220 struct list_head cl_idhash; /* hash by cl_clientid.id */ 232 struct list_head cl_idhash; /* hash by cl_clientid.id */
221 struct list_head cl_strhash; /* hash by cl_name */ 233 struct list_head cl_strhash; /* hash by cl_name */
222 struct list_head cl_openowners; 234 struct list_head cl_openowners;
235 struct idr cl_stateids; /* stateid lookup */
223 struct list_head cl_delegations; 236 struct list_head cl_delegations;
224 struct list_head cl_lru; /* tail queue */ 237 struct list_head cl_lru; /* tail queue */
225 struct xdr_netobj cl_name; /* id generated by client */ 238 struct xdr_netobj cl_name; /* id generated by client */
@@ -245,6 +258,7 @@ struct nfs4_client {
245#define NFSD4_CB_UP 0 258#define NFSD4_CB_UP 0
246#define NFSD4_CB_UNKNOWN 1 259#define NFSD4_CB_UNKNOWN 1
247#define NFSD4_CB_DOWN 2 260#define NFSD4_CB_DOWN 2
261#define NFSD4_CB_FAULT 3
248 int cl_cb_state; 262 int cl_cb_state;
249 struct nfsd4_callback cl_cb_null; 263 struct nfsd4_callback cl_cb_null;
250 struct nfsd4_session *cl_cb_session; 264 struct nfsd4_session *cl_cb_session;
@@ -293,6 +307,9 @@ static inline void
293update_stateid(stateid_t *stateid) 307update_stateid(stateid_t *stateid)
294{ 308{
295 stateid->si_generation++; 309 stateid->si_generation++;
310 /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
311 if (stateid->si_generation == 0)
312 stateid->si_generation = 1;
296} 313}
297 314
298/* A reasonable value for REPLAY_ISIZE was estimated as follows: 315/* A reasonable value for REPLAY_ISIZE was estimated as follows:
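
The added wraparound guard exists because generation 0 is reserved in NFSv4 stateids (it means "use the current stateid"), so after 2^32 increments the counter must skip it. The rule in isolation:

#include <stdint.h>

static void update_generation(uint32_t *gen)
{
        (*gen)++;
        /* Wraparound recommendation from 3530bis-13 9.1.3.2: 0 is reserved */
        if (*gen == 0)
                *gen = 1;
}
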
@@ -312,49 +329,57 @@ struct nfs4_replay {
312 __be32 rp_status; 329 __be32 rp_status;
313 unsigned int rp_buflen; 330 unsigned int rp_buflen;
314 char *rp_buf; 331 char *rp_buf;
315	unsigned int		rp_allocated;
316 struct knfsd_fh rp_openfh; 332 struct knfsd_fh rp_openfh;
317 char rp_ibuf[NFSD4_REPLAY_ISIZE]; 333 char rp_ibuf[NFSD4_REPLAY_ISIZE];
318}; 334};
319 335
320/*
321* nfs4_stateowner can either be an open_owner, or a lock_owner
322*
323* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[]
324* for lock_owner
325* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[]
326* for lock_owner
327* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client
328* struct is reaped.
329* so_perfilestate: heads the list of nfs4_stateid (either open or lock)
330* and is used to ensure no dangling nfs4_stateid references when we
331* release a stateowner.
332* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when
333* close is called to reap associated byte-range locks
334* so_close_lru: (open) stateowner is placed on this list instead of being
335* reaped (when so_perfilestate is empty) to hold the last close replay.
336* reaped by laundromat thread after lease period.
337*/
338struct nfs4_stateowner { 336struct nfs4_stateowner {
339 struct kref so_ref;
340 struct list_head so_idhash; /* hash by so_id */
341 struct list_head so_strhash; /* hash by op_name */ 337 struct list_head so_strhash; /* hash by op_name */
342 struct list_head so_perclient;
343 struct list_head so_stateids; 338 struct list_head so_stateids;
344 struct list_head so_perstateid; /* for lockowners only */
345 struct list_head so_close_lru; /* tail queue */
346 time_t so_time; /* time of placement on so_close_lru */
347 int so_is_open_owner; /* 1=openowner,0=lockowner */
348 u32 so_id;
349 struct nfs4_client * so_client; 339 struct nfs4_client * so_client;
350 /* after increment in ENCODE_SEQID_OP_TAIL, represents the next 340 /* after increment in ENCODE_SEQID_OP_TAIL, represents the next
351 * sequence id expected from the client: */ 341 * sequence id expected from the client: */
352 u32 so_seqid; 342 u32 so_seqid;
353 struct xdr_netobj so_owner; /* open owner name */ 343 struct xdr_netobj so_owner; /* open owner name */
354 int so_confirmed; /* successful OPEN_CONFIRM? */
355 struct nfs4_replay so_replay; 344 struct nfs4_replay so_replay;
345 bool so_is_open_owner;
356}; 346};
357 347
348struct nfs4_openowner {
349 struct nfs4_stateowner oo_owner; /* must be first field */
350 struct list_head oo_perclient;
351 /*
352 * We keep around openowners a little while after last close,
353 * which saves clients from having to confirm, and allows us to
354 * handle close replays if they come soon enough. The close_lru
355 * is a list of such openowners, to be reaped by the laundromat
356 * thread eventually if they remain unused:
357 */
358 struct list_head oo_close_lru;
359 struct nfs4_ol_stateid *oo_last_closed_stid;
360 time_t oo_time; /* time of placement on so_close_lru */
361#define NFS4_OO_CONFIRMED 1
362#define NFS4_OO_PURGE_CLOSE 2
363#define NFS4_OO_NEW 4
364 unsigned char oo_flags;
365};
366
367struct nfs4_lockowner {
368 struct nfs4_stateowner lo_owner; /* must be first element */
369 struct list_head lo_perstateid; /* for lockowners only */
370 struct list_head lo_list; /* for temporary uses */
371};
372
373static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
374{
375 return container_of(so, struct nfs4_openowner, oo_owner);
376}
377
378static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
379{
380 return container_of(so, struct nfs4_lockowner, lo_owner);
381}
382
358/* 383/*
359* nfs4_file: a file opened by some number of (open) nfs4_stateowners. 384* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
360* o fi_perfile list is used to search for conflicting 385* o fi_perfile list is used to search for conflicting
@@ -368,17 +393,17 @@ struct nfs4_file {
368 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ 393 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
369 struct file * fi_fds[3]; 394 struct file * fi_fds[3];
370 /* 395 /*
371 * Each open or lock stateid contributes 1 to either 396 * Each open or lock stateid contributes 0-4 to the counts
372 * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending 397 * below depending on which bits are set in st_access_bitmap:
373	 * on open or lock mode:				398	 * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCESS_READ is set
399 * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set
400 * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
374 */ 401 */
375 atomic_t fi_access[2]; 402 atomic_t fi_access[2];
376 struct file *fi_deleg_file; 403 struct file *fi_deleg_file;
377 struct file_lock *fi_lease; 404 struct file_lock *fi_lease;
378 atomic_t fi_delegees; 405 atomic_t fi_delegees;
379 struct inode *fi_inode; 406 struct inode *fi_inode;
380 u32 fi_id; /* used with stateowner->so_id
381 * for stateid_hashtbl hash */
382 bool fi_had_conflict; 407 bool fi_had_conflict;
383}; 408};
384 409
@@ -408,50 +433,27 @@ static inline struct file *find_any_file(struct nfs4_file *f)
408 return f->fi_fds[O_RDONLY]; 433 return f->fi_fds[O_RDONLY];
409} 434}
410 435
411/* 436/* "ol" stands for "Open or Lock". Better suggestions welcome. */
412* nfs4_stateid can either be an open stateid or (eventually) a lock stateid 437struct nfs4_ol_stateid {
413* 438 struct nfs4_stid st_stid; /* must be first field */
414* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file
415*
416* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry
417* st_perfile: file_hashtbl[] entry.
418* st_perfile_state: nfs4_stateowner->so_perfilestate
419* st_perlockowner: (open stateid) list of lock nfs4_stateowners
420* st_access_bmap: used only for open stateid
421* st_deny_bmap: used only for open stateid
422* st_openstp: open stateid lock stateid was derived from
423*
424* XXX: open stateids and lock stateids have diverged sufficiently that
425* we should consider defining separate structs for the two cases.
426*/
427
428struct nfs4_stateid {
429 struct list_head st_hash;
430 struct list_head st_perfile; 439 struct list_head st_perfile;
431 struct list_head st_perstateowner; 440 struct list_head st_perstateowner;
432 struct list_head st_lockowners; 441 struct list_head st_lockowners;
433 struct nfs4_stateowner * st_stateowner; 442 struct nfs4_stateowner * st_stateowner;
434 struct nfs4_file * st_file; 443 struct nfs4_file * st_file;
435 stateid_t st_stateid;
436 unsigned long st_access_bmap; 444 unsigned long st_access_bmap;
437 unsigned long st_deny_bmap; 445 unsigned long st_deny_bmap;
438 struct nfs4_stateid * st_openstp; 446 struct nfs4_ol_stateid * st_openstp;
439}; 447};
440 448
449static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
450{
451 return container_of(s, struct nfs4_ol_stateid, st_stid);
452}
453
441/* flags for preprocess_seqid_op() */ 454/* flags for preprocess_seqid_op() */
442#define HAS_SESSION 0x00000001
443#define CONFIRM 0x00000002
444#define OPEN_STATE 0x00000004
445#define LOCK_STATE 0x00000008
446#define RD_STATE 0x00000010 455#define RD_STATE 0x00000010
447#define WR_STATE 0x00000020 456#define WR_STATE 0x00000020
448#define CLOSE_STATE 0x00000040
449
450#define seqid_mutating_err(err) \
451 (((err) != nfserr_stale_clientid) && \
452 ((err) != nfserr_bad_seqid) && \
453 ((err) != nfserr_stale_stateid) && \
454 ((err) != nfserr_bad_stateid))
455 457
456struct nfsd4_compound_state; 458struct nfsd4_compound_state;
457 459
@@ -461,7 +463,8 @@ extern void nfs4_lock_state(void);
461extern void nfs4_unlock_state(void); 463extern void nfs4_unlock_state(void);
462extern int nfs4_in_grace(void); 464extern int nfs4_in_grace(void);
463extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 465extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
464extern void nfs4_free_stateowner(struct kref *kref); 466extern void nfs4_free_openowner(struct nfs4_openowner *);
467extern void nfs4_free_lockowner(struct nfs4_lockowner *);
465extern int set_callback_cred(void); 468extern int set_callback_cred(void);
466extern void nfsd4_probe_callback(struct nfs4_client *clp); 469extern void nfsd4_probe_callback(struct nfs4_client *clp);
467extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); 470extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
@@ -473,7 +476,7 @@ extern void nfsd4_destroy_callback_queue(void);
473extern void nfsd4_shutdown_callback(struct nfs4_client *); 476extern void nfsd4_shutdown_callback(struct nfs4_client *);
474extern void nfs4_put_delegation(struct nfs4_delegation *dp); 477extern void nfs4_put_delegation(struct nfs4_delegation *dp);
475extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 478extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
476extern void nfsd4_init_recdir(char *recdir_name); 479extern void nfsd4_init_recdir(void);
477extern int nfsd4_recdir_load(void); 480extern int nfsd4_recdir_load(void);
478extern void nfsd4_shutdown_recdir(void); 481extern void nfsd4_shutdown_recdir(void);
479extern int nfs4_client_to_reclaim(const char *name); 482extern int nfs4_client_to_reclaim(const char *name);
@@ -482,18 +485,7 @@ extern void nfsd4_recdir_purge_old(void);
482extern int nfsd4_create_clid_dir(struct nfs4_client *clp); 485extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
483extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); 486extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
484extern void release_session_client(struct nfsd4_session *); 487extern void release_session_client(struct nfsd4_session *);
485extern __be32 nfs4_validate_stateid(stateid_t *, int); 488extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
486 489extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
487static inline void
488nfs4_put_stateowner(struct nfs4_stateowner *so)
489{
490 kref_put(&so->so_ref, nfs4_free_stateowner);
491}
492
493static inline void
494nfs4_get_stateowner(struct nfs4_stateowner *so)
495{
496 kref_get(&so->so_ref);
497}
498 490
499#endif /* NFSD4_STATE_H */ 491#endif /* NFSD4_STATE_H */
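
The new cl_stateids field gives each client an integer-ID-to-stateid map backed by the kernel's idr (hence the added <linux/idr.h> include). A rough user-space analogue of the contract, using a fixed slot array where the real idr is a resizable radix tree (names are illustrative):

#include <stddef.h>

#define MAX_IDS 256     /* the kernel idr grows on demand; fixed here */

struct idmap { void *slot[MAX_IDS]; };

static int id_alloc(struct idmap *m, void *ptr)  /* new id, or -1 if full */
{
        int i;

        for (i = 0; i < MAX_IDS; i++) {
                if (!m->slot[i]) {
                        m->slot[i] = ptr;
                        return i;
                }
        }
        return -1;
}

static void *id_find(struct idmap *m, int id)    /* O(1) lookup by id */
{
        return (id >= 0 && id < MAX_IDS) ? m->slot[id] : NULL;
}
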
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fd0acca5370a..7a2e442623c8 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
168{ 168{
169 if (d_mountpoint(dentry)) 169 if (d_mountpoint(dentry))
170 return 1; 170 return 1;
171 if (nfsd4_is_junction(dentry))
172 return 1;
171 if (!(exp->ex_flags & NFSEXP_V4ROOT)) 173 if (!(exp->ex_flags & NFSEXP_V4ROOT))
172 return 0; 174 return 0;
173 return dentry->d_inode != NULL; 175 return dentry->d_inode != NULL;
@@ -502,7 +504,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
502 unsigned int flags = 0; 504 unsigned int flags = 0;
503 505
504 /* Get inode */ 506 /* Get inode */
505 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); 507 error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
506 if (error) 508 if (error)
507 return error; 509 return error;
508 510
@@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
592 return error; 594 return error;
593} 595}
594 596
597#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction."
598#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type"
599int nfsd4_is_junction(struct dentry *dentry)
600{
601 struct inode *inode = dentry->d_inode;
602
603 if (inode == NULL)
604 return 0;
605 if (inode->i_mode & S_IXUGO)
606 return 0;
607 if (!(inode->i_mode & S_ISVTX))
608 return 0;
609 if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0)
610 return 0;
611 return 1;
612}
595#endif /* defined(CONFIG_NFSD_V4) */ 613#endif /* defined(CONFIG_NFSD_V4) */
596 614
597#ifdef CONFIG_NFSD_V3 615#ifdef CONFIG_NFSD_V3
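
nfsd4_is_junction() above treats a file as a junction when it has no execute bits, has the sticky bit set, and carries a trusted.junction.type xattr. A user-space restatement using getxattr(2) (note that reading trusted.* xattrs normally requires privilege):

#include <sys/stat.h>
#include <sys/xattr.h>

static int is_junction(const char *path)
{
        struct stat st;

        if (stat(path, &st) != 0)
                return 0;
        if (st.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) /* S_IXUGO */
                return 0;
        if (!(st.st_mode & S_ISVTX))    /* sticky bit is required */
                return 0;
        /* size-only query: > 0 means the junction type xattr is present */
        return getxattr(path, "trusted.junction.type", NULL, 0) > 0;
}
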
@@ -1352,7 +1370,7 @@ __be32
1352do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, 1370do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1353 char *fname, int flen, struct iattr *iap, 1371 char *fname, int flen, struct iattr *iap,
1354 struct svc_fh *resfhp, int createmode, u32 *verifier, 1372 struct svc_fh *resfhp, int createmode, u32 *verifier,
1355 int *truncp, int *created) 1373 bool *truncp, bool *created)
1356{ 1374{
1357 struct dentry *dentry, *dchild = NULL; 1375 struct dentry *dentry, *dchild = NULL;
1358 struct inode *dirp; 1376 struct inode *dirp;
@@ -1632,10 +1650,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1632 err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); 1650 err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
1633 if (err) 1651 if (err)
1634 goto out; 1652 goto out;
1635 err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); 1653 err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP);
1636 if (err) 1654 if (err)
1637 goto out; 1655 goto out;
1638 1656 err = nfserr_isdir;
1657 if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode))
1658 goto out;
1639 err = nfserr_perm; 1659 err = nfserr_perm;
1640 if (!len) 1660 if (!len)
1641 goto out; 1661 goto out;
@@ -2114,7 +2134,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2114 2134
2115 /* Allow read access to binaries even when mode 111 */ 2135 /* Allow read access to binaries even when mode 111 */
2116 if (err == -EACCES && S_ISREG(inode->i_mode) && 2136 if (err == -EACCES && S_ISREG(inode->i_mode) &&
2117 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) 2137 (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
2138 acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
2118 err = inode_permission(inode, MAY_EXEC); 2139 err = inode_permission(inode, MAY_EXEC);
2119 2140
2120 return err? nfserrno(err) : 0; 2141 return err? nfserrno(err) : 0;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index e0bbac04d1dd..3f54ad03bb2b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -10,21 +10,22 @@
10/* 10/*
11 * Flags for nfsd_permission 11 * Flags for nfsd_permission
12 */ 12 */
13#define NFSD_MAY_NOP 0 13#define NFSD_MAY_NOP 0
14#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ 14#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */
15#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ 15#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */
16#define NFSD_MAY_READ 4 /* == MAY_READ */ 16#define NFSD_MAY_READ 0x004 /* == MAY_READ */
17#define NFSD_MAY_SATTR 8 17#define NFSD_MAY_SATTR 0x008
18#define NFSD_MAY_TRUNC 16 18#define NFSD_MAY_TRUNC 0x010
19#define NFSD_MAY_LOCK 32 19#define NFSD_MAY_LOCK 0x020
20#define NFSD_MAY_MASK 63 20#define NFSD_MAY_MASK 0x03f
21 21
22/* extra hints to permission and open routines: */ 22/* extra hints to permission and open routines: */
23#define NFSD_MAY_OWNER_OVERRIDE 64 23#define NFSD_MAY_OWNER_OVERRIDE 0x040
24#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ 24#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */
25#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 25#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100
26#define NFSD_MAY_NOT_BREAK_LEASE 512 26#define NFSD_MAY_NOT_BREAK_LEASE 0x200
27#define NFSD_MAY_BYPASS_GSS 1024 27#define NFSD_MAY_BYPASS_GSS 0x400
28#define NFSD_MAY_READ_IF_EXEC 0x800
28 29
29#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) 30#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
30#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) 31#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
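
The switch to hex above makes it visible that each NFSD_MAY_* value is a single bit, so composites like NFSD_MAY_CREATE are plain ORs and membership tests are masks:

#define NFSD_MAY_EXEC   0x001
#define NFSD_MAY_WRITE  0x002
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC | NFSD_MAY_WRITE)

static int wants_create(unsigned int acc)
{
        /* every bit of the composite must be present */
        return (acc & NFSD_MAY_CREATE) == NFSD_MAY_CREATE;
}
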
@@ -61,7 +62,7 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
61__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, 62__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
62 char *name, int len, struct iattr *attrs, 63 char *name, int len, struct iattr *attrs,
63 struct svc_fh *res, int createmode, 64 struct svc_fh *res, int createmode,
64 u32 *verifier, int *truncp, int *created); 65 u32 *verifier, bool *truncp, bool *created);
65__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, 66__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
66 loff_t, unsigned long); 67 loff_t, unsigned long);
67#endif /* CONFIG_NFSD_V3 */ 68#endif /* CONFIG_NFSD_V3 */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d2a8d04428c7..2364747ee97d 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -81,7 +81,6 @@ struct nfsd4_access {
81struct nfsd4_close { 81struct nfsd4_close {
82 u32 cl_seqid; /* request */ 82 u32 cl_seqid; /* request */
83 stateid_t cl_stateid; /* request+response */ 83 stateid_t cl_stateid; /* request+response */
84 struct nfs4_stateowner * cl_stateowner; /* response */
85}; 84};
86 85
87struct nfsd4_commit { 86struct nfsd4_commit {
@@ -131,7 +130,7 @@ struct nfsd4_link {
131 130
132struct nfsd4_lock_denied { 131struct nfsd4_lock_denied {
133 clientid_t ld_clientid; 132 clientid_t ld_clientid;
134 struct nfs4_stateowner *ld_sop; 133 struct xdr_netobj ld_owner;
135 u64 ld_start; 134 u64 ld_start;
136 u64 ld_length; 135 u64 ld_length;
137 u32 ld_type; 136 u32 ld_type;
@@ -165,9 +164,6 @@ struct nfsd4_lock {
165 } ok; 164 } ok;
166 struct nfsd4_lock_denied denied; 165 struct nfsd4_lock_denied denied;
167 } u; 166 } u;
168 /* The lk_replay_owner is the open owner in the open_to_lock_owner
169 * case and the lock owner otherwise: */
170 struct nfs4_stateowner *lk_replay_owner;
171}; 167};
172#define lk_new_open_seqid v.new.open_seqid 168#define lk_new_open_seqid v.new.open_seqid
173#define lk_new_open_stateid v.new.open_stateid 169#define lk_new_open_stateid v.new.open_stateid
@@ -188,7 +184,6 @@ struct nfsd4_lockt {
188 struct xdr_netobj lt_owner; 184 struct xdr_netobj lt_owner;
189 u64 lt_offset; 185 u64 lt_offset;
190 u64 lt_length; 186 u64 lt_length;
191 struct nfs4_stateowner * lt_stateowner;
192 struct nfsd4_lock_denied lt_denied; 187 struct nfsd4_lock_denied lt_denied;
193}; 188};
194 189
@@ -199,7 +194,6 @@ struct nfsd4_locku {
199 stateid_t lu_stateid; 194 stateid_t lu_stateid;
200 u64 lu_offset; 195 u64 lu_offset;
201 u64 lu_length; 196 u64 lu_length;
202 struct nfs4_stateowner *lu_stateowner;
203}; 197};
204 198
205 199
@@ -232,8 +226,11 @@ struct nfsd4_open {
232 u32 op_recall; /* recall */ 226 u32 op_recall; /* recall */
233 struct nfsd4_change_info op_cinfo; /* response */ 227 struct nfsd4_change_info op_cinfo; /* response */
234 u32 op_rflags; /* response */ 228 u32 op_rflags; /* response */
235 int op_truncate; /* used during processing */ 229 bool op_truncate; /* used during processing */
236 struct nfs4_stateowner *op_stateowner; /* used during processing */ 230 bool op_created; /* used during processing */
231 struct nfs4_openowner *op_openowner; /* used during processing */
232 struct nfs4_file *op_file; /* used during processing */
233 struct nfs4_ol_stateid *op_stp; /* used during processing */
237 struct nfs4_acl *op_acl; 234 struct nfs4_acl *op_acl;
238}; 235};
239#define op_iattr iattr 236#define op_iattr iattr
@@ -243,7 +240,6 @@ struct nfsd4_open_confirm {
243 stateid_t oc_req_stateid /* request */; 240 stateid_t oc_req_stateid /* request */;
244 u32 oc_seqid /* request */; 241 u32 oc_seqid /* request */;
245 stateid_t oc_resp_stateid /* response */; 242 stateid_t oc_resp_stateid /* response */;
246 struct nfs4_stateowner * oc_stateowner; /* response */
247}; 243};
248 244
249struct nfsd4_open_downgrade { 245struct nfsd4_open_downgrade {
@@ -251,7 +247,6 @@ struct nfsd4_open_downgrade {
251 u32 od_seqid; 247 u32 od_seqid;
252 u32 od_share_access; 248 u32 od_share_access;
253 u32 od_share_deny; 249 u32 od_share_deny;
254 struct nfs4_stateowner *od_stateowner;
255}; 250};
256 251
257 252
@@ -325,8 +320,7 @@ struct nfsd4_setattr {
325 320
326struct nfsd4_setclientid { 321struct nfsd4_setclientid {
327 nfs4_verifier se_verf; /* request */ 322 nfs4_verifier se_verf; /* request */
328 u32 se_namelen; /* request */ 323 struct xdr_netobj se_name;
329 char * se_name; /* request */
330 u32 se_callback_prog; /* request */ 324 u32 se_callback_prog; /* request */
331 u32 se_callback_netid_len; /* request */ 325 u32 se_callback_netid_len; /* request */
332 char * se_callback_netid_val; /* request */ 326 char * se_callback_netid_val; /* request */
@@ -351,7 +345,6 @@ struct nfsd4_saved_compoundargs {
351 345
352struct nfsd4_test_stateid { 346struct nfsd4_test_stateid {
353 __be32 ts_num_ids; 347 __be32 ts_num_ids;
354 __be32 ts_has_session;
355 struct nfsd4_compoundargs *ts_saved_args; 348 struct nfsd4_compoundargs *ts_saved_args;
356 struct nfsd4_saved_compoundargs ts_savedp; 349 struct nfsd4_saved_compoundargs ts_savedp;
357}; 350};
@@ -405,6 +398,10 @@ struct nfsd4_destroy_session {
405 struct nfs4_sessionid sessionid; 398 struct nfs4_sessionid sessionid;
406}; 399};
407 400
401struct nfsd4_destroy_clientid {
402 clientid_t clientid;
403};
404
408struct nfsd4_reclaim_complete { 405struct nfsd4_reclaim_complete {
409 u32 rca_one_fs; 406 u32 rca_one_fs;
410}; 407};
@@ -532,6 +529,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
532 struct nfsd4_compoundargs *); 529 struct nfsd4_compoundargs *);
533int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, 530int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
534 struct nfsd4_compoundres *); 531 struct nfsd4_compoundres *);
532int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
535void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); 533void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
536void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); 534void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
537__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 535__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
@@ -558,11 +556,13 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
558extern __be32 nfsd4_destroy_session(struct svc_rqst *, 556extern __be32 nfsd4_destroy_session(struct svc_rqst *,
559 struct nfsd4_compound_state *, 557 struct nfsd4_compound_state *,
560 struct nfsd4_destroy_session *); 558 struct nfsd4_destroy_session *);
559extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
561__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); 560__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
562extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, 561extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
563 struct nfsd4_open *open); 562 struct nfsd4_open *open);
564extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, 563extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
565 struct svc_fh *current_fh, struct nfsd4_open *open); 564 struct svc_fh *current_fh, struct nfsd4_open *open);
565extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
566extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, 566extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
567 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); 567 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
568extern __be32 nfsd4_close(struct svc_rqst *rqstp, 568extern __be32 nfsd4_close(struct svc_rqst *rqstp,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 81ecf9c0bf0a..194fb22ef79d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7185,20 +7185,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
7185{ 7185{
7186 int ret = 0; 7186 int ret = 0;
7187 struct buffer_head *dir_bh = NULL; 7187 struct buffer_head *dir_bh = NULL;
7188 struct ocfs2_security_xattr_info si = {
7189 .enable = 1,
7190 };
7191 7188
7192 ret = ocfs2_init_security_get(inode, dir, qstr, &si); 7189 ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
7193 if (!ret) { 7190 if (!ret) {
7194 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
7195 si.name, si.value, si.value_len,
7196 XATTR_CREATE);
7197 if (ret) {
7198 mlog_errno(ret);
7199 goto leave;
7200 }
7201 } else if (ret != -EOPNOTSUPP) {
7202 mlog_errno(ret); 7191 mlog_errno(ret);
7203 goto leave; 7192 goto leave;
7204 } 7193 }
@@ -7255,6 +7244,22 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7255 name, value, size, flags); 7244 name, value, size, flags);
7256} 7245}
7257 7246
7247int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
7248 void *fs_info)
7249{
7250 const struct xattr *xattr;
7251 int err = 0;
7252
7253 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
7254 err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
7255 xattr->name, xattr->value,
7256 xattr->value_len, XATTR_CREATE);
7257 if (err)
7258 break;
7259 }
7260 return err;
7261}
7262
7258int ocfs2_init_security_get(struct inode *inode, 7263int ocfs2_init_security_get(struct inode *inode,
7259 struct inode *dir, 7264 struct inode *dir,
7260 const struct qstr *qstr, 7265 const struct qstr *qstr,
@@ -7263,8 +7268,13 @@ int ocfs2_init_security_get(struct inode *inode,
7263 /* check whether ocfs2 support feature xattr */ 7268 /* check whether ocfs2 support feature xattr */
7264 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) 7269 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
7265 return -EOPNOTSUPP; 7270 return -EOPNOTSUPP;
7266 return security_inode_init_security(inode, dir, qstr, &si->name, 7271 if (si)
7267 &si->value, &si->value_len); 7272 return security_old_inode_init_security(inode, dir, qstr,
7273 &si->name, &si->value,
7274 &si->value_len);
7275
7276 return security_inode_init_security(inode, dir, qstr,
7277 &ocfs2_initxattrs, NULL);
7268} 7278}
7269 7279
7270int ocfs2_init_security_set(handle_t *handle, 7280int ocfs2_init_security_set(handle_t *handle,
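
The new ocfs2_initxattrs() implements the callback contract of security_inode_init_security(): the LSM passes the filesystem a NULL-name-terminated array of xattrs to persist. A standalone sketch of that loop, with the on-disk setter stubbed out:

#include <stddef.h>
#include <stdio.h>

struct xattr {
        char *name;
        void *value;
        size_t value_len;
};

static int set_xattr_stub(const char *name, const void *value, size_t len)
{
        printf("set %s (%zu bytes)\n", name, len);  /* real code writes disk */
        return 0;
}

static int initxattrs(const struct xattr *xattr_array)
{
        const struct xattr *xattr;
        int err = 0;

        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
                err = set_xattr_stub(xattr->name, xattr->value,
                                     xattr->value_len);
                if (err)
                        break;  /* stop on the first failure */
        }
        return err;
}
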
diff --git a/fs/open.c b/fs/open.c
index f71192109457..22c41b543f2d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -685,6 +685,10 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
685 if (error) 685 if (error)
686 goto cleanup_all; 686 goto cleanup_all;
687 687
688 error = break_lease(inode, f->f_flags);
689 if (error)
690 goto cleanup_all;
691
688 if (!open && f->f_op) 692 if (!open && f->f_op)
689 open = f->f_op->open; 693 open = f->f_op->open;
690 if (open) { 694 if (open) {
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 10027b42b7e2..cea4623f1ed6 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -218,6 +218,8 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
218 const struct posix_acl_entry *pa, *pe, *mask_obj; 218 const struct posix_acl_entry *pa, *pe, *mask_obj;
219 int found = 0; 219 int found = 0;
220 220
221 want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
222
221 FOREACH_ACL_ENTRY(pa, acl, pe) { 223 FOREACH_ACL_ENTRY(pa, acl, pe) {
222 switch(pa->e_tag) { 224 switch(pa->e_tag) {
223 case ACL_USER_OBJ: 225 case ACL_USER_OBJ:
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 9758b654a1bc..42b274da92c3 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,6 +10,7 @@
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/irqnr.h> 11#include <linux/irqnr.h>
12#include <asm/cputime.h> 12#include <asm/cputime.h>
13#include <linux/tick.h>
13 14
14#ifndef arch_irq_stat_cpu 15#ifndef arch_irq_stat_cpu
15#define arch_irq_stat_cpu(cpu) 0 16#define arch_irq_stat_cpu(cpu) 0
@@ -21,6 +22,35 @@
21#define arch_idle_time(cpu) 0 22#define arch_idle_time(cpu) 0
22#endif 23#endif
23 24
25static cputime64_t get_idle_time(int cpu)
26{
27 u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
28 cputime64_t idle;
29
30 if (idle_time == -1ULL) {
31 /* !NO_HZ so we can rely on cpustat.idle */
32 idle = kstat_cpu(cpu).cpustat.idle;
33 idle = cputime64_add(idle, arch_idle_time(cpu));
34 } else
35 idle = usecs_to_cputime(idle_time);
36
37 return idle;
38}
39
40static cputime64_t get_iowait_time(int cpu)
41{
42 u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
43 cputime64_t iowait;
44
45 if (iowait_time == -1ULL)
46 /* !NO_HZ so we can rely on cpustat.iowait */
47 iowait = kstat_cpu(cpu).cpustat.iowait;
48 else
49 iowait = usecs_to_cputime(iowait_time);
50
51 return iowait;
52}
53
24static int show_stat(struct seq_file *p, void *v) 54static int show_stat(struct seq_file *p, void *v)
25{ 55{
26 int i, j; 56 int i, j;
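
Both helpers added above share one fallback shape: get_cpu_idle_time_us() and get_cpu_iowait_time_us() return -1ULL when NO_HZ accounting is unavailable, in which case the tick-based cpustat counter is used instead. The shape in isolation (the conversion assumes HZ=100, i.e. 10000 us per tick):

#include <stdint.h>

static uint64_t pick_idle_time(uint64_t nohz_idle_us, uint64_t tick_idle)
{
        if (nohz_idle_us == (uint64_t)-1)
                return tick_idle;        /* !NO_HZ: rely on cpustat.idle */
        return nohz_idle_us / 10000;     /* usecs_to_cputime() at HZ=100 */
}
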
@@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v)
42 user = cputime64_add(user, kstat_cpu(i).cpustat.user); 72 user = cputime64_add(user, kstat_cpu(i).cpustat.user);
43 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); 73 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
44 system = cputime64_add(system, kstat_cpu(i).cpustat.system); 74 system = cputime64_add(system, kstat_cpu(i).cpustat.system);
45 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); 75 idle = cputime64_add(idle, get_idle_time(i));
46 idle = cputime64_add(idle, arch_idle_time(i)); 76 iowait = cputime64_add(iowait, get_iowait_time(i));
47 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
48 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); 77 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
49 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 78 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
50 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 79 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
@@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v)
76 (unsigned long long)cputime64_to_clock_t(guest), 105 (unsigned long long)cputime64_to_clock_t(guest),
77 (unsigned long long)cputime64_to_clock_t(guest_nice)); 106 (unsigned long long)cputime64_to_clock_t(guest_nice));
78 for_each_online_cpu(i) { 107 for_each_online_cpu(i) {
79
80 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 108 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
81 user = kstat_cpu(i).cpustat.user; 109 user = kstat_cpu(i).cpustat.user;
82 nice = kstat_cpu(i).cpustat.nice; 110 nice = kstat_cpu(i).cpustat.nice;
83 system = kstat_cpu(i).cpustat.system; 111 system = kstat_cpu(i).cpustat.system;
84 idle = kstat_cpu(i).cpustat.idle; 112 idle = get_idle_time(i);
85 idle = cputime64_add(idle, arch_idle_time(i)); 113 iowait = get_iowait_time(i);
86 iowait = kstat_cpu(i).cpustat.iowait;
87 irq = kstat_cpu(i).cpustat.irq; 114 irq = kstat_cpu(i).cpustat.irq;
88 softirq = kstat_cpu(i).cpustat.softirq; 115 softirq = kstat_cpu(i).cpustat.softirq;
89 steal = kstat_cpu(i).cpustat.steal; 116 steal = kstat_cpu(i).cpustat.steal;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 25b6a887adb9..5afaa58a8630 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -877,30 +877,54 @@ struct numa_maps_private {
877 struct numa_maps md; 877 struct numa_maps md;
878}; 878};
879 879
880static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) 880static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
881 unsigned long nr_pages)
881{ 882{
882 int count = page_mapcount(page); 883 int count = page_mapcount(page);
883 884
884 md->pages++; 885 md->pages += nr_pages;
885 if (pte_dirty || PageDirty(page)) 886 if (pte_dirty || PageDirty(page))
886 md->dirty++; 887 md->dirty += nr_pages;
887 888
888 if (PageSwapCache(page)) 889 if (PageSwapCache(page))
889 md->swapcache++; 890 md->swapcache += nr_pages;
890 891
891 if (PageActive(page) || PageUnevictable(page)) 892 if (PageActive(page) || PageUnevictable(page))
892 md->active++; 893 md->active += nr_pages;
893 894
894 if (PageWriteback(page)) 895 if (PageWriteback(page))
895 md->writeback++; 896 md->writeback += nr_pages;
896 897
897 if (PageAnon(page)) 898 if (PageAnon(page))
898 md->anon++; 899 md->anon += nr_pages;
899 900
900 if (count > md->mapcount_max) 901 if (count > md->mapcount_max)
901 md->mapcount_max = count; 902 md->mapcount_max = count;
902 903
903 md->node[page_to_nid(page)]++; 904 md->node[page_to_nid(page)] += nr_pages;
905}
906
907static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
908 unsigned long addr)
909{
910 struct page *page;
911 int nid;
912
913 if (!pte_present(pte))
914 return NULL;
915
916 page = vm_normal_page(vma, addr, pte);
917 if (!page)
918 return NULL;
919
920 if (PageReserved(page))
921 return NULL;
922
923 nid = page_to_nid(page);
924 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
925 return NULL;
926
927 return page;
904} 928}
905 929
906static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 930static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
@@ -912,26 +936,32 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
912 pte_t *pte; 936 pte_t *pte;
913 937
914 md = walk->private; 938 md = walk->private;
915 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 939 spin_lock(&walk->mm->page_table_lock);
916 do { 940 if (pmd_trans_huge(*pmd)) {
917 struct page *page; 941 if (pmd_trans_splitting(*pmd)) {
918 int nid; 942 spin_unlock(&walk->mm->page_table_lock);
943 wait_split_huge_page(md->vma->anon_vma, pmd);
944 } else {
945 pte_t huge_pte = *(pte_t *)pmd;
946 struct page *page;
919 947
920 if (!pte_present(*pte)) 948 page = can_gather_numa_stats(huge_pte, md->vma, addr);
921 continue; 949 if (page)
950 gather_stats(page, md, pte_dirty(huge_pte),
951 HPAGE_PMD_SIZE/PAGE_SIZE);
952 spin_unlock(&walk->mm->page_table_lock);
953 return 0;
954 }
955 } else {
956 spin_unlock(&walk->mm->page_table_lock);
957 }
922 958
923 page = vm_normal_page(md->vma, addr, *pte); 959 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
960 do {
961 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
924 if (!page) 962 if (!page)
925 continue; 963 continue;
926 964 gather_stats(page, md, pte_dirty(*pte), 1);
927 if (PageReserved(page))
928 continue;
929
930 nid = page_to_nid(page);
931 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
932 continue;
933
934 gather_stats(page, md, pte_dirty(*pte));
935 965
936 } while (pte++, addr += PAGE_SIZE, addr != end); 966 } while (pte++, addr += PAGE_SIZE, addr != end);
937 pte_unmap_unlock(orig_pte, ptl); 967 pte_unmap_unlock(orig_pte, ptl);
@@ -952,7 +982,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
952 return 0; 982 return 0;
953 983
954 md = walk->private; 984 md = walk->private;
955 gather_stats(page, md, pte_dirty(*pte)); 985 gather_stats(page, md, pte_dirty(*pte), 1);
956 return 0; 986 return 0;
957} 987}
958 988
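
The core of the task_mmu.c change is that gather_stats() now takes an explicit page count, so one transparent huge page is accounted as HPAGE_PMD_SIZE/PAGE_SIZE base pages rather than one. A sketch with assumed x86-64 constants (4 KiB pages, 2 MiB PMDs):

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define HPAGE_PMD_SIZE  (2UL * 1024 * 1024)

struct numa_maps { unsigned long pages, dirty; };

static void gather_stats(struct numa_maps *md, int pte_dirty,
                         unsigned long nr_pages)
{
        md->pages += nr_pages;
        if (pte_dirty)
                md->dirty += nr_pages;
}

int main(void)
{
        struct numa_maps md = { 0, 0 };

        gather_stats(&md, 1, HPAGE_PMD_SIZE / PAGE_SIZE);  /* one THP = 512 */
        gather_stats(&md, 0, 1);                           /* one base page */
        printf("pages=%lu dirty=%lu\n", md.pages, md.dirty); /* 513, 512 */
        return 0;
}
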
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b34bdb25490c..10b6be3ca280 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -355,7 +355,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
355 * resolution (think about autofs) and thus deadlocks could arise. 355 * resolution (think about autofs) and thus deadlocks could arise.
356 */ 356 */
357 if (cmds == Q_QUOTAON) { 357 if (cmds == Q_QUOTAON) {
358 ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path); 358 ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
359 if (ret) 359 if (ret)
360 pathp = ERR_PTR(ret); 360 pathp = ERR_PTR(ret);
361 else 361 else
diff --git a/fs/read_write.c b/fs/read_write.c
index 179f1c33ea57..dfd125798791 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -35,23 +35,45 @@ static inline int unsigned_offsets(struct file *file)
35 return file->f_mode & FMODE_UNSIGNED_OFFSET; 35 return file->f_mode & FMODE_UNSIGNED_OFFSET;
36} 36}
37 37
38static loff_t lseek_execute(struct file *file, struct inode *inode,
39 loff_t offset, loff_t maxsize)
40{
41 if (offset < 0 && !unsigned_offsets(file))
42 return -EINVAL;
43 if (offset > maxsize)
44 return -EINVAL;
45
46 if (offset != file->f_pos) {
47 file->f_pos = offset;
48 file->f_version = 0;
49 }
50 return offset;
51}
52
38/** 53/**
39 * generic_file_llseek_unlocked - lockless generic llseek implementation 54 * generic_file_llseek_size - generic llseek implementation for regular files
40 * @file: file structure to seek on 55 * @file: file structure to seek on
41 * @offset: file offset to seek to 56 * @offset: file offset to seek to
42 * @origin: type of seek 57 * @origin: type of seek
58 * @size: max size of file system
59 *
60 * This is a variant of generic_file_llseek that allows passing in a custom
61 * file size.
43 * 62 *
44 * Updates the file offset to the value specified by @offset and @origin. 63 * Synchronization:
45 * Locking must be provided by the caller. 64 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
65 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
66 * read/writes behave like SEEK_SET against seeks.
46 */ 67 */
47loff_t 68loff_t
48generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) 69generic_file_llseek_size(struct file *file, loff_t offset, int origin,
70 loff_t maxsize)
49{ 71{
50 struct inode *inode = file->f_mapping->host; 72 struct inode *inode = file->f_mapping->host;
51 73
52 switch (origin) { 74 switch (origin) {
53 case SEEK_END: 75 case SEEK_END:
54 offset += inode->i_size; 76 offset += i_size_read(inode);
55 break; 77 break;
56 case SEEK_CUR: 78 case SEEK_CUR:
57 /* 79 /*
@@ -62,14 +84,22 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
62 */ 84 */
63 if (offset == 0) 85 if (offset == 0)
64 return file->f_pos; 86 return file->f_pos;
65 offset += file->f_pos; 87 /*
66 break; 88 * f_lock protects against read/modify/write race with other
89 * SEEK_CURs. Note that parallel writes and reads behave
90 * like SEEK_SET.
91 */
92 spin_lock(&file->f_lock);
93 offset = lseek_execute(file, inode, file->f_pos + offset,
94 maxsize);
95 spin_unlock(&file->f_lock);
96 return offset;
67 case SEEK_DATA: 97 case SEEK_DATA:
68 /* 98 /*
69 * In the generic case the entire file is data, so as long as 99 * In the generic case the entire file is data, so as long as
70 * offset isn't at the end of the file then the offset is data. 100 * offset isn't at the end of the file then the offset is data.
71 */ 101 */
72 if (offset >= inode->i_size) 102 if (offset >= i_size_read(inode))
73 return -ENXIO; 103 return -ENXIO;
74 break; 104 break;
75 case SEEK_HOLE: 105 case SEEK_HOLE:
@@ -77,26 +107,15 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
77 * There is a virtual hole at the end of the file, so as long as 107 * There is a virtual hole at the end of the file, so as long as
78 * offset isn't i_size or larger, return i_size. 108 * offset isn't i_size or larger, return i_size.
79 */ 109 */
80 if (offset >= inode->i_size) 110 if (offset >= i_size_read(inode))
81 return -ENXIO; 111 return -ENXIO;
82 offset = inode->i_size; 112 offset = i_size_read(inode);
83 break; 113 break;
84 } 114 }
85 115
86 if (offset < 0 && !unsigned_offsets(file)) 116 return lseek_execute(file, inode, offset, maxsize);
87 return -EINVAL;
88 if (offset > inode->i_sb->s_maxbytes)
89 return -EINVAL;
90
91 /* Special lock needed here? */
92 if (offset != file->f_pos) {
93 file->f_pos = offset;
94 file->f_version = 0;
95 }
96
97 return offset;
98} 117}
99EXPORT_SYMBOL(generic_file_llseek_unlocked); 118EXPORT_SYMBOL(generic_file_llseek_size);
100 119
101/** 120/**
102 * generic_file_llseek - generic llseek implementation for regular files 121 * generic_file_llseek - generic llseek implementation for regular files
@@ -110,13 +129,10 @@ EXPORT_SYMBOL(generic_file_llseek_unlocked);
110 */ 129 */
111loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) 130loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
112{ 131{
113 loff_t rval; 132 struct inode *inode = file->f_mapping->host;
114
115 mutex_lock(&file->f_dentry->d_inode->i_mutex);
116 rval = generic_file_llseek_unlocked(file, offset, origin);
117 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
118 133
119 return rval; 134 return generic_file_llseek_size(file, offset, origin,
135 inode->i_sb->s_maxbytes);
120} 136}
121EXPORT_SYMBOL(generic_file_llseek); 137EXPORT_SYMBOL(generic_file_llseek);
122 138
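
The llseek rework above funnels every case through lseek_execute(), which validates against a caller-supplied maximum instead of hard-coding s_maxbytes. Its core, restated for user space (the FMODE_UNSIGNED_OFFSET and f_version details are elided in this sketch):

#include <errno.h>

typedef long long loff_t;

static loff_t lseek_execute(loff_t *f_pos, loff_t offset, loff_t maxsize)
{
        if (offset < 0 || offset > maxsize)
                return -EINVAL;
        if (offset != *f_pos)
                *f_pos = offset;        /* kernel also resets f_version */
        return offset;
}

Only SEEK_CUR takes f_lock, because it is the one case with a read-modify-write of f_pos; SEEK_SET and SEEK_END stay lock-free, per the synchronization comment in the patch.
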
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index a159ba5a35e7..eb711060a6f2 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -291,14 +291,13 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb,
291 for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { 291 for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
292 jb = jb_array + i; 292 jb = jb_array + i;
293 jb->journal_list = NULL; 293 jb->journal_list = NULL;
294 jb->bitmaps = vmalloc(mem); 294 jb->bitmaps = vzalloc(mem);
295 if (!jb->bitmaps) { 295 if (!jb->bitmaps) {
296 reiserfs_warning(sb, "clm-2000", "unable to " 296 reiserfs_warning(sb, "clm-2000", "unable to "
297 "allocate bitmaps for journal lists"); 297 "allocate bitmaps for journal lists");
298 failed = 1; 298 failed = 1;
299 break; 299 break;
300 } 300 }
301 memset(jb->bitmaps, 0, mem);
302 } 301 }
303 if (failed) { 302 if (failed) {
304 free_list_bitmaps(sb, jb_array); 303 free_list_bitmaps(sb, jb_array);
@@ -353,11 +352,10 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
353 if (num_cnodes <= 0) { 352 if (num_cnodes <= 0) {
354 return NULL; 353 return NULL;
355 } 354 }
356 head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); 355 head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
357 if (!head) { 356 if (!head) {
358 return NULL; 357 return NULL;
359 } 358 }
360 memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode));
361 head[0].prev = NULL; 359 head[0].prev = NULL;
362 head[0].next = head + 1; 360 head[0].next = head + 1;
363 for (i = 1; i < num_cnodes; i++) { 361 for (i = 1; i < num_cnodes; i++) {
@@ -2685,14 +2683,13 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2685 * dependency inversion warnings. 2683 * dependency inversion warnings.
2686 */ 2684 */
2687 reiserfs_write_unlock(sb); 2685 reiserfs_write_unlock(sb);
2688 journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); 2686 journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
2689 if (!journal) { 2687 if (!journal) {
2690 reiserfs_warning(sb, "journal-1256", 2688 reiserfs_warning(sb, "journal-1256",
2691 "unable to get memory for journal structure"); 2689 "unable to get memory for journal structure");
2692 reiserfs_write_lock(sb); 2690 reiserfs_write_lock(sb);
2693 return 1; 2691 return 1;
2694 } 2692 }
2695 memset(journal, 0, sizeof(struct reiserfs_journal));
2696 INIT_LIST_HEAD(&journal->j_bitmap_nodes); 2693 INIT_LIST_HEAD(&journal->j_bitmap_nodes);
2697 INIT_LIST_HEAD(&journal->j_prealloc_list); 2694 INIT_LIST_HEAD(&journal->j_prealloc_list);
2698 INIT_LIST_HEAD(&journal->j_working_list); 2695 INIT_LIST_HEAD(&journal->j_working_list);
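
The three journal.c conversions above are mechanical: vzalloc(n) is vmalloc(n) plus a memset() to zero, so each call site drops its explicit memset. The user-space analogue of the folded pattern (user space would normally just call calloc(); the two steps are kept visible here):

#include <stdlib.h>
#include <string.h>

static void *zalloc(size_t n)           /* analogue of vzalloc() */
{
        void *p = malloc(n);            /* vmalloc() in the kernel */

        if (p)
                memset(p, 0, n);        /* the memset each hunk deletes */
        return p;
}
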
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index b6b9b1fe33b0..7483279b482d 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -111,15 +111,13 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
111 /* allocate additional bitmap blocks, reallocate array of bitmap 111 /* allocate additional bitmap blocks, reallocate array of bitmap
112 * block pointers */ 112 * block pointers */
113 bitmap = 113 bitmap =
114 vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); 114 vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
115 if (!bitmap) { 115 if (!bitmap) {
116 /* Journal bitmaps are still supersized, but the memory isn't 116 /* Journal bitmaps are still supersized, but the memory isn't
117 * leaked, so I guess it's ok */ 117 * leaked, so I guess it's ok */
118 printk("reiserfs_resize: unable to allocate memory.\n"); 118 printk("reiserfs_resize: unable to allocate memory.\n");
119 return -ENOMEM; 119 return -ENOMEM;
120 } 120 }
121 memset(bitmap, 0,
122 sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
123 for (i = 0; i < bmap_nr; i++) 121 for (i = 0; i < bmap_nr; i++)
124 bitmap[i] = old_bitmap[i]; 122 bitmap[i] = old_bitmap[i];
125 123
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index ef66c18a9332..534668fa41be 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -66,8 +66,8 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
66 if (IS_PRIVATE(dir)) 66 if (IS_PRIVATE(dir))
67 return 0; 67 return 0;
68 68
69 error = security_inode_init_security(inode, dir, qstr, &sec->name, 69 error = security_old_inode_init_security(inode, dir, qstr, &sec->name,
70 &sec->value, &sec->length); 70 &sec->value, &sec->length);
71 if (error) { 71 if (error) {
72 if (error == -EOPNOTSUPP) 72 if (error == -EOPNOTSUPP)
73 error = 0; 73 error = 0;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 1360d4f88f41..048b59d5b2f0 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -19,9 +19,9 @@ config SQUASHFS
19 19
20 If you want to compile this as a module ( = code which can be 20 If you want to compile this as a module ( = code which can be
21 inserted in and removed from the running kernel whenever you want), 21 inserted in and removed from the running kernel whenever you want),
22 say M here and read <file:Documentation/modules.txt>. The module 22 say M here. The module will be called squashfs. Note that the root
23 will be called squashfs. Note that the root file system (the one 23 file system (the one containing the directory /) cannot be compiled
24 containing the directory /) cannot be compiled as a module. 24 as a module.
25 25
26 If unsure, say N. 26 If unsure, say N.
27 27
diff --git a/fs/stat.c b/fs/stat.c
index ba5316ffac61..78a3aa83c7ea 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -81,8 +81,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
81 81
82 if (!(flag & AT_SYMLINK_NOFOLLOW)) 82 if (!(flag & AT_SYMLINK_NOFOLLOW))
83 lookup_flags |= LOOKUP_FOLLOW; 83 lookup_flags |= LOOKUP_FOLLOW;
84 if (flag & AT_NO_AUTOMOUNT)
85 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
86 if (flag & AT_EMPTY_PATH) 84 if (flag & AT_EMPTY_PATH)
87 lookup_flags |= LOOKUP_EMPTY; 85 lookup_flags |= LOOKUP_EMPTY;
88 86
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ea9120a830d8..48ffbdf0d017 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,20 +43,48 @@ static DEFINE_IDA(sysfs_ino_ida);
43static void sysfs_link_sibling(struct sysfs_dirent *sd) 43static void sysfs_link_sibling(struct sysfs_dirent *sd)
44{ 44{
45 struct sysfs_dirent *parent_sd = sd->s_parent; 45 struct sysfs_dirent *parent_sd = sd->s_parent;
46 struct sysfs_dirent **pos;
47 46
48 BUG_ON(sd->s_sibling); 47 struct rb_node **p;
49 48 struct rb_node *parent;
50 /* Store directory entries in order by ino. This allows 49
51 * readdir to properly restart without having to add a 50 if (sysfs_type(sd) == SYSFS_DIR)
52 * cursor into the s_dir.children list. 51 parent_sd->s_dir.subdirs++;
53 */ 52
54 for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { 53 p = &parent_sd->s_dir.inode_tree.rb_node;
55 if (sd->s_ino < (*pos)->s_ino) 54 parent = NULL;
56 break; 55 while (*p) {
56 parent = *p;
57#define node rb_entry(parent, struct sysfs_dirent, inode_node)
58 if (sd->s_ino < node->s_ino) {
59 p = &node->inode_node.rb_left;
60 } else if (sd->s_ino > node->s_ino) {
61 p = &node->inode_node.rb_right;
62 } else {
63 printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n",
64 (unsigned long) sd->s_ino);
65 BUG();
66 }
67#undef node
57 } 68 }
58 sd->s_sibling = *pos; 69 rb_link_node(&sd->inode_node, parent, p);
59 *pos = sd; 70 rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree);
71
72 p = &parent_sd->s_dir.name_tree.rb_node;
73 parent = NULL;
74 while (*p) {
75 int c;
76 parent = *p;
77#define node rb_entry(parent, struct sysfs_dirent, name_node)
78 c = strcmp(sd->s_name, node->s_name);
79 if (c < 0) {
80 p = &node->name_node.rb_left;
81 } else {
82 p = &node->name_node.rb_right;
83 }
84#undef node
85 }
86 rb_link_node(&sd->name_node, parent, p);
87 rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree);
60} 88}
61 89
62/** 90/**
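
The new sysfs_link_sibling() uses the standard kernel rbtree insertion idiom: descend through a struct rb_node ** slot while remembering the parent, then attach with rb_link_node() and rebalance with rb_insert_color(). The descent alone, on a plain unbalanced BST so the sketch stays self-contained (rebalancing is exactly what rb_insert_color() adds):

#include <stddef.h>

struct node {
        unsigned long ino;
        struct node *left, *right;
};

static void link_by_ino(struct node **root, struct node *nd)
{
        struct node **p = root;         /* slot we may link into */

        nd->left = nd->right = NULL;
        while (*p) {
                if (nd->ino < (*p)->ino)
                        p = &(*p)->left;
                else
                        p = &(*p)->right;
        }
        *p = nd;        /* rb_link_node() + rb_insert_color() in the kernel */
}
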
@@ -71,16 +99,11 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
71 */ 99 */
72static void sysfs_unlink_sibling(struct sysfs_dirent *sd) 100static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
73{ 101{
74 struct sysfs_dirent **pos; 102 if (sysfs_type(sd) == SYSFS_DIR)
103 sd->s_parent->s_dir.subdirs--;
75 104
76 for (pos = &sd->s_parent->s_dir.children; *pos; 105 rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree);
77 pos = &(*pos)->s_sibling) { 106 rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree);
78 if (*pos == sd) {
79 *pos = sd->s_sibling;
80 sd->s_sibling = NULL;
81 break;
82 }
83 }
84} 107}
85 108
86/** 109/**
@@ -126,7 +149,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
126 */ 149 */
127void sysfs_put_active(struct sysfs_dirent *sd) 150void sysfs_put_active(struct sysfs_dirent *sd)
128{ 151{
129 struct completion *cmpl;
130 int v; 152 int v;
131 153
132 if (unlikely(!sd)) 154 if (unlikely(!sd))
@@ -138,10 +160,9 @@ void sysfs_put_active(struct sysfs_dirent *sd)
138 return; 160 return;
139 161
140 /* atomic_dec_return() is a mb(), we'll always see the updated 162 /* atomic_dec_return() is a mb(), we'll always see the updated
141 * sd->s_sibling. 163 * sd->u.completion.
142 */ 164 */
143 cmpl = (void *)sd->s_sibling; 165 complete(sd->u.completion);
144 complete(cmpl);
145} 166}
146 167
147/** 168/**
@@ -155,16 +176,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
155 DECLARE_COMPLETION_ONSTACK(wait); 176 DECLARE_COMPLETION_ONSTACK(wait);
156 int v; 177 int v;
157 178
158 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); 179 BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
159 180
160 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) 181 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
161 return; 182 return;
162 183
163 sd->s_sibling = (void *)&wait; 184 sd->u.completion = (void *)&wait;
164 185
165 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); 186 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
166 /* atomic_add_return() is a mb(), put_active() will always see 187 /* atomic_add_return() is a mb(), put_active() will always see
167 * the updated sd->s_sibling. 188 * the updated sd->u.completion.
168 */ 189 */
169 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); 190 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
170 191
@@ -173,8 +194,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
173 wait_for_completion(&wait); 194 wait_for_completion(&wait);
174 } 195 }
175 196
176 sd->s_sibling = NULL;
177
178 lock_acquired(&sd->dep_map, _RET_IP_); 197 lock_acquired(&sd->dep_map, _RET_IP_);
179 rwsem_release(&sd->dep_map, 1, _RET_IP_); 198 rwsem_release(&sd->dep_map, 1, _RET_IP_);
180} 199}
@@ -384,6 +403,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
384{ 403{
385 struct sysfs_inode_attrs *ps_iattr; 404 struct sysfs_inode_attrs *ps_iattr;
386 405
406 if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
407 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
408 sysfs_ns_type(acxt->parent_sd)? "required": "invalid",
409 acxt->parent_sd->s_name, sd->s_name);
410 return -EINVAL;
411 }
412
387 if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) 413 if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
388 return -EEXIST; 414 return -EEXIST;
389 415
@@ -490,7 +516,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
490 } 516 }
491 517
492 sd->s_flags |= SYSFS_FLAG_REMOVED; 518 sd->s_flags |= SYSFS_FLAG_REMOVED;
493 sd->s_sibling = acxt->removed; 519 sd->u.removed_list = acxt->removed;
494 acxt->removed = sd; 520 acxt->removed = sd;
495} 521}
496 522
@@ -514,8 +540,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
514 while (acxt->removed) { 540 while (acxt->removed) {
515 struct sysfs_dirent *sd = acxt->removed; 541 struct sysfs_dirent *sd = acxt->removed;
516 542
517 acxt->removed = sd->s_sibling; 543 acxt->removed = sd->u.removed_list;
518 sd->s_sibling = NULL;
519 544
520 sysfs_deactivate(sd); 545 sysfs_deactivate(sd);
521 unmap_bin_file(sd); 546 unmap_bin_file(sd);
@@ -540,15 +565,43 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
540 const void *ns, 565 const void *ns,
541 const unsigned char *name) 566 const unsigned char *name)
542{ 567{
543 struct sysfs_dirent *sd; 568 struct rb_node *p = parent_sd->s_dir.name_tree.rb_node;
569 struct sysfs_dirent *found = NULL;
544 570
545 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { 571 if (!!sysfs_ns_type(parent_sd) != !!ns) {
546 if (ns && sd->s_ns && (sd->s_ns != ns)) 572 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
547 continue; 573 sysfs_ns_type(parent_sd)? "required": "invalid",
548 if (!strcmp(sd->s_name, name)) 574 parent_sd->s_name, name);
549 return sd; 575 return NULL;
550 } 576 }
551 return NULL; 577
578 while (p) {
579 int c;
580#define node rb_entry(p, struct sysfs_dirent, name_node)
581 c = strcmp(name, node->s_name);
582 if (c < 0) {
583 p = node->name_node.rb_left;
584 } else if (c > 0) {
585 p = node->name_node.rb_right;
586 } else {
587 found = node;
588 p = node->name_node.rb_left;
589 }
590#undef node
591 }
592
593 if (found) {
594 while (found->s_ns != ns) {
595 p = rb_next(&found->name_node);
596 if (!p)
597 return NULL;
598 found = rb_entry(p, struct sysfs_dirent, name_node);
599 if (strcmp(name, found->s_name))
600 return NULL;
601 }
602 }
603
604 return found;
552} 605}
553 606
554/** 607/**
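
The rewritten sysfs_find_dirent() above exploits the fact that the name
tree may hold several entries with the same name but different namespace
tags: it descends to the leftmost name match, then walks in-order
successors until the tag matches. A compact userspace sketch of the same
strategy, with a sorted array standing in for the rbtree's in-order
sequence (hypothetical types, not the sysfs structures):

	#include <stdio.h>
	#include <string.h>

	struct entry {
		const char *name;
		const void *ns;		/* namespace tag, NULL if untagged */
	};

	/* Entries sorted by name; duplicates (same name, different ns)
	 * sit adjacently, mirroring successive rb_next() results. */
	static const struct entry *find(const struct entry *tab, int n,
					const char *name, const void *ns)
	{
		int lo = 0, hi = n;		/* leftmost match first */

		while (lo < hi) {
			int mid = (lo + hi) / 2;
			if (strcmp(tab[mid].name, name) < 0)
				lo = mid + 1;
			else
				hi = mid;	/* keep going left on ties */
		}
		/* then scan forward over same-name entries for the tag */
		for (; lo < n && !strcmp(tab[lo].name, name); lo++)
			if (tab[lo].ns == ns)
				return &tab[lo];
		return NULL;
	}

	int main(void)
	{
		static const int net1, net2;	/* two distinct tags */
		static const struct entry tab[] = {
			{ "eth0", &net1 }, { "eth0", &net2 }, { "lo", &net1 },
		};

		printf("%s\n", find(tab, 3, "eth0", &net2) ? "hit" : "miss");
		return 0;
	}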
@@ -744,21 +797,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd)
744static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) 797static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
745{ 798{
746 struct sysfs_addrm_cxt acxt; 799 struct sysfs_addrm_cxt acxt;
747 struct sysfs_dirent **pos; 800 struct rb_node *pos;
748 801
749 if (!dir_sd) 802 if (!dir_sd)
750 return; 803 return;
751 804
752 pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); 805 pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
753 sysfs_addrm_start(&acxt, dir_sd); 806 sysfs_addrm_start(&acxt, dir_sd);
754 pos = &dir_sd->s_dir.children; 807 pos = rb_first(&dir_sd->s_dir.inode_tree);
755 while (*pos) { 808 while (pos) {
756 struct sysfs_dirent *sd = *pos; 809 struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node);
757 810 pos = rb_next(pos);
758 if (sysfs_type(sd) != SYSFS_DIR) 811 if (sysfs_type(sd) != SYSFS_DIR)
759 sysfs_remove_one(&acxt, sd); 812 sysfs_remove_one(&acxt, sd);
760 else
761 pos = &(*pos)->s_sibling;
762 } 813 }
763 sysfs_addrm_finish(&acxt); 814 sysfs_addrm_finish(&acxt);
764 815
@@ -881,12 +932,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
881 pos = NULL; 932 pos = NULL;
882 } 933 }
883 if (!pos && (ino > 1) && (ino < INT_MAX)) { 934 if (!pos && (ino > 1) && (ino < INT_MAX)) {
884 pos = parent_sd->s_dir.children; 935 struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node;
885 while (pos && (ino > pos->s_ino)) 936 while (p) {
886 pos = pos->s_sibling; 937#define node rb_entry(p, struct sysfs_dirent, inode_node)
938 if (ino < node->s_ino) {
939 pos = node;
940 p = node->inode_node.rb_left;
941 } else if (ino > node->s_ino) {
942 p = node->inode_node.rb_right;
943 } else {
944 pos = node;
945 break;
946 }
947#undef node
948 }
949 }
950 while (pos && pos->s_ns != ns) {
951 struct rb_node *p = rb_next(&pos->inode_node);
952 if (!p)
953 pos = NULL;
954 else
955 pos = rb_entry(p, struct sysfs_dirent, inode_node);
887 } 956 }
888 while (pos && pos->s_ns && pos->s_ns != ns)
889 pos = pos->s_sibling;
890 return pos; 957 return pos;
891} 958}
892 959
@@ -894,10 +961,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
894 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) 961 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
895{ 962{
896 pos = sysfs_dir_pos(ns, parent_sd, ino, pos); 963 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
897 if (pos) 964 if (pos) do {
898 pos = pos->s_sibling; 965 struct rb_node *p = rb_next(&pos->inode_node);
899 while (pos && pos->s_ns && pos->s_ns != ns) 966 if (!p)
900 pos = pos->s_sibling; 967 pos = NULL;
968 else
969 pos = rb_entry(p, struct sysfs_dirent, inode_node);
970 } while (pos && pos->s_ns != ns);
901 return pos; 971 return pos;
902} 972}
903 973
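
The reworked sysfs_dir_pos() above resumes a readdir at an arbitrary
offset by treating the inode number as a stable cursor: it finds the
first entry whose s_ino is not below the saved position. A standalone
sketch of that lower-bound descent over a plain binary search tree
(hypothetical struct, standing in for rb_node):

	#include <stdio.h>

	struct bst {
		unsigned long ino;
		struct bst *left, *right;
	};

	/* Smallest ino >= target, or NULL: remember the node each time
	 * the search branches left, exactly as the descent in
	 * sysfs_dir_pos() records a candidate in "pos". */
	static struct bst *lower_bound(struct bst *root, unsigned long target)
	{
		struct bst *best = NULL;

		while (root) {
			if (target < root->ino) {
				best = root;	/* candidate; try smaller */
				root = root->left;
			} else if (target > root->ino) {
				root = root->right;
			} else {
				return root;	/* exact hit */
			}
		}
		return best;
	}

	int main(void)
	{
		struct bst a = { 10, NULL, NULL };
		struct bst c = { 30, NULL, NULL };
		struct bst b = { 20, &a, &c };	/* BST rooted at b */
		struct bst *hit = lower_bound(&b, 15);

		printf("resume at ino %lu\n", hit ? hit->ino : 0UL); /* 20 */
		return 0;
	}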
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1ad8c93c1b85..d4e6080b4b20 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
466 mutex_lock(&sysfs_mutex); 466 mutex_lock(&sysfs_mutex);
467 467
468 if (sd && dir) 468 if (sd && dir)
469 /* Only directories are tagged, so no need to pass
470 * a tag explicitly.
471 */
472 sd = sysfs_find_dirent(sd, NULL, dir); 469 sd = sysfs_find_dirent(sd, NULL, dir);
473 if (sd && attr) 470 if (sd && attr)
474 sd = sysfs_find_dirent(sd, NULL, attr); 471 sd = sysfs_find_dirent(sd, NULL, attr);
@@ -488,17 +485,56 @@ const struct file_operations sysfs_file_operations = {
488 .poll = sysfs_poll, 485 .poll = sysfs_poll,
489}; 486};
490 487
488int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
489 const void **pns)
490{
491 struct sysfs_dirent *dir_sd = kobj->sd;
492 const struct sysfs_ops *ops;
493 const void *ns = NULL;
494 int err;
495
496 err = 0;
497 if (!sysfs_ns_type(dir_sd))
498 goto out;
499
500 err = -EINVAL;
501 if (!kobj->ktype)
502 goto out;
503 ops = kobj->ktype->sysfs_ops;
504 if (!ops)
505 goto out;
506 if (!ops->namespace)
507 goto out;
508
509 err = 0;
510 ns = ops->namespace(kobj, attr);
511out:
512 if (err) {
513 WARN(1, KERN_ERR "missing sysfs namespace attribute operation for "
514 "kobject: %s\n", kobject_name(kobj));
515 }
516 *pns = ns;
517 return err;
518}
519
491int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, 520int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
492 const struct attribute *attr, int type, mode_t amode) 521 const struct attribute *attr, int type, mode_t amode)
493{ 522{
494 umode_t mode = (amode & S_IALLUGO) | S_IFREG; 523 umode_t mode = (amode & S_IALLUGO) | S_IFREG;
495 struct sysfs_addrm_cxt acxt; 524 struct sysfs_addrm_cxt acxt;
496 struct sysfs_dirent *sd; 525 struct sysfs_dirent *sd;
526 const void *ns;
497 int rc; 527 int rc;
498 528
529 rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns);
530 if (rc)
531 return rc;
532
499 sd = sysfs_new_dirent(attr->name, mode, type); 533 sd = sysfs_new_dirent(attr->name, mode, type);
500 if (!sd) 534 if (!sd)
501 return -ENOMEM; 535 return -ENOMEM;
536
537 sd->s_ns = ns;
502 sd->s_attr.attr = (void *)attr; 538 sd->s_attr.attr = (void *)attr;
503 sysfs_dirent_init_lockdep(sd); 539 sysfs_dirent_init_lockdep(sd);
504 540
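
The sysfs_attr_ns() helper added above obtains the tag from the ktype's
sysfs_ops->namespace callback. A hedged sketch of what the provider side
could look like under this API; only the .namespace hook and its
signature come from the diff itself, while example_show, example_store
and example_kobj_to_tag are made-up names (kernel-context fragment, not
a standalone program):

	static const void *example_attr_namespace(struct kobject *kobj,
						  const struct attribute *attr)
	{
		/* Return the tag this attribute should be filed under;
		 * for network objects this would identify the owning
		 * namespace. */
		return example_kobj_to_tag(kobj);
	}

	static const struct sysfs_ops example_sysfs_ops = {
		.show		= example_show,
		.store		= example_store,
		.namespace	= example_attr_namespace,
	};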
@@ -586,12 +622,17 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
586{ 622{
587 struct sysfs_dirent *sd; 623 struct sysfs_dirent *sd;
588 struct iattr newattrs; 624 struct iattr newattrs;
625 const void *ns;
589 int rc; 626 int rc;
590 627
628 rc = sysfs_attr_ns(kobj, attr, &ns);
629 if (rc)
630 return rc;
631
591 mutex_lock(&sysfs_mutex); 632 mutex_lock(&sysfs_mutex);
592 633
593 rc = -ENOENT; 634 rc = -ENOENT;
594 sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); 635 sd = sysfs_find_dirent(kobj->sd, ns, attr->name);
595 if (!sd) 636 if (!sd)
596 goto out; 637 goto out;
597 638
@@ -616,7 +657,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
616 657
617void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 658void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
618{ 659{
619 sysfs_hash_and_remove(kobj->sd, NULL, attr->name); 660 const void *ns;
661
662 if (sysfs_attr_ns(kobj, attr, &ns))
663 return;
664
665 sysfs_hash_and_remove(kobj->sd, ns, attr->name);
620} 666}
621 667
622void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) 668void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e3f091a81c72..e23f28894a3a 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
202 inode->i_ctime = iattr->ia_ctime; 202 inode->i_ctime = iattr->ia_ctime;
203} 203}
204 204
205static int sysfs_count_nlink(struct sysfs_dirent *sd)
206{
207 struct sysfs_dirent *child;
208 int nr = 0;
209
210 for (child = sd->s_dir.children; child; child = child->s_sibling)
211 if (sysfs_type(child) == SYSFS_DIR)
212 nr++;
213
214 return nr + 2;
215}
216
217static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) 205static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
218{ 206{
219 struct sysfs_inode_attrs *iattrs = sd->s_iattr; 207 struct sysfs_inode_attrs *iattrs = sd->s_iattr;
@@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
230 } 218 }
231 219
232 if (sysfs_type(sd) == SYSFS_DIR) 220 if (sysfs_type(sd) == SYSFS_DIR)
233 inode->i_nlink = sysfs_count_nlink(sd); 221 inode->i_nlink = sd->s_dir.subdirs + 2;
234} 222}
235 223
236int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 224int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -336,8 +324,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
336 sysfs_addrm_start(&acxt, dir_sd); 324 sysfs_addrm_start(&acxt, dir_sd);
337 325
338 sd = sysfs_find_dirent(dir_sd, ns, name); 326 sd = sysfs_find_dirent(dir_sd, ns, name);
339 if (sd && (sd->s_ns != ns))
340 sd = NULL;
341 if (sd) 327 if (sd)
342 sysfs_remove_one(&acxt, sd); 328 sysfs_remove_one(&acxt, sd);
343 329
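
With sysfs_count_nlink() gone, the hunk above derives a directory's link
count from a counter maintained at add/remove time instead of an
O(children) walk over siblings. The bookkeeping reduces to something
like this standalone sketch (not the kernel code):

	#include <stdio.h>

	struct dir {
		unsigned long subdirs;	/* child directories only */
	};

	/* "." and ".." always exist, and each child directory adds one
	 * link back through its own "..", hence the +2 baseline. */
	static unsigned long dir_nlink(const struct dir *d)
	{
		return d->subdirs + 2;
	}

	int main(void)
	{
		struct dir d = { 0 };

		d.subdirs++;		/* mkdir a */
		d.subdirs++;		/* mkdir b */
		d.subdirs--;		/* rmdir a */
		printf("nlink = %lu\n", dir_nlink(&d));	/* 3 */
		return 0;
	}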
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 845ab3ad229d..ce29e28b766d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -11,14 +11,18 @@
11#include <linux/lockdep.h> 11#include <linux/lockdep.h>
12#include <linux/kobject_ns.h> 12#include <linux/kobject_ns.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/rbtree.h>
14 15
15struct sysfs_open_dirent; 16struct sysfs_open_dirent;
16 17
17/* type-specific structures for sysfs_dirent->s_* union members */ 18/* type-specific structures for sysfs_dirent->s_* union members */
18struct sysfs_elem_dir { 19struct sysfs_elem_dir {
19 struct kobject *kobj; 20 struct kobject *kobj;
20 /* children list starts here and goes through sd->s_sibling */ 21
21 struct sysfs_dirent *children; 22 unsigned long subdirs;
23
24 struct rb_root inode_tree;
25 struct rb_root name_tree;
22}; 26};
23 27
24struct sysfs_elem_symlink { 28struct sysfs_elem_symlink {
@@ -56,9 +60,16 @@ struct sysfs_dirent {
56 struct lockdep_map dep_map; 60 struct lockdep_map dep_map;
57#endif 61#endif
58 struct sysfs_dirent *s_parent; 62 struct sysfs_dirent *s_parent;
59 struct sysfs_dirent *s_sibling;
60 const char *s_name; 63 const char *s_name;
61 64
65 struct rb_node inode_node;
66 struct rb_node name_node;
67
68 union {
69 struct completion *completion;
70 struct sysfs_dirent *removed_list;
71 } u;
72
62 const void *s_ns; /* namespace tag */ 73 const void *s_ns; /* namespace tag */
63 union { 74 union {
64 struct sysfs_elem_dir s_dir; 75 struct sysfs_elem_dir s_dir;
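
The new "u" union in sysfs_dirent works because the completion pointer
is only needed while a removed node is being deactivated, and the
removed-list link only while the node sits on acxt->removed; the two
lifetimes never overlap, so one pointer's worth of storage serves both.
A standalone illustration of the pattern (hypothetical struct, not the
sysfs one):

	#include <stdio.h>

	struct waiter;		/* opaque, stands in for struct completion */

	struct sd_node {
		unsigned int flags;
		union {		/* mutually exclusive lifetimes */
			struct waiter *completion;   /* while deactivating */
			struct sd_node *removed_list;/* while queued */
		} u;
	};

	int main(void)
	{
		/* sizeof on an unevaluated expression is legal C */
		printf("union costs one pointer: %zu bytes\n",
		       sizeof(((struct sd_node *)0)->u));
		return 0;
	}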
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 45174b534377..feb361e252ac 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -335,9 +335,9 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
335#define DBGKEY(key) ((char *)(key)) 335#define DBGKEY(key) ((char *)(key))
336#define DBGKEY1(key) ((char *)(key)) 336#define DBGKEY1(key) ((char *)(key))
337 337
338#define ubifs_dbg_msg(fmt, ...) do { \ 338#define ubifs_dbg_msg(fmt, ...) do { \
339 if (0) \ 339 if (0) \
340 pr_debug(fmt "\n", ##__VA_ARGS__); \ 340 printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \
341} while (0) 341} while (0)
342 342
343#define dbg_dump_stack() 343#define dbg_dump_stack()
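
The "if (0)" wrapper in the ubifs_dbg_msg() hunk above keeps the printk
argument list visible to the compiler, so format-string mistakes are
still diagnosed even though the call is optimized out entirely. A
standalone demonstration of the idiom (printf standing in for printk;
##__VA_ARGS__ is the GNU extension the kernel relies on):

	#include <stdio.h>

	/* The branch is provably dead, so no code is emitted, but the
	 * arguments are still type-checked against the format string. */
	#define dbg_msg(fmt, ...) do {				\
		if (0)						\
			printf(fmt "\n", ##__VA_ARGS__);	\
	} while (0)

	int main(void)
	{
		dbg_msg("value = %d", 42);  /* no output, no runtime cost */
		/* dbg_msg("value = %d", "oops"); would warn at compile time */
		return 0;
	}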
diff --git a/fs/xattr.c b/fs/xattr.c
index f060663ab70c..67583de8218c 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -14,6 +14,7 @@
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/namei.h> 15#include <linux/namei.h>
16#include <linux/security.h> 16#include <linux/security.h>
17#include <linux/evm.h>
17#include <linux/syscalls.h> 18#include <linux/syscalls.h>
18#include <linux/module.h> 19#include <linux/module.h>
19#include <linux/fsnotify.h> 20#include <linux/fsnotify.h>
@@ -166,6 +167,64 @@ out_noalloc:
166} 167}
167EXPORT_SYMBOL_GPL(xattr_getsecurity); 168EXPORT_SYMBOL_GPL(xattr_getsecurity);
168 169
170/*
171 * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
172 *
173 * Allocate memory, if not already allocated, or re-allocate correct size,
174 * before retrieving the extended attribute.
175 *
 176 * Returns the allocation error on failure, otherwise the result of getxattr.
177 */
178ssize_t
179vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
180 size_t xattr_size, gfp_t flags)
181{
182 struct inode *inode = dentry->d_inode;
183 char *value = *xattr_value;
184 int error;
185
186 error = xattr_permission(inode, name, MAY_READ);
187 if (error)
188 return error;
189
190 if (!inode->i_op->getxattr)
191 return -EOPNOTSUPP;
192
193 error = inode->i_op->getxattr(dentry, name, NULL, 0);
194 if (error < 0)
195 return error;
196
197 if (!value || (error > xattr_size)) {
198 value = krealloc(*xattr_value, error + 1, flags);
199 if (!value)
200 return -ENOMEM;
201 memset(value, 0, error + 1);
202 }
203
204 error = inode->i_op->getxattr(dentry, name, value, error);
205 *xattr_value = value;
206 return error;
207}
208
209/* Compare an extended attribute value with the given value */
210int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
211 const char *value, size_t size, gfp_t flags)
212{
213 char *xattr_value = NULL;
214 int rc;
215
216 rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags);
217 if (rc < 0)
218 return rc;
219
220 if ((rc != size) || (memcmp(xattr_value, value, rc) != 0))
221 rc = -EINVAL;
222 else
223 rc = 0;
224 kfree(xattr_value);
225 return rc;
226}
227
169ssize_t 228ssize_t
170vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) 229vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
171{ 230{
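
The two helpers added above make "read an xattr of unknown size, then
compare it" a two-call affair for users such as EVM. A hedged
kernel-context sketch of a caller; the xattr name and expected value are
illustrative, and error handling is trimmed (fragment, not a standalone
program):

	static int check_label(struct dentry *dentry)
	{
		char *value = NULL;
		ssize_t len;

		/* size probe, allocation and fetch in one call */
		len = vfs_getxattr_alloc(dentry, "user.example", &value, 0,
					 GFP_NOFS);
		if (len < 0)
			return len;
		kfree(value);

		/* or, when only equality matters: 0 on match, -EINVAL
		 * otherwise */
		return vfs_xattr_cmp(dentry, "user.example", "expected", 8,
				     GFP_NOFS);
	}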
@@ -243,8 +302,10 @@ vfs_removexattr(struct dentry *dentry, const char *name)
243 error = inode->i_op->removexattr(dentry, name); 302 error = inode->i_op->removexattr(dentry, name);
244 mutex_unlock(&inode->i_mutex); 303 mutex_unlock(&inode->i_mutex);
245 304
246 if (!error) 305 if (!error) {
247 fsnotify_xattr(dentry); 306 fsnotify_xattr(dentry);
307 evm_inode_post_removexattr(dentry, name);
308 }
248 return error; 309 return error;
249} 310}
250EXPORT_SYMBOL_GPL(vfs_removexattr); 311EXPORT_SYMBOL_GPL(vfs_removexattr);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 75bb316529dd..427a4e82a588 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,44 +16,53 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19ccflags-y := -I$(src) -I$(src)/linux-2.6 19ccflags-y += -I$(src) # needed for trace events
20ccflags-$(CONFIG_XFS_DEBUG) += -g
21 20
22XFS_LINUX := linux-2.6 21ccflags-$(CONFIG_XFS_DEBUG) += -g
23 22
24obj-$(CONFIG_XFS_FS) += xfs.o 23obj-$(CONFIG_XFS_FS) += xfs.o
25 24
26xfs-y += linux-2.6/xfs_trace.o 25# this one should be compiled first, as the tracing macros can easily blow up
27 26xfs-y += xfs_trace.o
28xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
29 xfs_dquot.o \
30 xfs_dquot_item.o \
31 xfs_trans_dquot.o \
32 xfs_qm_syscalls.o \
33 xfs_qm_bhv.o \
34 xfs_qm.o)
35xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o
36
37ifeq ($(CONFIG_XFS_QUOTA),y)
38xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
39endif
40
41xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
42xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o
43xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o
44xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o
45xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o
46 27
28# highlevel code
29xfs-y += xfs_aops.o \
30 xfs_bit.o \
31 xfs_buf.o \
32 xfs_dfrag.o \
33 xfs_discard.o \
34 xfs_error.o \
35 xfs_export.o \
36 xfs_file.o \
37 xfs_filestream.o \
38 xfs_fsops.o \
39 xfs_fs_subr.o \
40 xfs_globals.o \
41 xfs_iget.o \
42 xfs_ioctl.o \
43 xfs_iomap.o \
44 xfs_iops.o \
45 xfs_itable.o \
46 xfs_message.o \
47 xfs_mru_cache.o \
48 xfs_super.o \
49 xfs_sync.o \
50 xfs_xattr.o \
51 xfs_rename.o \
52 xfs_rw.o \
53 xfs_utils.o \
54 xfs_vnodeops.o \
55 kmem.o \
56 uuid.o
47 57
58# code shared with libxfs
48xfs-y += xfs_alloc.o \ 59xfs-y += xfs_alloc.o \
49 xfs_alloc_btree.o \ 60 xfs_alloc_btree.o \
50 xfs_attr.o \ 61 xfs_attr.o \
51 xfs_attr_leaf.o \ 62 xfs_attr_leaf.o \
52 xfs_bit.o \
53 xfs_bmap.o \ 63 xfs_bmap.o \
54 xfs_bmap_btree.o \ 64 xfs_bmap_btree.o \
55 xfs_btree.o \ 65 xfs_btree.o \
56 xfs_buf_item.o \
57 xfs_da_btree.o \ 66 xfs_da_btree.o \
58 xfs_dir2.o \ 67 xfs_dir2.o \
59 xfs_dir2_block.o \ 68 xfs_dir2_block.o \
@@ -61,49 +70,37 @@ xfs-y += xfs_alloc.o \
61 xfs_dir2_leaf.o \ 70 xfs_dir2_leaf.o \
62 xfs_dir2_node.o \ 71 xfs_dir2_node.o \
63 xfs_dir2_sf.o \ 72 xfs_dir2_sf.o \
64 xfs_error.o \
65 xfs_extfree_item.o \
66 xfs_filestream.o \
67 xfs_fsops.o \
68 xfs_ialloc.o \ 73 xfs_ialloc.o \
69 xfs_ialloc_btree.o \ 74 xfs_ialloc_btree.o \
70 xfs_iget.o \
71 xfs_inode.o \ 75 xfs_inode.o \
72 xfs_inode_item.o \
73 xfs_iomap.o \
74 xfs_itable.o \
75 xfs_dfrag.o \
76 xfs_log.o \
77 xfs_log_cil.o \
78 xfs_log_recover.o \ 76 xfs_log_recover.o \
79 xfs_mount.o \ 77 xfs_mount.o \
80 xfs_mru_cache.o \ 78 xfs_trans.o
81 xfs_rename.o \ 79
82 xfs_trans.o \ 80# low-level transaction/log code
81xfs-y += xfs_log.o \
82 xfs_log_cil.o \
83 xfs_buf_item.o \
84 xfs_extfree_item.o \
85 xfs_inode_item.o \
83 xfs_trans_ail.o \ 86 xfs_trans_ail.o \
84 xfs_trans_buf.o \ 87 xfs_trans_buf.o \
85 xfs_trans_extfree.o \ 88 xfs_trans_extfree.o \
86 xfs_trans_inode.o \ 89 xfs_trans_inode.o \
87 xfs_utils.o \
88 xfs_vnodeops.o \
89 xfs_rw.o
90
91# Objects in linux/
92xfs-y += $(addprefix $(XFS_LINUX)/, \
93 kmem.o \
94 xfs_aops.o \
95 xfs_buf.o \
96 xfs_discard.o \
97 xfs_export.o \
98 xfs_file.o \
99 xfs_fs_subr.o \
100 xfs_globals.o \
101 xfs_ioctl.o \
102 xfs_iops.o \
103 xfs_message.o \
104 xfs_super.o \
105 xfs_sync.o \
106 xfs_xattr.o)
107 90
108# Objects in support/ 91# optional features
109xfs-y += support/uuid.o 92xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
93 xfs_dquot_item.o \
94 xfs_trans_dquot.o \
95 xfs_qm_syscalls.o \
96 xfs_qm_bhv.o \
97 xfs_qm.o \
98 xfs_quotaops.o
99ifeq ($(CONFIG_XFS_QUOTA),y)
100xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o
101endif
102xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
103xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
104xfs-$(CONFIG_PROC_FS) += xfs_stats.o
105xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
106xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/kmem.c
index a907de565db3..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/kmem.c
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/kmem.h
index f7c8f7a9ea6d..292eff198030 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/kmem.h
@@ -61,12 +61,7 @@ extern void kmem_free(const void *);
61 61
62static inline void *kmem_zalloc_large(size_t size) 62static inline void *kmem_zalloc_large(size_t size)
63{ 63{
64 void *ptr; 64 return vzalloc(size);
65
66 ptr = vmalloc(size);
67 if (ptr)
68 memset(ptr, 0, size);
69 return ptr;
70} 65}
71static inline void kmem_free_large(void *ptr) 66static inline void kmem_free_large(void *ptr)
72{ 67{
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/mrlock.h
index ff6a19873e5c..ff6a19873e5c 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/mrlock.h
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/time.h
index 387e695a184c..387e695a184c 100644
--- a/fs/xfs/linux-2.6/time.h
+++ b/fs/xfs/time.h
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/uuid.c
index b83f76b6d410..b83f76b6d410 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/uuid.c
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..4732d71262cc 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/uuid.h
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 53ec3ea9a625..d8b11b7f94aa 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -24,5 +24,6 @@
24#define XFS_BUF_LOCK_TRACKING 1 24#define XFS_BUF_LOCK_TRACKING 1
25#endif 25#endif
26 26
27#include <linux-2.6/xfs_linux.h> 27#include "xfs_linux.h"
28
28#endif /* __XFS_H__ */ 29#endif /* __XFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/xfs_acl.c
index b6c4b3795c4a..b6c4b3795c4a 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 6530769a999b..4805f009f923 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -103,7 +103,7 @@ typedef struct xfs_agf {
103/* disk block (xfs_daddr_t) in the AG */ 103/* disk block (xfs_daddr_t) in the AG */
104#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) 104#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
105#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) 105#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
106#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) 106#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
107 107
108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, 108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); 109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
@@ -156,7 +156,7 @@ typedef struct xfs_agi {
156/* disk block (xfs_daddr_t) in the AG */ 156/* disk block (xfs_daddr_t) in the AG */
157#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) 157#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
158#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) 158#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
159#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) 159#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
160 160
161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, 161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
162 xfs_agnumber_t agno, struct xfs_buf **bpp); 162 xfs_agnumber_t agno, struct xfs_buf **bpp);
@@ -168,7 +168,7 @@ extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
168#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) 168#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
169#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) 169#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
170#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) 170#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
171#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) 171#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
172 172
173typedef struct xfs_agfl { 173typedef struct xfs_agfl {
174 __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ 174 __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 1e00b3ef6274..ce84ffd0264c 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -451,9 +451,8 @@ xfs_alloc_read_agfl(
451 XFS_FSS_TO_BB(mp, 1), 0, &bp); 451 XFS_FSS_TO_BB(mp, 1), 0, &bp);
452 if (error) 452 if (error)
453 return error; 453 return error;
454 ASSERT(bp); 454 ASSERT(!xfs_buf_geterror(bp));
455 ASSERT(!XFS_BUF_GETERROR(bp)); 455 xfs_buf_set_ref(bp, XFS_AGFL_REF);
456 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
457 *bpp = bp; 456 *bpp = bp;
458 return 0; 457 return 0;
459} 458}
@@ -2116,7 +2115,7 @@ xfs_read_agf(
2116 if (!*bpp) 2115 if (!*bpp)
2117 return 0; 2116 return 0;
2118 2117
2119 ASSERT(!XFS_BUF_GETERROR(*bpp)); 2118 ASSERT(!(*bpp)->b_error);
2120 agf = XFS_BUF_TO_AGF(*bpp); 2119 agf = XFS_BUF_TO_AGF(*bpp);
2121 2120
2122 /* 2121 /*
@@ -2140,7 +2139,7 @@ xfs_read_agf(
2140 xfs_trans_brelse(tp, *bpp); 2139 xfs_trans_brelse(tp, *bpp);
2141 return XFS_ERROR(EFSCORRUPTED); 2140 return XFS_ERROR(EFSCORRUPTED);
2142 } 2141 }
2143 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); 2142 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2144 return 0; 2143 return 0;
2145} 2144}
2146 2145
@@ -2168,7 +2167,7 @@ xfs_alloc_read_agf(
2168 return error; 2167 return error;
2169 if (!*bpp) 2168 if (!*bpp)
2170 return 0; 2169 return 0;
2171 ASSERT(!XFS_BUF_GETERROR(*bpp)); 2170 ASSERT(!(*bpp)->b_error);
2172 2171
2173 agf = XFS_BUF_TO_AGF(*bpp); 2172 agf = XFS_BUF_TO_AGF(*bpp);
2174 pag = xfs_perag_get(mp, agno); 2173 pag = xfs_perag_get(mp, agno);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/xfs_aops.c
index 63e971e2b837..11b2aad982d4 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -38,40 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41
42/*
43 * Prime number of hash buckets since address is used as the key.
44 */
45#define NVSYNC 37
46#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
47static wait_queue_head_t xfs_ioend_wq[NVSYNC];
48
49void __init
50xfs_ioend_init(void)
51{
52 int i;
53
54 for (i = 0; i < NVSYNC; i++)
55 init_waitqueue_head(&xfs_ioend_wq[i]);
56}
57
58void
59xfs_ioend_wait(
60 xfs_inode_t *ip)
61{
62 wait_queue_head_t *wq = to_ioend_wq(ip);
63
64 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
65}
66
67STATIC void
68xfs_ioend_wake(
69 xfs_inode_t *ip)
70{
71 if (atomic_dec_and_test(&ip->i_iocount))
72 wake_up(to_ioend_wq(ip));
73}
74
75void 41void
76xfs_count_page_state( 42xfs_count_page_state(
77 struct page *page, 43 struct page *page,
@@ -115,25 +81,20 @@ xfs_destroy_ioend(
115 xfs_ioend_t *ioend) 81 xfs_ioend_t *ioend)
116{ 82{
117 struct buffer_head *bh, *next; 83 struct buffer_head *bh, *next;
118 struct xfs_inode *ip = XFS_I(ioend->io_inode);
119 84
120 for (bh = ioend->io_buffer_head; bh; bh = next) { 85 for (bh = ioend->io_buffer_head; bh; bh = next) {
121 next = bh->b_private; 86 next = bh->b_private;
122 bh->b_end_io(bh, !ioend->io_error); 87 bh->b_end_io(bh, !ioend->io_error);
123 } 88 }
124 89
125 /* 90 if (ioend->io_iocb) {
126 * Volume managers supporting multiple paths can send back ENODEV 91 if (ioend->io_isasync) {
127 * when the final path disappears. In this case continuing to fill 92 aio_complete(ioend->io_iocb, ioend->io_error ?
128 * the page cache with dirty data which cannot be written out is 93 ioend->io_error : ioend->io_result, 0);
129 * evil, so prevent that. 94 }
130 */ 95 inode_dio_done(ioend->io_inode);
131 if (unlikely(ioend->io_error == -ENODEV)) {
132 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
133 __FILE__, __LINE__);
134 } 96 }
135 97
136 xfs_ioend_wake(ip);
137 mempool_free(ioend, xfs_ioend_pool); 98 mempool_free(ioend, xfs_ioend_pool);
138} 99}
139 100
@@ -156,6 +117,15 @@ xfs_ioend_new_eof(
156} 117}
157 118
158/* 119/*
120 * Fast and loose check if this write could update the on-disk inode size.
121 */
122static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
123{
124 return ioend->io_offset + ioend->io_size >
125 XFS_I(ioend->io_inode)->i_d.di_size;
126}
127
128/*
159 * Update on-disk file size now that data has been written to disk. The 129 * Update on-disk file size now that data has been written to disk. The
160 * current in-memory file size is i_size. If a write is beyond eof i_new_size 130 * current in-memory file size is i_size. If a write is beyond eof i_new_size
161 * will be the intended file size until i_size is updated. If this write does 131 * will be the intended file size until i_size is updated. If this write does
@@ -173,9 +143,6 @@ xfs_setfilesize(
173 xfs_inode_t *ip = XFS_I(ioend->io_inode); 143 xfs_inode_t *ip = XFS_I(ioend->io_inode);
174 xfs_fsize_t isize; 144 xfs_fsize_t isize;
175 145
176 if (unlikely(ioend->io_error))
177 return 0;
178
179 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 146 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
180 return EAGAIN; 147 return EAGAIN;
181 148
@@ -192,6 +159,9 @@ xfs_setfilesize(
192 159
193/* 160/*
194 * Schedule IO completion handling on the final put of an ioend. 161 * Schedule IO completion handling on the final put of an ioend.
162 *
163 * If there is no work to do we might as well call it a day and free the
164 * ioend right now.
195 */ 165 */
196STATIC void 166STATIC void
197xfs_finish_ioend( 167xfs_finish_ioend(
@@ -200,8 +170,10 @@ xfs_finish_ioend(
200 if (atomic_dec_and_test(&ioend->io_remaining)) { 170 if (atomic_dec_and_test(&ioend->io_remaining)) {
201 if (ioend->io_type == IO_UNWRITTEN) 171 if (ioend->io_type == IO_UNWRITTEN)
202 queue_work(xfsconvertd_workqueue, &ioend->io_work); 172 queue_work(xfsconvertd_workqueue, &ioend->io_work);
203 else 173 else if (xfs_ioend_is_append(ioend))
204 queue_work(xfsdatad_workqueue, &ioend->io_work); 174 queue_work(xfsdatad_workqueue, &ioend->io_work);
175 else
176 xfs_destroy_ioend(ioend);
205 } 177 }
206} 178}
207 179
@@ -216,17 +188,24 @@ xfs_end_io(
216 struct xfs_inode *ip = XFS_I(ioend->io_inode); 188 struct xfs_inode *ip = XFS_I(ioend->io_inode);
217 int error = 0; 189 int error = 0;
218 190
191 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
192 error = -EIO;
193 goto done;
194 }
195 if (ioend->io_error)
196 goto done;
197
219 /* 198 /*
220 * For unwritten extents we need to issue transactions to convert a 199 * For unwritten extents we need to issue transactions to convert a
221 * range to normal written extens after the data I/O has finished. 200 * range to normal written extens after the data I/O has finished.
222 */ 201 */
223 if (ioend->io_type == IO_UNWRITTEN && 202 if (ioend->io_type == IO_UNWRITTEN) {
224 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
225
226 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 203 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
227 ioend->io_size); 204 ioend->io_size);
228 if (error) 205 if (error) {
229 ioend->io_error = error; 206 ioend->io_error = -error;
207 goto done;
208 }
230 } 209 }
231 210
232 /* 211 /*
@@ -236,6 +215,7 @@ xfs_end_io(
236 error = xfs_setfilesize(ioend); 215 error = xfs_setfilesize(ioend);
237 ASSERT(!error || error == EAGAIN); 216 ASSERT(!error || error == EAGAIN);
238 217
218done:
239 /* 219 /*
240 * If we didn't complete processing of the ioend, requeue it to the 220 * If we didn't complete processing of the ioend, requeue it to the
241 * tail of the workqueue for another attempt later. Otherwise destroy 221 * tail of the workqueue for another attempt later. Otherwise destroy
@@ -247,8 +227,6 @@ xfs_end_io(
247 /* ensure we don't spin on blocked ioends */ 227 /* ensure we don't spin on blocked ioends */
248 delay(1); 228 delay(1);
249 } else { 229 } else {
250 if (ioend->io_iocb)
251 aio_complete(ioend->io_iocb, ioend->io_result, 0);
252 xfs_destroy_ioend(ioend); 230 xfs_destroy_ioend(ioend);
253 } 231 }
254} 232}
@@ -285,13 +263,13 @@ xfs_alloc_ioend(
285 * all the I/O from calling the completion routine too early. 263 * all the I/O from calling the completion routine too early.
286 */ 264 */
287 atomic_set(&ioend->io_remaining, 1); 265 atomic_set(&ioend->io_remaining, 1);
266 ioend->io_isasync = 0;
288 ioend->io_error = 0; 267 ioend->io_error = 0;
289 ioend->io_list = NULL; 268 ioend->io_list = NULL;
290 ioend->io_type = type; 269 ioend->io_type = type;
291 ioend->io_inode = inode; 270 ioend->io_inode = inode;
292 ioend->io_buffer_head = NULL; 271 ioend->io_buffer_head = NULL;
293 ioend->io_buffer_tail = NULL; 272 ioend->io_buffer_tail = NULL;
294 atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
295 ioend->io_offset = 0; 273 ioend->io_offset = 0;
296 ioend->io_size = 0; 274 ioend->io_size = 0;
297 ioend->io_iocb = NULL; 275 ioend->io_iocb = NULL;
@@ -337,8 +315,8 @@ xfs_map_blocks(
337 count = mp->m_maxioffset - offset; 315 count = mp->m_maxioffset - offset;
338 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 316 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
339 offset_fsb = XFS_B_TO_FSBT(mp, offset); 317 offset_fsb = XFS_B_TO_FSBT(mp, offset);
340 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, 318 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
341 bmapi_flags, NULL, 0, imap, &nimaps, NULL); 319 imap, &nimaps, bmapi_flags);
342 xfs_iunlock(ip, XFS_ILOCK_SHARED); 320 xfs_iunlock(ip, XFS_ILOCK_SHARED);
343 321
344 if (error) 322 if (error)
@@ -551,7 +529,6 @@ xfs_cancel_ioend(
551 unlock_buffer(bh); 529 unlock_buffer(bh);
552 } while ((bh = next_bh) != NULL); 530 } while ((bh = next_bh) != NULL);
553 531
554 xfs_ioend_wake(XFS_I(ioend->io_inode));
555 mempool_free(ioend, xfs_ioend_pool); 532 mempool_free(ioend, xfs_ioend_pool);
556 } while ((ioend = next) != NULL); 533 } while ((ioend = next) != NULL);
557} 534}
@@ -1161,8 +1138,8 @@ __xfs_get_blocks(
1161 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); 1138 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1162 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1139 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1163 1140
1164 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, 1141 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1165 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); 1142 &imap, &nimaps, XFS_BMAPI_ENTIRE);
1166 if (error) 1143 if (error)
1167 goto out_unlock; 1144 goto out_unlock;
1168 1145
@@ -1310,28 +1287,17 @@ xfs_end_io_direct_write(
1310 1287
1311 ioend->io_offset = offset; 1288 ioend->io_offset = offset;
1312 ioend->io_size = size; 1289 ioend->io_size = size;
1290 ioend->io_iocb = iocb;
1291 ioend->io_result = ret;
1313 if (private && size > 0) 1292 if (private && size > 0)
1314 ioend->io_type = IO_UNWRITTEN; 1293 ioend->io_type = IO_UNWRITTEN;
1315 1294
1316 if (is_async) { 1295 if (is_async) {
1317 /* 1296 ioend->io_isasync = 1;
1318 * If we are converting an unwritten extent we need to delay
 1319 * the AIO completion until after the unwritten extent
1320 * conversion has completed, otherwise do it ASAP.
1321 */
1322 if (ioend->io_type == IO_UNWRITTEN) {
1323 ioend->io_iocb = iocb;
1324 ioend->io_result = ret;
1325 } else {
1326 aio_complete(iocb, ret, 0);
1327 }
1328 xfs_finish_ioend(ioend); 1297 xfs_finish_ioend(ioend);
1329 } else { 1298 } else {
1330 xfs_finish_ioend_sync(ioend); 1299 xfs_finish_ioend_sync(ioend);
1331 } 1300 }
1332
1333 /* XXX: probably should move into the real I/O completion handler */
1334 inode_dio_done(ioend->io_inode);
1335} 1301}
1336 1302
1337STATIC ssize_t 1303STATIC ssize_t
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/xfs_aops.h
index 71f721e1a71f..116dd5c37034 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -47,6 +47,7 @@ typedef struct xfs_ioend {
47 unsigned int io_type; /* delalloc / unwritten */ 47 unsigned int io_type; /* delalloc / unwritten */
48 int io_error; /* I/O error code */ 48 int io_error; /* I/O error code */
49 atomic_t io_remaining; /* hold count */ 49 atomic_t io_remaining; /* hold count */
50 unsigned int io_isasync : 1; /* needs aio_complete */
50 struct inode *io_inode; /* file being written to */ 51 struct inode *io_inode; /* file being written to */
51 struct buffer_head *io_buffer_head;/* buffer linked list head */ 52 struct buffer_head *io_buffer_head;/* buffer linked list head */
52 struct buffer_head *io_buffer_tail;/* buffer linked list tail */ 53 struct buffer_head *io_buffer_tail;/* buffer linked list tail */
@@ -60,9 +61,6 @@ typedef struct xfs_ioend {
60extern const struct address_space_operations xfs_address_space_operations; 61extern const struct address_space_operations xfs_address_space_operations;
61extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); 62extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
62 63
63extern void xfs_ioend_init(void);
64extern void xfs_ioend_wait(struct xfs_inode *);
65
66extern void xfs_count_page_state(struct page *, int *, int *); 64extern void xfs_count_page_state(struct page *, int *, int *);
67 65
68#endif /* __XFS_AOPS_H__ */ 66#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index cbae424fe1ba..1e5d97f86ea8 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -319,7 +319,7 @@ xfs_attr_set_int(
319 return (error); 319 return (error);
320 } 320 }
321 321
322 xfs_trans_ijoin(args.trans, dp); 322 xfs_trans_ijoin(args.trans, dp, 0);
323 323
324 /* 324 /*
325 * If the attribute list is non-existent or a shortform list, 325 * If the attribute list is non-existent or a shortform list,
@@ -389,7 +389,7 @@ xfs_attr_set_int(
389 * a new one. We need the inode to be in all transactions. 389 * a new one. We need the inode to be in all transactions.
390 */ 390 */
391 if (committed) 391 if (committed)
392 xfs_trans_ijoin(args.trans, dp); 392 xfs_trans_ijoin(args.trans, dp, 0);
393 393
394 /* 394 /*
395 * Commit the leaf transformation. We'll need another (linked) 395 * Commit the leaf transformation. We'll need another (linked)
@@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
537 * No need to make quota reservations here. We expect to release some 537 * No need to make quota reservations here. We expect to release some
 538 * blocks, not allocate, in the common case. 538 * blocks, not allocate, in the common case.
539 */ 539 */
540 xfs_trans_ijoin(args.trans, dp); 540 xfs_trans_ijoin(args.trans, dp, 0);
541 541
542 /* 542 /*
543 * Decide on what work routines to call based on the inode size. 543 * Decide on what work routines to call based on the inode size.
@@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
809 * No need to make quota reservations here. We expect to release some 809 * No need to make quota reservations here. We expect to release some
810 * blocks, not allocate, in the common case. 810 * blocks, not allocate, in the common case.
811 */ 811 */
812 xfs_trans_ijoin(trans, dp); 812 xfs_trans_ijoin(trans, dp, 0);
813 813
814 /* 814 /*
815 * Decide on what work routines to call based on the inode size. 815 * Decide on what work routines to call based on the inode size.
@@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
823 if (error) 823 if (error)
824 goto out; 824 goto out;
825 825
826 /*
827 * Signal synchronous inactive transactions unless this is a
828 * synchronous mount filesystem in which case we know that we're here
829 * because we've been called out of xfs_inactive which means that the
830 * last reference is gone and the unlink transaction has already hit
831 * the disk so async inactive transactions are safe.
832 */
833 if (!(mp->m_flags & XFS_MOUNT_WSYNC)) {
834 if (dp->i_d.di_anextents > 0)
835 xfs_trans_set_sync(trans);
836 }
837
838 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); 826 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
839 if (error) 827 if (error)
840 goto out; 828 goto out;
@@ -973,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
973 * a new one. We need the inode to be in all transactions. 961 * a new one. We need the inode to be in all transactions.
974 */ 962 */
975 if (committed) 963 if (committed)
976 xfs_trans_ijoin(args->trans, dp); 964 xfs_trans_ijoin(args->trans, dp, 0);
977 965
978 /* 966 /*
979 * Commit the current trans (including the inode) and start 967 * Commit the current trans (including the inode) and start
@@ -1075,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1075 * in all transactions. 1063 * in all transactions.
1076 */ 1064 */
1077 if (committed) 1065 if (committed)
1078 xfs_trans_ijoin(args->trans, dp); 1066 xfs_trans_ijoin(args->trans, dp, 0);
1079 } else 1067 } else
1080 xfs_da_buf_done(bp); 1068 xfs_da_buf_done(bp);
1081 1069
@@ -1149,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1149 * a new one. We need the inode to be in all transactions. 1137 * a new one. We need the inode to be in all transactions.
1150 */ 1138 */
1151 if (committed) 1139 if (committed)
1152 xfs_trans_ijoin(args->trans, dp); 1140 xfs_trans_ijoin(args->trans, dp, 0);
1153 } else 1141 } else
1154 xfs_da_buf_done(bp); 1142 xfs_da_buf_done(bp);
1155 return(0); 1143 return(0);
@@ -1303,7 +1291,7 @@ restart:
1303 * in all transactions. 1291 * in all transactions.
1304 */ 1292 */
1305 if (committed) 1293 if (committed)
1306 xfs_trans_ijoin(args->trans, dp); 1294 xfs_trans_ijoin(args->trans, dp, 0);
1307 1295
1308 /* 1296 /*
1309 * Commit the node conversion and start the next 1297 * Commit the node conversion and start the next
@@ -1340,7 +1328,7 @@ restart:
1340 * a new one. We need the inode to be in all transactions. 1328 * a new one. We need the inode to be in all transactions.
1341 */ 1329 */
1342 if (committed) 1330 if (committed)
1343 xfs_trans_ijoin(args->trans, dp); 1331 xfs_trans_ijoin(args->trans, dp, 0);
1344 } else { 1332 } else {
1345 /* 1333 /*
1346 * Addition succeeded, update Btree hashvals. 1334 * Addition succeeded, update Btree hashvals.
@@ -1452,7 +1440,7 @@ restart:
1452 * in all transactions. 1440 * in all transactions.
1453 */ 1441 */
1454 if (committed) 1442 if (committed)
1455 xfs_trans_ijoin(args->trans, dp); 1443 xfs_trans_ijoin(args->trans, dp, 0);
1456 } 1444 }
1457 1445
1458 /* 1446 /*
@@ -1584,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1584 * a new one. We need the inode to be in all transactions. 1572 * a new one. We need the inode to be in all transactions.
1585 */ 1573 */
1586 if (committed) 1574 if (committed)
1587 xfs_trans_ijoin(args->trans, dp); 1575 xfs_trans_ijoin(args->trans, dp, 0);
1588 1576
1589 /* 1577 /*
1590 * Commit the Btree join operation and start a new trans. 1578 * Commit the Btree join operation and start a new trans.
@@ -1635,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1635 * in all transactions. 1623 * in all transactions.
1636 */ 1624 */
1637 if (committed) 1625 if (committed)
1638 xfs_trans_ijoin(args->trans, dp); 1626 xfs_trans_ijoin(args->trans, dp, 0);
1639 } else 1627 } else
1640 xfs_da_brelse(args->trans, bp); 1628 xfs_da_brelse(args->trans, bp);
1641 } 1629 }
@@ -1975,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1975 lblkno = args->rmtblkno; 1963 lblkno = args->rmtblkno;
1976 while (valuelen > 0) { 1964 while (valuelen > 0) {
1977 nmap = ATTR_RMTVALUE_MAPSIZE; 1965 nmap = ATTR_RMTVALUE_MAPSIZE;
1978 error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, 1966 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
1979 args->rmtblkcnt, 1967 args->rmtblkcnt, map, &nmap,
1980 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 1968 XFS_BMAPI_ATTRFORK);
1981 NULL, 0, map, &nmap, NULL);
1982 if (error) 1969 if (error)
1983 return(error); 1970 return(error);
1984 ASSERT(nmap >= 1); 1971 ASSERT(nmap >= 1);
@@ -2052,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2052 */ 2039 */
2053 xfs_bmap_init(args->flist, args->firstblock); 2040 xfs_bmap_init(args->flist, args->firstblock);
2054 nmap = 1; 2041 nmap = 1;
2055 error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, 2042 error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
2056 blkcnt, 2043 blkcnt,
2057 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | 2044 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2058 XFS_BMAPI_WRITE,
2059 args->firstblock, args->total, &map, &nmap, 2045 args->firstblock, args->total, &map, &nmap,
2060 args->flist); 2046 args->flist);
2061 if (!error) { 2047 if (!error) {
@@ -2074,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2074 * a new one. We need the inode to be in all transactions. 2060 * a new one. We need the inode to be in all transactions.
2075 */ 2061 */
2076 if (committed) 2062 if (committed)
2077 xfs_trans_ijoin(args->trans, dp); 2063 xfs_trans_ijoin(args->trans, dp, 0);
2078 2064
2079 ASSERT(nmap == 1); 2065 ASSERT(nmap == 1);
2080 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 2066 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2104,14 +2090,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2104 */ 2090 */
2105 xfs_bmap_init(args->flist, args->firstblock); 2091 xfs_bmap_init(args->flist, args->firstblock);
2106 nmap = 1; 2092 nmap = 1;
2107 error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, 2093 error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
2108 args->rmtblkcnt, 2094 args->rmtblkcnt, &map, &nmap,
2109 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2095 XFS_BMAPI_ATTRFORK);
2110 args->firstblock, 0, &map, &nmap, 2096 if (error)
2111 NULL);
2112 if (error) {
2113 return(error); 2097 return(error);
2114 }
2115 ASSERT(nmap == 1); 2098 ASSERT(nmap == 1);
2116 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 2099 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2117 (map.br_startblock != HOLESTARTBLOCK)); 2100 (map.br_startblock != HOLESTARTBLOCK));
@@ -2121,17 +2104,17 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2121 2104
2122 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 2105 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
2123 XBF_LOCK | XBF_DONT_BLOCK); 2106 XBF_LOCK | XBF_DONT_BLOCK);
2124 ASSERT(bp); 2107 if (!bp)
2125 ASSERT(!XFS_BUF_GETERROR(bp)); 2108 return ENOMEM;
2126
2127 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2109 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2128 XFS_BUF_SIZE(bp); 2110 XFS_BUF_SIZE(bp);
2129 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); 2111 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
2130 if (tmp < XFS_BUF_SIZE(bp)) 2112 if (tmp < XFS_BUF_SIZE(bp))
2131 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2113 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2132 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2114 error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
2133 return (error); 2115 xfs_buf_relse(bp);
2134 } 2116 if (error)
2117 return error;
2135 src += tmp; 2118 src += tmp;
2136 valuelen -= tmp; 2119 valuelen -= tmp;
2137 2120
@@ -2167,16 +2150,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2167 /* 2150 /*
2168 * Try to remember where we decided to put the value. 2151 * Try to remember where we decided to put the value.
2169 */ 2152 */
2170 xfs_bmap_init(args->flist, args->firstblock);
2171 nmap = 1; 2153 nmap = 1;
2172 error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, 2154 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
2173 args->rmtblkcnt, 2155 args->rmtblkcnt, &map, &nmap,
2174 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2156 XFS_BMAPI_ATTRFORK);
2175 args->firstblock, 0, &map, &nmap, 2157 if (error)
2176 args->flist);
2177 if (error) {
2178 return(error); 2158 return(error);
2179 }
2180 ASSERT(nmap == 1); 2159 ASSERT(nmap == 1);
2181 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 2160 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2182 (map.br_startblock != HOLESTARTBLOCK)); 2161 (map.br_startblock != HOLESTARTBLOCK));
@@ -2189,8 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2189 */ 2168 */
2190 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); 2169 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
2191 if (bp) { 2170 if (bp) {
2192 XFS_BUF_STALE(bp); 2171 xfs_buf_stale(bp);
2193 XFS_BUF_UNDELAYWRITE(bp);
2194 xfs_buf_relse(bp); 2172 xfs_buf_relse(bp);
2195 bp = NULL; 2173 bp = NULL;
2196 } 2174 }
@@ -2228,7 +2206,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2228 * a new one. We need the inode to be in all transactions. 2206 * a new one. We need the inode to be in all transactions.
2229 */ 2207 */
2230 if (committed) 2208 if (committed)
2231 xfs_trans_ijoin(args->trans, args->dp); 2209 xfs_trans_ijoin(args->trans, args->dp, 0);
2232 2210
2233 /* 2211 /*
2234 * Close out trans and start the next one in the chain. 2212 * Close out trans and start the next one in the chain.
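
Most of the churn in xfs_attr.c above is mechanical: read-only mappings
move from the multiplexed xfs_bmapi() to the dedicated xfs_bmapi_read(),
which drops the transaction, firstblock, total and free-list arguments
(and the XFS_BMAPI_METADATA flag along the way). A before/after sketch
of the call shape with argument names abbreviated -- fragments for
comparison only, not a standalone program:

	/* Before: one entry point multiplexed over reads and writes. */
	error = xfs_bmapi(tp, ip, bno, len,
			  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
			  NULL, 0, &map, &nmap, NULL);

	/* After: a read takes neither a transaction nor any
	 * allocation-related arguments. */
	error = xfs_bmapi_read(ip, bno, len, &map, &nmap,
			       XFS_BMAPI_ATTRFORK);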
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 8fad9602542b..d4906e7c9787 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -2926,9 +2926,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2926 * Try to remember where we decided to put the value. 2926 * Try to remember where we decided to put the value.
2927 */ 2927 */
2928 nmap = 1; 2928 nmap = 1;
2929 error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, 2929 error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
2930 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2930 &map, &nmap, XFS_BMAPI_ATTRFORK);
2931 NULL, 0, &map, &nmap, NULL);
2932 if (error) { 2931 if (error) {
2933 return(error); 2932 return(error);
2934 } 2933 }
@@ -2948,6 +2947,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2948 bp = xfs_trans_get_buf(*trans, 2947 bp = xfs_trans_get_buf(*trans,
2949 dp->i_mount->m_ddev_targp, 2948 dp->i_mount->m_ddev_targp,
2950 dblkno, dblkcnt, XBF_LOCK); 2949 dblkno, dblkcnt, XBF_LOCK);
2950 if (!bp)
2951 return ENOMEM;
2951 xfs_trans_binval(*trans, bp); 2952 xfs_trans_binval(*trans, bp);
2952 /* 2953 /*
2953 * Roll to next transaction. 2954 * Roll to next transaction.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index ab3e5c6c4642..c68baeb0974a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -50,17 +50,22 @@
50#include "xfs_trace.h" 50#include "xfs_trace.h"
51 51
52 52
53#ifdef DEBUG
54STATIC void
55xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
56#endif
57
58kmem_zone_t *xfs_bmap_free_item_zone; 53kmem_zone_t *xfs_bmap_free_item_zone;
59 54
60/* 55/*
61 * Prototypes for internal bmap routines. 56 * Prototypes for internal bmap routines.
62 */ 57 */
63 58
59#ifdef DEBUG
60STATIC void
61xfs_bmap_check_leaf_extents(
62 struct xfs_btree_cur *cur,
63 struct xfs_inode *ip,
64 int whichfork);
65#else
66#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
67#endif
68
64 69
65/* 70/*
66 * Called from xfs_bmap_add_attrfork to handle extents format files. 71 * Called from xfs_bmap_add_attrfork to handle extents format files.
@@ -85,58 +90,6 @@ xfs_bmap_add_attrfork_local(
85 int *flags); /* inode logging flags */ 90 int *flags); /* inode logging flags */
86 91
87/* 92/*
88 * Called by xfs_bmap_add_extent to handle cases converting a delayed
89 * allocation to a real allocation.
90 */
91STATIC int /* error */
92xfs_bmap_add_extent_delay_real(
93 struct xfs_trans *tp, /* transaction pointer */
94 xfs_inode_t *ip, /* incore inode pointer */
95 xfs_extnum_t *idx, /* extent number to update/insert */
96 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
97 xfs_bmbt_irec_t *new, /* new data to add to file extents */
98 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
99 xfs_fsblock_t *first, /* pointer to firstblock variable */
100 xfs_bmap_free_t *flist, /* list of extents to be freed */
101 int *logflagsp); /* inode logging flags */
102
103/*
104 * Called by xfs_bmap_add_extent to handle cases converting a hole
105 * to a delayed allocation.
106 */
107STATIC int /* error */
108xfs_bmap_add_extent_hole_delay(
109 xfs_inode_t *ip, /* incore inode pointer */
110 xfs_extnum_t *idx, /* extent number to update/insert */
111 xfs_bmbt_irec_t *new, /* new data to add to file extents */
112 int *logflagsp); /* inode logging flags */
113
114/*
115 * Called by xfs_bmap_add_extent to handle cases converting a hole
116 * to a real allocation.
117 */
118STATIC int /* error */
119xfs_bmap_add_extent_hole_real(
120 xfs_inode_t *ip, /* incore inode pointer */
121 xfs_extnum_t *idx, /* extent number to update/insert */
122 xfs_btree_cur_t *cur, /* if null, not a btree */
123 xfs_bmbt_irec_t *new, /* new data to add to file extents */
124 int *logflagsp, /* inode logging flags */
125 int whichfork); /* data or attr fork */
126
127/*
128 * Called by xfs_bmap_add_extent to handle cases converting an unwritten
129 * allocation to a real allocation or vice versa.
130 */
131STATIC int /* error */
132xfs_bmap_add_extent_unwritten_real(
133 xfs_inode_t *ip, /* incore inode pointer */
134 xfs_extnum_t *idx, /* extent number to update/insert */
135 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
136 xfs_bmbt_irec_t *new, /* new data to add to file extents */
137 int *logflagsp); /* inode logging flags */
138
139/*
140 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. 93 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
141 * It figures out where to ask the underlying allocator to put the new extent. 94 * It figures out where to ask the underlying allocator to put the new extent.
142 */ 95 */
@@ -215,19 +168,6 @@ xfs_bmap_search_extents(
215 xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ 168 xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */
216 169
217/* 170/*
218 * Check the last inode extent to determine whether this allocation will result
219 * in blocks being allocated at the end of the file. When we allocate new data
220 * blocks at the end of the file which do not start at the previous data block,
221 * we will try to align the new blocks at stripe unit boundaries.
222 */
223STATIC int /* error */
224xfs_bmap_isaeof(
225 xfs_inode_t *ip, /* incore inode pointer */
226 xfs_fileoff_t off, /* file offset in fsblocks */
227 int whichfork, /* data or attribute fork */
228 char *aeof); /* return value */
229
230/*
231 * Compute the worst-case number of indirect blocks that will be used 171 * Compute the worst-case number of indirect blocks that will be used
232 * for ip's delayed extent of length "len". 172 * for ip's delayed extent of length "len".
233 */ 173 */
@@ -431,188 +371,13 @@ xfs_bmap_add_attrfork_local(
431} 371}
432 372
433/* 373/*
434 * Called by xfs_bmapi to update file extent records and the btree 374 * Convert a delayed allocation to a real allocation.
435 * after allocating space (or doing a delayed allocation).
436 */
-STATIC int				/* error */
-xfs_bmap_add_extent(
-	struct xfs_trans *tp,	/* transaction pointer */
-	xfs_inode_t *ip,	/* incore inode pointer */
-	xfs_extnum_t *idx,	/* extent number to update/insert */
-	xfs_btree_cur_t **curp,	/* if *curp is null, not a btree */
-	xfs_bmbt_irec_t *new,	/* new data to add to file extents */
-	xfs_fsblock_t *first,	/* pointer to firstblock variable */
-	xfs_bmap_free_t *flist,	/* list of extents to be freed */
-	int *logflagsp,		/* inode logging flags */
-	int whichfork)		/* data or attr fork */
-{
-	xfs_btree_cur_t *cur;	/* btree cursor or null */
-	xfs_filblks_t da_new;	/* new count del alloc blocks used */
-	xfs_filblks_t da_old;	/* old count del alloc blocks used */
-	int error;		/* error return value */
-	xfs_ifork_t *ifp;	/* inode fork ptr */
-	int logflags;		/* returned value */
-	xfs_extnum_t nextents;	/* number of extents in file now */
-
-	XFS_STATS_INC(xs_add_exlist);
-
-	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	da_old = da_new = 0;
-	error = 0;
-
-	ASSERT(*idx >= 0);
-	ASSERT(*idx <= nextents);
-
-	/*
-	 * This is the first extent added to a new/empty file.
-	 * Special case this one, so other routines get to assume there are
-	 * already extents in the list.
-	 */
-	if (nextents == 0) {
-		xfs_iext_insert(ip, *idx, 1, new,
-				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
-
-		ASSERT(cur == NULL);
-
-		if (!isnullstartblock(new->br_startblock)) {
-			XFS_IFORK_NEXT_SET(ip, whichfork, 1);
-			logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
-		} else
-			logflags = 0;
-	}
-	/*
-	 * Any kind of new delayed allocation goes here.
-	 */
-	else if (isnullstartblock(new->br_startblock)) {
-		if (cur)
-			ASSERT((cur->bc_private.b.flags &
-				XFS_BTCUR_BPRV_WASDEL) == 0);
-		error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
-						       &logflags);
-	}
-	/*
-	 * Real allocation off the end of the file.
-	 */
-	else if (*idx == nextents) {
-		if (cur)
-			ASSERT((cur->bc_private.b.flags &
-				XFS_BTCUR_BPRV_WASDEL) == 0);
-		error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-						      &logflags, whichfork);
-	} else {
-		xfs_bmbt_irec_t prev;	/* old extent at offset idx */
-
-		/*
-		 * Get the record referred to by idx.
-		 */
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
-		/*
-		 * If it's a real allocation record, and the new allocation ends
-		 * after the start of the referred to record, then we're filling
-		 * in a delayed or unwritten allocation with a real one, or
-		 * converting real back to unwritten.
-		 */
-		if (!isnullstartblock(new->br_startblock) &&
-		    new->br_startoff + new->br_blockcount > prev.br_startoff) {
-			if (prev.br_state != XFS_EXT_UNWRITTEN &&
-			    isnullstartblock(prev.br_startblock)) {
-				da_old = startblockval(prev.br_startblock);
-				if (cur)
-					ASSERT(cur->bc_private.b.flags &
-					       XFS_BTCUR_BPRV_WASDEL);
-				error = xfs_bmap_add_extent_delay_real(tp, ip,
-						idx, &cur, new, &da_new,
-						first, flist, &logflags);
-			} else {
-				ASSERT(new->br_state == XFS_EXT_NORM ||
-				       new->br_state == XFS_EXT_UNWRITTEN);
-
-				error = xfs_bmap_add_extent_unwritten_real(ip,
-						idx, &cur, new, &logflags);
-				if (error)
-					goto done;
-			}
-		}
-		/*
-		 * Otherwise we're filling in a hole with an allocation.
-		 */
-		else {
-			if (cur)
-				ASSERT((cur->bc_private.b.flags &
-					XFS_BTCUR_BPRV_WASDEL) == 0);
-			error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-					new, &logflags, whichfork);
-		}
-	}
-
-	if (error)
-		goto done;
-	ASSERT(*curp == cur || *curp == NULL);
-
-	/*
-	 * Convert to a btree if necessary.
-	 */
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
-		int tmp_logflags;	/* partial log flag return val */
-
-		ASSERT(cur == NULL);
-		error = xfs_bmap_extents_to_btree(tp, ip, first,
-			flist, &cur, da_old > 0, &tmp_logflags, whichfork);
-		logflags |= tmp_logflags;
-		if (error)
-			goto done;
-	}
-	/*
-	 * Adjust for changes in reserved delayed indirect blocks.
-	 * Nothing to do for disk quotas here.
-	 */
-	if (da_old || da_new) {
-		xfs_filblks_t nblks;
-
-		nblks = da_new;
-		if (cur)
-			nblks += cur->bc_private.b.allocated;
-		ASSERT(nblks <= da_old);
-		if (nblks < da_old)
-			xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-				(int64_t)(da_old - nblks), 0);
-	}
-	/*
-	 * Clear out the allocated field, done with it now in any case.
-	 */
-	if (cur) {
-		cur->bc_private.b.allocated = 0;
-		*curp = cur;
-	}
-done:
-#ifdef DEBUG
-	if (!error)
-		xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
-#endif
-	*logflagsp = logflags;
-	return error;
-}
-
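The dispatcher deleted above keys every branch off how a delayed allocation is encoded in br_startblock: a delayed extent has no real disk address, so the field holds a sentinel bit pattern whose low bits carry the count of blocks reserved for indirect/btree overhead. A minimal, self-contained sketch of that idea follows; the mask width and helper names are illustrative stand-ins for the kernel's isnullstartblock()/nullstartblock()/startblockval() macros, not copies of them.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t fsblock_t;

/* Illustrative only: high bits all set marks a delayed allocation. */
#define DELAY_MASK	(~(fsblock_t)0 << 21)

static int is_null_startblock(fsblock_t b)
{
	return (b & DELAY_MASK) == DELAY_MASK;
}

static fsblock_t null_startblock(unsigned indlen)
{
	return DELAY_MASK | indlen;	/* stash the reservation in the low bits */
}

static unsigned startblock_val(fsblock_t b)
{
	return (unsigned)(b & ~DELAY_MASK);
}

int main(void)
{
	fsblock_t b = null_startblock(5);	/* delalloc, 5 blocks reserved */

	printf("delayed=%d indlen=%u\n", is_null_startblock(b), startblock_val(b));
	return 0;
}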
 /*
- * Called by xfs_bmap_add_extent to handle cases converting a delayed
- * allocation to a real allocation.
+ * Convert a delayed allocation to a real allocation.
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_delay_real(
-	struct xfs_trans *tp,	/* transaction pointer */
-	xfs_inode_t *ip,	/* incore inode pointer */
-	xfs_extnum_t *idx,	/* extent number to update/insert */
-	xfs_btree_cur_t **curp,	/* if *curp is null, not a btree */
-	xfs_bmbt_irec_t *new,	/* new data to add to file extents */
-	xfs_filblks_t *dnew,	/* new delayed-alloc indirect blocks */
-	xfs_fsblock_t *first,	/* pointer to firstblock variable */
-	xfs_bmap_free_t *flist,	/* list of extents to be freed */
-	int *logflagsp)		/* inode logging flags */
+	struct xfs_bmalloca *bma)
 {
-	xfs_btree_cur_t *cur;	/* btree cursor */
+	struct xfs_bmbt_irec *new = &bma->got;
 	int diff;		/* temp value */
 	xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
 	int error;		/* error return value */
@@ -623,10 +388,22 @@ xfs_bmap_add_extent_delay_real(
 	/* left is 0, right is 1, prev is 2 */
 	int rval=0;		/* return value (logging flags) */
 	int state = 0;		/* state bits, accessed thru macros */
-	xfs_filblks_t temp=0;	/* value for dnew calculations */
-	xfs_filblks_t temp2=0;	/* value for dnew calculations */
+	xfs_filblks_t da_new;	/* new count del alloc blocks used */
+	xfs_filblks_t da_old;	/* old count del alloc blocks used */
+	xfs_filblks_t temp=0;	/* value for da_new calculations */
+	xfs_filblks_t temp2=0;	/* value for da_new calculations */
 	int tmp_rval;		/* partial logging flags */
 
+	ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+
+	ASSERT(bma->idx >= 0);
+	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+	ASSERT(!bma->cur ||
+	       (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+	XFS_STATS_INC(xs_add_exlist);
+
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
@@ -634,14 +411,15 @@ xfs_bmap_add_extent_delay_real(
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
 	 */
-	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	ep = xfs_iext_get_ext(ifp, *idx);
+	ep = xfs_iext_get_ext(ifp, bma->idx);
 	xfs_bmbt_get_all(ep, &PREV);
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
 	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
 
+	da_old = startblockval(PREV.br_startblock);
+	da_new = 0;
+
 	/*
 	 * Set flags determining what part of the previous delayed allocation
 	 * extent is being replaced by a real allocation.
@@ -655,9 +433,9 @@ xfs_bmap_add_extent_delay_real(
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (*idx > 0) {
+	if (bma->idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
 
 		if (isnullstartblock(LEFT.br_startblock))
 			state |= BMAP_LEFT_DELAY;
@@ -675,9 +453,9 @@ xfs_bmap_add_extent_delay_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+	if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
 
 		if (isnullstartblock(RIGHT.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
@@ -708,38 +486,41 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		--*idx;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+		bma->idx--;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, *idx + 1, 2, state);
-		ip->i_d.di_nextents--;
-		if (cur == NULL)
+		xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
+		bma->ip->i_d.di_nextents--;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
-					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_delete(cur, &i)))
+			error = xfs_btree_delete(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_decrement(cur, 0, &i)))
+			error = xfs_btree_decrement(bma->cur, 0, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
-					LEFT.br_startblock,
-					LEFT.br_blockcount +
-					PREV.br_blockcount +
-					RIGHT.br_blockcount, LEFT.br_state)))
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, LEFT.br_state);
+			if (error)
 				goto done;
 		}
-		*dnew = 0;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -747,30 +528,31 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		--*idx;
+		bma->idx--;
 
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, *idx + 1, 1, state);
-		if (cur == NULL)
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
-					LEFT.br_startblock, LEFT.br_blockcount,
-					&i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
-					LEFT.br_startblock,
-					LEFT.br_blockcount +
-					PREV.br_blockcount, LEFT.br_state)))
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount, LEFT.br_state);
+			if (error)
 				goto done;
 		}
-		*dnew = 0;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -778,30 +560,30 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, *idx + 1, 1, state);
-		if (cur == NULL)
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
-					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
-					new->br_startblock,
-					PREV.br_blockcount +
-					RIGHT.br_blockcount, PREV.br_state)))
+			error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
+					new->br_startblock,
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, PREV.br_state);
+			if (error)
 				goto done;
 		}
-
-		*dnew = 0;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -810,27 +592,27 @@ xfs_bmap_add_extent_delay_real(
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-
-		*dnew = 0;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -838,39 +620,40 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
 
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		if (cur == NULL)
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
-					LEFT.br_startblock, LEFT.br_blockcount,
-					&i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
-					LEFT.br_startblock,
-					LEFT.br_blockcount +
-					new->br_blockcount,
-					LEFT.br_state)))
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					new->br_blockcount,
+					LEFT.br_state);
+			if (error)
 				goto done;
 		}
-		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			startblockval(PREV.br_startblock));
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		--*idx;
-		*dnew = temp;
+		bma->idx--;
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -878,43 +661,43 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is not contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		temp = PREV.br_blockcount - new->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(ip, *idx, 1, new, state);
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
+		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
-			error = xfs_bmap_extents_to_btree(tp, ip,
-					first, flist, &cur, 1, &tmp_rval,
-					XFS_DATA_FORK);
+		if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+					bma->firstblock, bma->flist,
+					&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
 			rval |= tmp_rval;
 			if (error)
 				goto done;
 		}
-		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			startblockval(PREV.br_startblock) -
-			(cur ? cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, *idx + 1);
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
-
-		*dnew = temp;
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock) -
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		ep = xfs_iext_get_ext(ifp, bma->idx + 1);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
 		break;
 
 	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -923,38 +706,39 @@ xfs_bmap_add_extent_delay_real(
 		 * The right neighbor is contiguous with the new allocation.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount,
 			RIGHT.br_state);
-		trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
-		if (cur == NULL)
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
-					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, new->br_startoff,
-					new->br_startblock,
-					new->br_blockcount +
-					RIGHT.br_blockcount,
-					RIGHT.br_state)))
+			error = xfs_bmbt_update(bma->cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+					RIGHT.br_blockcount,
+					RIGHT.br_state);
+			if (error)
 				goto done;
 		}
 
-		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			startblockval(PREV.br_startblock));
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock));
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		++*idx;
-		*dnew = temp;
+		bma->idx++;
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -963,42 +747,43 @@ xfs_bmap_add_extent_delay_real(
 		 * The right neighbor is not contiguous.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(ip, *idx + 1, 1, new, state);
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
+		xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
-			error = xfs_bmap_extents_to_btree(tp, ip,
-					first, flist, &cur, 1, &tmp_rval,
-					XFS_DATA_FORK);
+		if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+					bma->firstblock, bma->flist, &bma->cur, 1,
+					&tmp_rval, XFS_DATA_FORK);
 			rval |= tmp_rval;
 			if (error)
 				goto done;
 		}
-		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			startblockval(PREV.br_startblock) -
-			(cur ? cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, *idx);
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock) -
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		ep = xfs_iext_get_ext(ifp, bma->idx);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		++*idx;
-		*dnew = temp;
+		bma->idx++;
 		break;
 
 	case 0:
@@ -1024,82 +809,65 @@ xfs_bmap_add_extent_delay_real(
 		 */
 		temp = new->br_startoff - PREV.br_startoff;
 		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
-		trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);	/* truncate PREV */
 		LEFT = *new;
 		RIGHT.br_state = PREV.br_state;
-		RIGHT.br_startblock = nullstartblock(
-			(int)xfs_bmap_worst_indlen(ip, temp2));
+		RIGHT.br_startblock = nullstartblock(
+			(int)xfs_bmap_worst_indlen(bma->ip, temp2));
 		RIGHT.br_startoff = new_endoff;
 		RIGHT.br_blockcount = temp2;
 		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
-		xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
+		xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
-			error = xfs_bmap_extents_to_btree(tp, ip,
-					first, flist, &cur, 1, &tmp_rval,
-					XFS_DATA_FORK);
+		if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+					bma->firstblock, bma->flist, &bma->cur,
+					1, &tmp_rval, XFS_DATA_FORK);
 			rval |= tmp_rval;
 			if (error)
 				goto done;
 		}
-		temp = xfs_bmap_worst_indlen(ip, temp);
-		temp2 = xfs_bmap_worst_indlen(ip, temp2);
+		temp = xfs_bmap_worst_indlen(bma->ip, temp);
+		temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
 		diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
-			(cur ? cur->bc_private.b.allocated : 0));
-		if (diff > 0 &&
-		    xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-					     -((int64_t)diff), 0)) {
-			/*
-			 * Ick gross gag me with a spoon.
-			 */
-			ASSERT(0);	/* want to see if this ever happens! */
-			while (diff > 0) {
-				if (temp) {
-					temp--;
-					diff--;
-					if (!diff ||
-					    !xfs_icsb_modify_counters(ip->i_mount,
-						    XFS_SBS_FDBLOCKS,
-						    -((int64_t)diff), 0))
-						break;
-				}
-				if (temp2) {
-					temp2--;
-					diff--;
-					if (!diff ||
-					    !xfs_icsb_modify_counters(ip->i_mount,
-						    XFS_SBS_FDBLOCKS,
-						    -((int64_t)diff), 0))
-						break;
-				}
-			}
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		if (diff > 0) {
+			error = xfs_icsb_modify_counters(bma->ip->i_mount,
+					XFS_SBS_FDBLOCKS,
+					-((int64_t)diff), 0);
+			ASSERT(!error);
+			if (error)
+				goto done;
 		}
-		ep = xfs_iext_get_ext(ifp, *idx);
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
-			nullstartblock((int)temp2));
-		trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
+
+		ep = xfs_iext_get_ext(ifp, bma->idx);
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
+			nullstartblock((int)temp2));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
 
-		++*idx;
-		*dnew = temp + temp2;
+		bma->idx++;
+		da_new = temp + temp2;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1114,9 +882,40 @@ xfs_bmap_add_extent_delay_real(
 		 */
 		ASSERT(0);
 	}
-	*curp = cur;
+
+	/* convert to a btree if necessary */
+	if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+		int tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(bma->cur == NULL);
+		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur,
+				da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+		bma->logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* adjust for changes in reserved delayed indirect blocks */
+	if (da_old || da_new) {
+		temp = da_new;
+		if (bma->cur)
+			temp += bma->cur->bc_private.b.allocated;
+		ASSERT(temp <= da_old);
+		if (temp < da_old)
+			xfs_icsb_modify_counters(bma->ip->i_mount,
+					XFS_SBS_FDBLOCKS,
+					(int64_t)(da_old - temp), 0);
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (bma->cur)
+		bma->cur->bc_private.b.allocated = 0;
+
+	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
 done:
-	*logflagsp = rval;
+	bma->logflags |= rval;
 	return error;
 #undef	LEFT
 #undef	RIGHT
@@ -1124,15 +923,17 @@ done:
 }
 
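One change repeats through every hunk above: assignments buried in conditions, `if ((error = call(...)))`, are split into a plain assignment followed by an explicit `if (error)` test before the goto. A self-contained sketch of the rewrite; do_step() is a stand-in for the xfs_bmbt_*/xfs_btree_* calls, not a real kernel function.

#include <stdio.h>

/* Stand-in for any call returning 0 on success or a negative errno. */
static int do_step(int fail)
{
	return fail ? -5 : 0;	/* -5 standing in for an I/O error */
}

static int demo(void)
{
	int error;

	/*
	 * The old style folded the call into the condition:
	 *	if ((error = do_step(0)))
	 *		goto done;
	 * The diff rewrites every such site as two statements:
	 */
	error = do_step(0);
	if (error)
		goto done;

	error = do_step(1);	/* this one fails */
	if (error)
		goto done;
done:
	return error;
}

int main(void)
{
	printf("demo() = %d\n", demo());
	return 0;
}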
 /*
- * Called by xfs_bmap_add_extent to handle cases converting an unwritten
- * allocation to a real allocation or vice versa.
+ * Convert an unwritten allocation to a real allocation or vice versa.
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_unwritten_real(
+	struct xfs_trans *tp,
 	xfs_inode_t *ip,	/* incore inode pointer */
 	xfs_extnum_t *idx,	/* extent number to update/insert */
 	xfs_btree_cur_t **curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t *new,	/* new data to add to file extents */
+	xfs_fsblock_t *first,	/* pointer to firstblock variable */
+	xfs_bmap_free_t *flist,	/* list of extents to be freed */
 	int *logflagsp)		/* inode logging flags */
 {
 	xfs_btree_cur_t *cur;	/* btree cursor */
@@ -1148,15 +949,25 @@ xfs_bmap_add_extent_unwritten_real(
 	int rval=0;		/* return value (logging flags) */
 	int state = 0;		/* state bits, accessed thru macros */
 
+	*logflagsp = 0;
+
+	cur = *curp;
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+	ASSERT(*idx >= 0);
+	ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+
+	XFS_STATS_INC(xs_add_exlist);
+
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
+
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
 	 */
 	error = 0;
-	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, *idx);
 	xfs_bmbt_get_all(ep, &PREV);
 	newext = new->br_state;
@@ -1406,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
-			if (xfs_bmbt_update(cur, LEFT.br_startoff,
-				LEFT.br_startblock,
-				LEFT.br_blockcount + new->br_blockcount,
-				LEFT.br_state))
+			error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + new->br_blockcount,
+				LEFT.br_state);
+			if (error)
 				goto done;
 		}
 		break;
@@ -1607,9 +1419,29 @@ xfs_bmap_add_extent_unwritten_real(
 		 */
 		ASSERT(0);
 	}
-	*curp = cur;
+
+	/* convert to a btree if necessary */
+	if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+		int tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+				0, &tmp_logflags, XFS_DATA_FORK);
+		*logflagsp |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		*curp = cur;
+	}
+
+	xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
 done:
-	*logflagsp = rval;
+	*logflagsp |= rval;
 	return error;
 #undef	LEFT
 #undef	RIGHT
@@ -1617,16 +1449,13 @@ done:
 }
 
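Both conversion routines classify the update by OR-ing neighbor bits into `state` and switching on the masked combination, which is what the long case labels above (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | ...) are doing. A self-contained sketch of that dispatch pattern; the flag values are made up, not the kernel's BMAP_* masks.

#include <stdio.h>

#define LEFT_FILLING	(1 << 0)	/* new extent reaches PREV's left edge */
#define RIGHT_FILLING	(1 << 1)	/* new extent reaches PREV's right edge */
#define LEFT_CONTIG	(1 << 2)	/* mergeable with the left neighbor */
#define RIGHT_CONTIG	(1 << 3)	/* mergeable with the right neighbor */

static const char *classify(int state)
{
	switch (state & (LEFT_FILLING | RIGHT_FILLING |
			 LEFT_CONTIG | RIGHT_CONTIG)) {
	case LEFT_FILLING | RIGHT_FILLING | LEFT_CONTIG | RIGHT_CONTIG:
		return "replace whole extent, merge both neighbors";
	case LEFT_FILLING | RIGHT_FILLING | LEFT_CONTIG:
		return "replace whole extent, merge left";
	case LEFT_FILLING | RIGHT_FILLING | RIGHT_CONTIG:
		return "replace whole extent, merge right";
	case LEFT_FILLING | RIGHT_FILLING:
		return "replace whole extent, no merge";
	case 0:
		return "new piece lands in the middle: split in three";
	default:
		return "partial fill at one end";
	}
}

int main(void)
{
	printf("%s\n", classify(LEFT_FILLING | RIGHT_FILLING));
	return 0;
}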
 /*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a delayed allocation.
+ * Convert a hole to a delayed allocation.
  */
-/*ARGSUSED*/
-STATIC int				/* error */
+STATIC void
 xfs_bmap_add_extent_hole_delay(
 	xfs_inode_t *ip,	/* incore inode pointer */
 	xfs_extnum_t *idx,	/* extent number to update/insert */
-	xfs_bmbt_irec_t *new,	/* new data to add to file extents */
-	int *logflagsp)		/* inode logging flags */
+	xfs_bmbt_irec_t *new)	/* new data to add to file extents */
 {
 	xfs_ifork_t *ifp;	/* inode fork pointer */
 	xfs_bmbt_irec_t left;	/* left neighbor extent entry */
@@ -1761,23 +1590,17 @@ xfs_bmap_add_extent_hole_delay(
 	 * Nothing to do for disk quota accounting here.
 	 */
 	}
-	*logflagsp = 0;
-	return 0;
 }
 
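xfs_bmap_add_extent_hole_delay can drop its return value and logflags argument because a hole-to-delalloc insert only edits the incore extent list: there is no bmap btree record to update and no inode core change to log, so nothing on this path can fail. A toy model of the merge bookkeeping (the kernel's worst-case indirect-length math differs):

#include <stdint.h>
#include <stdio.h>

/* Toy in-core record: delayed extents carry only offset, length and an
 * indirect-block reservation, never an on-disk address. */
struct irec {
	uint64_t off;
	uint64_t len;
	uint64_t indlen;
};

/* Illustrative worst-case indirect-block estimate, not the kernel's. */
static uint64_t worst_indlen(uint64_t len)
{
	return len / 128 + 1;
}

static struct irec merge_delay(struct irec left, struct irec new)
{
	struct irec out = { left.off, left.len + new.len, 0 };
	uint64_t est = worst_indlen(out.len);
	uint64_t have = left.indlen + new.indlen;

	/* Never reserve more than was already reserved for the pieces. */
	out.indlen = est < have ? est : have;
	return out;
}

int main(void)
{
	struct irec l = { 0, 100, worst_indlen(100) };
	struct irec n = { 100, 50, worst_indlen(50) };
	struct irec m = merge_delay(l, n);

	printf("off=%llu len=%llu indlen=%llu\n",
	       (unsigned long long)m.off, (unsigned long long)m.len,
	       (unsigned long long)m.indlen);
	return 0;
}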
 /*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a real allocation.
+ * Convert a hole to a real allocation.
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_hole_real(
-	xfs_inode_t *ip,	/* incore inode pointer */
-	xfs_extnum_t *idx,	/* extent number to update/insert */
-	xfs_btree_cur_t *cur,	/* if null, not a btree */
-	xfs_bmbt_irec_t *new,	/* new data to add to file extents */
-	int *logflagsp,		/* inode logging flags */
-	int whichfork)		/* data or attr fork */
+	struct xfs_bmalloca *bma,
+	int whichfork)
 {
+	struct xfs_bmbt_irec *new = &bma->got;
 	int error;		/* error return value */
 	int i;			/* temp state */
 	xfs_ifork_t *ifp;	/* inode fork pointer */
@@ -1786,19 +1609,26 @@ xfs_bmap_add_extent_hole_real(
 	int rval=0;		/* return value (logging flags) */
 	int state;		/* state bits, accessed thru macros */
 
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
-	state = 0;
+	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+
+	ASSERT(bma->idx >= 0);
+	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+	ASSERT(!bma->cur ||
+	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+	XFS_STATS_INC(xs_add_exlist);
 
+	state = 0;
 	if (whichfork == XFS_ATTR_FORK)
 		state |= BMAP_ATTRFORK;
 
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 */
-	if (*idx > 0) {
+	if (bma->idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
 		if (isnullstartblock(left.br_startblock))
 			state |= BMAP_LEFT_DELAY;
 	}
@@ -1807,9 +1637,9 @@ xfs_bmap_add_extent_hole_real(
 	 * Check and set flags if this segment has a current value.
 	 * Not true if we're inserting into the "hole" at eof.
 	 */
-	if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
 		if (isnullstartblock(right.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
 	}
@@ -1846,39 +1676,42 @@ xfs_bmap_add_extent_hole_real(
 		 * left and on the right.
 		 * Merge all three into a single extent record.
 		 */
-		--*idx;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+		--bma->idx;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
 			left.br_blockcount + new->br_blockcount +
 			right.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, *idx + 1, 1, state);
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
 
-		XFS_IFORK_NEXT_SET(ip, whichfork,
-			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-		if (cur == NULL) {
+		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+			XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
+		if (bma->cur == NULL) {
 			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur,
-					right.br_startoff,
-					right.br_startblock,
-					right.br_blockcount, &i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+					right.br_startblock, right.br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_delete(cur, &i)))
+			error = xfs_btree_delete(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_decrement(cur, 0, &i)))
+			error = xfs_btree_decrement(bma->cur, 0, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, left.br_startoff,
-					left.br_startblock,
-					left.br_blockcount +
-					new->br_blockcount +
-					right.br_blockcount,
-					left.br_state)))
+			error = xfs_bmbt_update(bma->cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+					new->br_blockcount +
+					right.br_blockcount,
+					left.br_state);
+			if (error)
 				goto done;
 		}
 		break;
@@ -1889,27 +1722,28 @@ xfs_bmap_add_extent_hole_real(
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
-		--*idx;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+		--bma->idx;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
 			left.br_blockcount + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		if (cur == NULL) {
+		if (bma->cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur,
-					left.br_startoff,
-					left.br_startblock,
-					left.br_blockcount, &i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+					left.br_startblock, left.br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, left.br_startoff,
-					left.br_startblock,
-					left.br_blockcount +
-					new->br_blockcount,
-					left.br_state)))
+			error = xfs_bmbt_update(bma->cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+					new->br_blockcount,
+					left.br_state);
+			if (error)
 				goto done;
 		}
 		break;
@@ -1920,28 +1754,30 @@ xfs_bmap_add_extent_hole_real(
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + right.br_blockcount,
 			right.br_state);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		if (cur == NULL) {
+		if (bma->cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur,
-					right.br_startoff,
-					right.br_startblock,
-					right.br_blockcount, &i)))
+			error = xfs_bmbt_lookup_eq(bma->cur,
+					right.br_startoff,
+					right.br_startblock,
+					right.br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, new->br_startoff,
-					new->br_startblock,
-					new->br_blockcount +
-					right.br_blockcount,
-					right.br_state)))
+			error = xfs_bmbt_update(bma->cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+					right.br_blockcount,
+					right.br_state);
+			if (error)
 				goto done;
 		}
 		break;
@@ -1952,28 +1788,50 @@ xfs_bmap_add_extent_hole_real(
 		 * real allocation.
 		 * Insert a new entry.
 		 */
-		xfs_iext_insert(ip, *idx, 1, new, state);
-		XFS_IFORK_NEXT_SET(ip, whichfork,
-			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
-		if (cur == NULL) {
+		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+			XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
+		if (bma->cur == NULL) {
 			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur,
-					new->br_startoff,
-					new->br_startblock,
-					new->br_blockcount, &i)))
+			error = xfs_bmbt_lookup_eq(bma->cur,
+					new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = new->br_state;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = new->br_state;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		break;
 	}
+
+	/* convert to a btree if necessary */
+	if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+		int tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(bma->cur == NULL);
+		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur,
+				0, &tmp_logflags, whichfork);
+		bma->logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (bma->cur)
+		bma->cur->bc_private.b.allocated = 0;
+
+	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
 done:
-	*logflagsp = rval;
+	bma->logflags |= rval;
 	return error;
 }
 
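The mechanical thread through all of the rewritten functions is that the long argument lists (tp, ip, idx, curp, new, first, flist, logflagsp) collapse into the one `struct xfs_bmalloca` the allocation path already carries; the field accesses in the hunks above (bma->tp, bma->ip, bma->idx, bma->cur, bma->firstblock, bma->flist, bma->logflags, bma->got) spell out its shape. A compilable sketch of that shape with stubbed types; the real definition lives in the XFS headers and differs in detail.

#include <stdint.h>

typedef uint64_t xfs_fsblock_t;	/* stub */
typedef uint32_t xfs_extnum_t;	/* stub */
struct xfs_trans;
struct xfs_inode;
struct xfs_btree_cur;
struct xfs_bmap_free;

struct xfs_bmbt_irec {		/* stub of the incore extent record */
	uint64_t br_startoff;
	uint64_t br_startblock;
	uint64_t br_blockcount;
	int br_state;
};

/* Sketch of the argument block shared by the add_extent helpers. */
struct bmalloca_sketch {
	struct xfs_trans *tp;		/* transaction pointer */
	struct xfs_inode *ip;		/* incore inode */
	xfs_extnum_t idx;		/* extent slot to update/insert */
	struct xfs_btree_cur *cur;	/* bmap btree cursor, or NULL */
	xfs_fsblock_t *firstblock;	/* first block allocated */
	struct xfs_bmap_free *flist;	/* extents to free */
	int logflags;			/* accumulated inode log flags */
	struct xfs_bmbt_irec got;	/* the new mapping ("new" above) */
};

int main(void)
{
	struct bmalloca_sketch bma = { 0 };

	return bma.logflags;
}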
@@ -2160,26 +2018,26 @@ xfs_bmap_adjacent(
 	 XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
 
 	mp = ap->ip->i_mount;
-	nullfb = ap->firstblock == NULLFSBLOCK;
+	nullfb = *ap->firstblock == NULLFSBLOCK;
 	rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
-	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
+	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
 	/*
 	 * If allocating at eof, and there's a previous real block,
 	 * try to use its last block as our starting point.
 	 */
-	if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
-	    !isnullstartblock(ap->prevp->br_startblock) &&
-	    ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
-		    ap->prevp->br_startblock)) {
-		ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
+	if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+	    !isnullstartblock(ap->prev.br_startblock) &&
+	    ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+		    ap->prev.br_startblock)) {
+		ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
 		/*
 		 * Adjust for the gap between prevp and us.
 		 */
-		adjust = ap->off -
-			(ap->prevp->br_startoff + ap->prevp->br_blockcount);
+		adjust = ap->offset -
+			(ap->prev.br_startoff + ap->prev.br_blockcount);
 		if (adjust &&
-		    ISVALID(ap->rval + adjust, ap->prevp->br_startblock))
-			ap->rval += adjust;
+		    ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+			ap->blkno += adjust;
 	}
 	/*
 	 * If not at eof, then compare the two neighbor blocks.
@@ -2196,17 +2054,17 @@ xfs_bmap_adjacent(
 		 * If there's a previous (left) block, select a requested
 		 * start block based on it.
 		 */
-		if (ap->prevp->br_startoff != NULLFILEOFF &&
-		    !isnullstartblock(ap->prevp->br_startblock) &&
-		    (prevbno = ap->prevp->br_startblock +
-			       ap->prevp->br_blockcount) &&
-		    ISVALID(prevbno, ap->prevp->br_startblock)) {
+		if (ap->prev.br_startoff != NULLFILEOFF &&
+		    !isnullstartblock(ap->prev.br_startblock) &&
+		    (prevbno = ap->prev.br_startblock +
+			       ap->prev.br_blockcount) &&
+		    ISVALID(prevbno, ap->prev.br_startblock)) {
 			/*
 			 * Calculate gap to end of previous block.
 			 */
-			adjust = prevdiff = ap->off -
-				(ap->prevp->br_startoff +
-				 ap->prevp->br_blockcount);
+			adjust = prevdiff = ap->offset -
+				(ap->prev.br_startoff +
+				 ap->prev.br_blockcount);
 			/*
 			 * Figure the startblock based on the previous block's
 			 * end and the gap size.
@@ -2215,9 +2073,9 @@ xfs_bmap_adjacent(
 			 * allocating, or using it gives us an invalid block
 			 * number, then just use the end of the previous block.
 			 */
-			if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+			if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
 			    ISVALID(prevbno + prevdiff,
-				    ap->prevp->br_startblock))
+				    ap->prev.br_startblock))
 				prevbno += adjust;
 			else
 				prevdiff += adjust;
@@ -2238,16 +2096,16 @@ xfs_bmap_adjacent(
 		 * If there's a following (right) block, select a requested
 		 * start block based on it.
 		 */
-		if (!isnullstartblock(ap->gotp->br_startblock)) {
+		if (!isnullstartblock(ap->got.br_startblock)) {
 			/*
 			 * Calculate gap to start of next block.
 			 */
-			adjust = gotdiff = ap->gotp->br_startoff - ap->off;
+			adjust = gotdiff = ap->got.br_startoff - ap->offset;
 			/*
 			 * Figure the startblock based on the next block's
 			 * start and the gap size.
 			 */
-			gotbno = ap->gotp->br_startblock;
+			gotbno = ap->got.br_startblock;
 			/*
 			 * Heuristic!
 			 * If the gap is large relative to the piece we're
@@ -2255,12 +2113,12 @@ xfs_bmap_adjacent(
 			 * number, then just use the start of the next block
 			 * offset by our length.
 			 */
-			if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+			if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
 			    ISVALID(gotbno - gotdiff, gotbno))
 				gotbno -= adjust;
-			else if (ISVALID(gotbno - ap->alen, gotbno)) {
-				gotbno -= ap->alen;
-				gotdiff += adjust - ap->alen;
+			else if (ISVALID(gotbno - ap->length, gotbno)) {
+				gotbno -= ap->length;
+				gotdiff += adjust - ap->length;
 			} else
 				gotdiff += adjust;
 			/*
@@ -2278,14 +2136,14 @@ xfs_bmap_adjacent(
 			gotbno = NULLFSBLOCK;
 		/*
 		 * If both valid, pick the better one, else the only good
-		 * one, else ap->rval is already set (to 0 or the inode block).
+		 * one, else ap->blkno is already set (to 0 or the inode block).
 		 */
 		if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
-			ap->rval = prevdiff <= gotdiff ? prevbno : gotbno;
+			ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
 		else if (prevbno != NULLFSBLOCK)
-			ap->rval = prevbno;
+			ap->blkno = prevbno;
 		else if (gotbno != NULLFSBLOCK)
-			ap->rval = gotbno;
+			ap->blkno = gotbno;
 	}
 #undef ISVALID
 }
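The heuristic reads: compute the file-offset gap from the previous extent's end (prevdiff) and to the next extent's start (gotdiff), project each neighbor's disk address across its gap, then take the candidate with the smaller gap; that is the prevdiff <= gotdiff pick just above. A self-contained numeric sketch, with the validity checks and XFS_ALLOC_GAP_UNITS damping omitted:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t offset = 200;				/* file offset being mapped */

	/* previous extent ends at file offset 150, disk block 1050 */
	uint64_t prev_end_off = 150, prev_end_bno = 1050;
	/* next extent starts at file offset 260, disk block 5000 */
	uint64_t got_off = 260, got_bno = 5000;

	uint64_t prevdiff = offset - prev_end_off;	/* gap after prev: 50 */
	uint64_t gotdiff = got_off - offset;		/* gap before next: 60 */

	uint64_t prevbno = prev_end_bno + prevdiff;	/* 1100 */
	uint64_t gotbno = got_bno - gotdiff;		/* 4940 */

	uint64_t blkno = prevdiff <= gotdiff ? prevbno : gotbno;

	printf("blkno=%llu (previous neighbor wins)\n",
	       (unsigned long long)blkno);
	return 0;
}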
@@ -2305,24 +2163,24 @@ xfs_bmap_rtalloc(
 	mp = ap->ip->i_mount;
 	align = xfs_get_extsz_hint(ap->ip);
 	prod = align / mp->m_sb.sb_rextsize;
-	error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
-					align, 1, ap->eof, 0,
-					ap->conv, &ap->off, &ap->alen);
+	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+					align, 1, ap->eof, 0,
+					ap->conv, &ap->offset, &ap->length);
 	if (error)
 		return error;
-	ASSERT(ap->alen);
-	ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
+	ASSERT(ap->length);
+	ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
 
 	/*
 	 * If the offset & length are not perfectly aligned
 	 * then kill prod, it will just get us in trouble.
 	 */
-	if (do_mod(ap->off, align) || ap->alen % align)
+	if (do_mod(ap->offset, align) || ap->length % align)
 		prod = 1;
 	/*
 	 * Set ralen to be the actual requested length in rtextents.
 	 */
-	ralen = ap->alen / mp->m_sb.sb_rextsize;
+	ralen = ap->length / mp->m_sb.sb_rextsize;
 	/*
 	 * If the old value was close enough to MAXEXTLEN that
 	 * we rounded up to it, cut it back so it's valid again.
@@ -2337,21 +2195,21 @@ xfs_bmap_rtalloc(
2337 * Lock out other modifications to the RT bitmap inode. 2195 * Lock out other modifications to the RT bitmap inode.
2338 */ 2196 */
2339 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); 2197 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2340 xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); 2198 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2341 2199
2342 /* 2200 /*
2343 * If it's an allocation to an empty file at offset 0, 2201 * If it's an allocation to an empty file at offset 0,
2344 * pick an extent that will space things out in the rt area. 2202 * pick an extent that will space things out in the rt area.
2345 */ 2203 */
2346 if (ap->eof && ap->off == 0) { 2204 if (ap->eof && ap->offset == 0) {
2347 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ 2205 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
2348 2206
2349 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); 2207 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
2350 if (error) 2208 if (error)
2351 return error; 2209 return error;
2352 ap->rval = rtx * mp->m_sb.sb_rextsize; 2210 ap->blkno = rtx * mp->m_sb.sb_rextsize;
2353 } else { 2211 } else {
2354 ap->rval = 0; 2212 ap->blkno = 0;
2355 } 2213 }
2356 2214
2357 xfs_bmap_adjacent(ap); 2215 xfs_bmap_adjacent(ap);
@@ -2359,23 +2217,23 @@ xfs_bmap_rtalloc(
2359 /* 2217 /*
2360 * Realtime allocation, done through xfs_rtallocate_extent. 2218 * Realtime allocation, done through xfs_rtallocate_extent.
2361 */ 2219 */
2362 atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; 2220 atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
2363 do_div(ap->rval, mp->m_sb.sb_rextsize); 2221 do_div(ap->blkno, mp->m_sb.sb_rextsize);
2364 rtb = ap->rval; 2222 rtb = ap->blkno;
2365 ap->alen = ralen; 2223 ap->length = ralen;
2366 if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, 2224 if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
2367 &ralen, atype, ap->wasdel, prod, &rtb))) 2225 &ralen, atype, ap->wasdel, prod, &rtb)))
2368 return error; 2226 return error;
2369 if (rtb == NULLFSBLOCK && prod > 1 && 2227 if (rtb == NULLFSBLOCK && prod > 1 &&
2370 (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, 2228 (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
2371 ap->alen, &ralen, atype, 2229 ap->length, &ralen, atype,
2372 ap->wasdel, 1, &rtb))) 2230 ap->wasdel, 1, &rtb)))
2373 return error; 2231 return error;
2374 ap->rval = rtb; 2232 ap->blkno = rtb;
2375 if (ap->rval != NULLFSBLOCK) { 2233 if (ap->blkno != NULLFSBLOCK) {
2376 ap->rval *= mp->m_sb.sb_rextsize; 2234 ap->blkno *= mp->m_sb.sb_rextsize;
2377 ralen *= mp->m_sb.sb_rextsize; 2235 ralen *= mp->m_sb.sb_rextsize;
2378 ap->alen = ralen; 2236 ap->length = ralen;
2379 ap->ip->i_d.di_nblocks += ralen; 2237 ap->ip->i_d.di_nblocks += ralen;
2380 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); 2238 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
2381 if (ap->wasdel) 2239 if (ap->wasdel)
@@ -2388,7 +2246,7 @@ xfs_bmap_rtalloc(
2388 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : 2246 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
2389 XFS_TRANS_DQ_RTBCOUNT, (long) ralen); 2247 XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
2390 } else { 2248 } else {
2391 ap->alen = 0; 2249 ap->length = 0;
2392 } 2250 }
2393 return 0; 2251 return 0;
2394} 2252}
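
The rtextent arithmetic above (block count to ralen, then trimming when alignment rounded the request up to MAXEXTLEN) is easy to get wrong. A standalone sketch with illustrative constants — the rextsize and MAXEXTLEN stand-in values are assumptions, not the on-disk numbers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t rextsize = 16;		/* blocks per realtime extent */
	uint64_t maxextlen = 1 << 21;	/* assumption: MAXEXTLEN stand-in */
	uint64_t length = 4096;		/* aligned request, in blocks */

	assert(length % rextsize == 0);
	uint64_t ralen = length / rextsize;	/* request in rtextents */

	/* If alignment rounded the request past MAXEXTLEN, trim it back. */
	if (ralen * rextsize >= maxextlen)
		ralen = maxextlen / rextsize;

	printf("ralen = %llu rtextents\n", (unsigned long long)ralen);
	return 0;
}
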
@@ -2503,7 +2361,7 @@ xfs_bmap_btalloc_nullfb(
2503 * AG as the stream may have moved. 2361 * AG as the stream may have moved.
2504 */ 2362 */
2505 if (xfs_inode_is_filestream(ap->ip)) 2363 if (xfs_inode_is_filestream(ap->ip))
2506 ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); 2364 ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2507 2365
2508 return 0; 2366 return 0;
2509} 2367}
@@ -2528,52 +2386,52 @@ xfs_bmap_btalloc(
2528 mp = ap->ip->i_mount; 2386 mp = ap->ip->i_mount;
2529 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; 2387 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
2530 if (unlikely(align)) { 2388 if (unlikely(align)) {
2531 error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, 2389 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
2532 align, 0, ap->eof, 0, ap->conv, 2390 align, 0, ap->eof, 0, ap->conv,
2533 &ap->off, &ap->alen); 2391 &ap->offset, &ap->length);
2534 ASSERT(!error); 2392 ASSERT(!error);
2535 ASSERT(ap->alen); 2393 ASSERT(ap->length);
2536 } 2394 }
2537 nullfb = ap->firstblock == NULLFSBLOCK; 2395 nullfb = *ap->firstblock == NULLFSBLOCK;
2538 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2396 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
2539 if (nullfb) { 2397 if (nullfb) {
2540 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { 2398 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
2541 ag = xfs_filestream_lookup_ag(ap->ip); 2399 ag = xfs_filestream_lookup_ag(ap->ip);
2542 ag = (ag != NULLAGNUMBER) ? ag : 0; 2400 ag = (ag != NULLAGNUMBER) ? ag : 0;
2543 ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); 2401 ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
2544 } else { 2402 } else {
2545 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); 2403 ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
2546 } 2404 }
2547 } else 2405 } else
2548 ap->rval = ap->firstblock; 2406 ap->blkno = *ap->firstblock;
2549 2407
2550 xfs_bmap_adjacent(ap); 2408 xfs_bmap_adjacent(ap);
2551 2409
2552 /* 2410 /*
2553 * If allowed, use ap->rval; otherwise must use firstblock since 2411 * If allowed, use ap->blkno; otherwise must use firstblock since
2554 * it's in the right allocation group. 2412 * it's in the right allocation group.
2555 */ 2413 */
2556 if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) 2414 if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
2557 ; 2415 ;
2558 else 2416 else
2559 ap->rval = ap->firstblock; 2417 ap->blkno = *ap->firstblock;
2560 /* 2418 /*
2561 * Normal allocation, done through xfs_alloc_vextent. 2419 * Normal allocation, done through xfs_alloc_vextent.
2562 */ 2420 */
2563 tryagain = isaligned = 0; 2421 tryagain = isaligned = 0;
2564 args.tp = ap->tp; 2422 args.tp = ap->tp;
2565 args.mp = mp; 2423 args.mp = mp;
2566 args.fsbno = ap->rval; 2424 args.fsbno = ap->blkno;
2567 2425
2568 /* Trim the allocation back to the maximum an AG can fit. */ 2426 /* Trim the allocation back to the maximum an AG can fit. */
2569 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); 2427 args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
2570 args.firstblock = ap->firstblock; 2428 args.firstblock = *ap->firstblock;
2571 blen = 0; 2429 blen = 0;
2572 if (nullfb) { 2430 if (nullfb) {
2573 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); 2431 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
2574 if (error) 2432 if (error)
2575 return error; 2433 return error;
2576 } else if (ap->low) { 2434 } else if (ap->flist->xbf_low) {
2577 if (xfs_inode_is_filestream(ap->ip)) 2435 if (xfs_inode_is_filestream(ap->ip))
2578 args.type = XFS_ALLOCTYPE_FIRST_AG; 2436 args.type = XFS_ALLOCTYPE_FIRST_AG;
2579 else 2437 else
@@ -2587,14 +2445,14 @@ xfs_bmap_btalloc(
2587 /* apply extent size hints if obtained earlier */ 2445 /* apply extent size hints if obtained earlier */
2588 if (unlikely(align)) { 2446 if (unlikely(align)) {
2589 args.prod = align; 2447 args.prod = align;
2590 if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) 2448 if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
2591 args.mod = (xfs_extlen_t)(args.prod - args.mod); 2449 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2592 } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { 2450 } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
2593 args.prod = 1; 2451 args.prod = 1;
2594 args.mod = 0; 2452 args.mod = 0;
2595 } else { 2453 } else {
2596 args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; 2454 args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
2597 if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) 2455 if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
2598 args.mod = (xfs_extlen_t)(args.prod - args.mod); 2456 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2599 } 2457 }
2600 /* 2458 /*
@@ -2606,8 +2464,8 @@ xfs_bmap_btalloc(
2606 * is >= the stripe unit and the allocation offset is 2464 * is >= the stripe unit and the allocation offset is
2607 * at the end of file. 2465 * at the end of file.
2608 */ 2466 */
2609 if (!ap->low && ap->aeof) { 2467 if (!ap->flist->xbf_low && ap->aeof) {
2610 if (!ap->off) { 2468 if (!ap->offset) {
2611 args.alignment = mp->m_dalign; 2469 args.alignment = mp->m_dalign;
2612 atype = args.type; 2470 atype = args.type;
2613 isaligned = 1; 2471 isaligned = 1;
@@ -2660,7 +2518,7 @@ xfs_bmap_btalloc(
2660 * turned on. 2518 * turned on.
2661 */ 2519 */
2662 args.type = atype; 2520 args.type = atype;
2663 args.fsbno = ap->rval; 2521 args.fsbno = ap->blkno;
2664 args.alignment = mp->m_dalign; 2522 args.alignment = mp->m_dalign;
2665 args.minlen = nextminlen; 2523 args.minlen = nextminlen;
2666 args.minalignslop = 0; 2524 args.minalignslop = 0;
@@ -2674,7 +2532,7 @@ xfs_bmap_btalloc(
2674 * try again. 2532 * try again.
2675 */ 2533 */
2676 args.type = atype; 2534 args.type = atype;
2677 args.fsbno = ap->rval; 2535 args.fsbno = ap->blkno;
2678 args.alignment = 0; 2536 args.alignment = 0;
2679 if ((error = xfs_alloc_vextent(&args))) 2537 if ((error = xfs_alloc_vextent(&args)))
2680 return error; 2538 return error;
@@ -2683,7 +2541,7 @@ xfs_bmap_btalloc(
2683 args.minlen > ap->minlen) { 2541 args.minlen > ap->minlen) {
2684 args.minlen = ap->minlen; 2542 args.minlen = ap->minlen;
2685 args.type = XFS_ALLOCTYPE_START_BNO; 2543 args.type = XFS_ALLOCTYPE_START_BNO;
2686 args.fsbno = ap->rval; 2544 args.fsbno = ap->blkno;
2687 if ((error = xfs_alloc_vextent(&args))) 2545 if ((error = xfs_alloc_vextent(&args)))
2688 return error; 2546 return error;
2689 } 2547 }
@@ -2694,13 +2552,26 @@ xfs_bmap_btalloc(
2694 args.minleft = 0; 2552 args.minleft = 0;
2695 if ((error = xfs_alloc_vextent(&args))) 2553 if ((error = xfs_alloc_vextent(&args)))
2696 return error; 2554 return error;
2697 ap->low = 1; 2555 ap->flist->xbf_low = 1;
2698 } 2556 }
2699 if (args.fsbno != NULLFSBLOCK) { 2557 if (args.fsbno != NULLFSBLOCK) {
2700 ap->firstblock = ap->rval = args.fsbno; 2558 /*
2559 * check the allocation happened at the same or higher AG than
2560 * the first block that was allocated.
2561 */
2562 ASSERT(*ap->firstblock == NULLFSBLOCK ||
2563 XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
2564 XFS_FSB_TO_AGNO(mp, args.fsbno) ||
2565 (ap->flist->xbf_low &&
2566 XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
2567 XFS_FSB_TO_AGNO(mp, args.fsbno)));
2568
2569 ap->blkno = args.fsbno;
2570 if (*ap->firstblock == NULLFSBLOCK)
2571 *ap->firstblock = args.fsbno;
2701 ASSERT(nullfb || fb_agno == args.agno || 2572 ASSERT(nullfb || fb_agno == args.agno ||
2702 (ap->low && fb_agno < args.agno)); 2573 (ap->flist->xbf_low && fb_agno < args.agno));
2703 ap->alen = args.len; 2574 ap->length = args.len;
2704 ap->ip->i_d.di_nblocks += args.len; 2575 ap->ip->i_d.di_nblocks += args.len;
2705 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); 2576 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
2706 if (ap->wasdel) 2577 if (ap->wasdel)
@@ -2714,8 +2585,8 @@ xfs_bmap_btalloc(
2714 XFS_TRANS_DQ_BCOUNT, 2585 XFS_TRANS_DQ_BCOUNT,
2715 (long) args.len); 2586 (long) args.len);
2716 } else { 2587 } else {
2717 ap->rval = NULLFSBLOCK; 2588 ap->blkno = NULLFSBLOCK;
2718 ap->alen = 0; 2589 ap->length = 0;
2719 } 2590 }
2720 return 0; 2591 return 0;
2721} 2592}
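
xfs_bmap_btalloc() is essentially a ladder of progressively relaxed allocation attempts: stripe-aligned near the chosen block, then unaligned, then with a smaller minlen, and finally anywhere at all, which flips the xbf_low flag. A hypothetical sketch of that control flow, with try_alloc() standing in for xfs_alloc_vextent():

#include <stdbool.h>
#include <stdio.h>

struct args { unsigned alignment; unsigned minlen; bool any_ag; };

static bool try_alloc(const struct args *a)
{
	/* Pretend only the fully relaxed attempt finds space. */
	return a->alignment == 0 && a->minlen == 1 && a->any_ag;
}

static bool alloc_with_fallbacks(struct args *a, bool *low)
{
	if (try_alloc(a))
		return true;
	a->alignment = 0;		/* retry without stripe alignment */
	if (try_alloc(a))
		return true;
	a->minlen = 1;			/* accept a shorter extent */
	if (try_alloc(a))
		return true;
	a->any_ag = true;		/* last ditch: any AG */
	*low = true;			/* corresponds to xbf_low = 1 */
	return try_alloc(a);
}

int main(void)
{
	struct args a = { .alignment = 8, .minlen = 32, .any_ag = false };
	bool low = false;

	printf("%s (low=%d)\n",
	       alloc_with_fallbacks(&a, &low) ? "allocated" : "ENOSPC", low);
	return 0;
}
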
@@ -3383,8 +3254,7 @@ xfs_bmap_local_to_extents(
3383 ASSERT(args.len == 1); 3254 ASSERT(args.len == 1);
3384 *firstblock = args.fsbno; 3255 *firstblock = args.fsbno;
3385 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 3256 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
3386 memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, 3257 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
3387 ifp->if_bytes);
3388 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); 3258 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3389 xfs_bmap_forkoff_reset(args.mp, ip, whichfork); 3259 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
3390 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); 3260 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
@@ -3590,7 +3460,7 @@ xfs_bmap_add_attrfork(
3590 } 3460 }
3591 ASSERT(ip->i_d.di_anextents == 0); 3461 ASSERT(ip->i_d.di_anextents == 0);
3592 3462
3593 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 3463 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3594 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 3464 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3595 3465
3596 switch (ip->i_d.di_format) { 3466 switch (ip->i_d.di_format) {
@@ -3783,19 +3653,11 @@ xfs_bmap_compute_maxlevels(
3783 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi 3653 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
3784 * caller. Frees all the extents that need freeing, which must be done 3654 * caller. Frees all the extents that need freeing, which must be done
3785 * last due to locking considerations. We never free any extents in 3655 * last due to locking considerations. We never free any extents in
3786 * the first transaction. This is to allow the caller to make the first 3656 * the first transaction.
3787 * transaction a synchronous one so that the pointers to the data being
3788 * broken in this transaction will be permanent before the data is actually
3789 * freed. This is necessary to prevent blocks from being reallocated
3790 * and written to before the free and reallocation are actually permanent.
3791 * We do not just make the first transaction synchronous here, because
3792 * there are more efficient ways to gain the same protection in some cases
3793 * (see the file truncation code).
3794 * 3657 *
 3795 * Returns 1 in the committed parameter if the given transaction 3658 * Returns 1 in the committed parameter if the given transaction
 3796 * was committed and a new one started, and 0 otherwise. 3659 * was committed and a new one started, and 0 otherwise.
3797 */ 3660 */
3798/*ARGSUSED*/
3799int /* error */ 3661int /* error */
3800xfs_bmap_finish( 3662xfs_bmap_finish(
3801 xfs_trans_t **tp, /* transaction pointer addr */ 3663 xfs_trans_t **tp, /* transaction pointer addr */
@@ -3995,42 +3857,122 @@ xfs_bmap_last_before(
3995 return 0; 3857 return 0;
3996} 3858}
3997 3859
3860STATIC int
3861xfs_bmap_last_extent(
3862 struct xfs_trans *tp,
3863 struct xfs_inode *ip,
3864 int whichfork,
3865 struct xfs_bmbt_irec *rec,
3866 int *is_empty)
3867{
3868 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
3869 int error;
3870 int nextents;
3871
3872 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
3873 error = xfs_iread_extents(tp, ip, whichfork);
3874 if (error)
3875 return error;
3876 }
3877
3878 nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
3879 if (nextents == 0) {
3880 *is_empty = 1;
3881 return 0;
3882 }
3883
3884 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
3885 *is_empty = 0;
3886 return 0;
3887}
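
The new helper centralises the "read in extents if needed, then look at the final record" pattern. Modulo the on-demand xfs_iread_extents() call, it reduces to indexing the last element of the in-core extent array, and xfs_bmap_last_offset() below builds directly on it. A simplified standalone version with stand-in types:

#include <stddef.h>
#include <stdio.h>

struct irec { unsigned long long startoff, startblock, blockcount; };

static int last_extent(const struct irec *recs, size_t nextents,
		       struct irec *rec, int *is_empty)
{
	if (nextents == 0) {
		*is_empty = 1;
		return 0;
	}
	*rec = recs[nextents - 1];
	*is_empty = 0;
	return 0;
}

int main(void)
{
	struct irec fork[] = { { 0, 100, 8 }, { 8, 200, 4 } };
	struct irec last;
	int empty;

	last_extent(fork, 2, &last, &empty);
	/* First block past EOF = startoff + blockcount, as in
	 * xfs_bmap_last_offset(). */
	printf("last_block = %llu\n", last.startoff + last.blockcount);
	return 0;
}
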
3888
3889/*
3890 * Check the last inode extent to determine whether this allocation will result
3891 * in blocks being allocated at the end of the file. When we allocate new data
3892 * blocks at the end of the file which do not start at the previous data block,
3893 * we will try to align the new blocks at stripe unit boundaries.
3894 *
 3895 * Returns 0 in bma->aeof if the file (fork) is empty, as any new write
 3896 * will be at or past the EOF.
3897 */
3898STATIC int
3899xfs_bmap_isaeof(
3900 struct xfs_bmalloca *bma,
3901 int whichfork)
3902{
3903 struct xfs_bmbt_irec rec;
3904 int is_empty;
3905 int error;
3906
3907 bma->aeof = 0;
3908 error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
3909 &is_empty);
3910 if (error || is_empty)
3911 return error;
3912
3913 /*
 3914 * Check if we are allocating at or past the last extent, or at least into
3915 * the last delayed allocated extent.
3916 */
3917 bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
3918 (bma->offset >= rec.br_startoff &&
3919 isnullstartblock(rec.br_startblock));
3920 return 0;
3921}
3922
3923/*
3924 * Check if the endoff is outside the last extent. If so the caller will grow
3925 * the allocation to a stripe unit boundary. All offsets are considered outside
3926 * the end of file for an empty fork, so 1 is returned in *eof in that case.
3927 */
3928int
3929xfs_bmap_eof(
3930 struct xfs_inode *ip,
3931 xfs_fileoff_t endoff,
3932 int whichfork,
3933 int *eof)
3934{
3935 struct xfs_bmbt_irec rec;
3936 int error;
3937
3938 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
3939 if (error || *eof)
3940 return error;
3941
3942 *eof = endoff >= rec.br_startoff + rec.br_blockcount;
3943 return 0;
3944}
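
Both EOF predicates are small range tests over that last record. A sketch with a boolean delalloc flag standing in for isnullstartblock():

#include <stdbool.h>
#include <stdio.h>

struct irec { unsigned long long startoff, blockcount; bool delalloc; };

/* xfs_bmap_isaeof(): allocating at/past the last extent, or inside a
 * trailing delayed-allocation extent. */
static bool at_eof(unsigned long long offset, const struct irec *last)
{
	return offset >= last->startoff + last->blockcount ||
	       (offset >= last->startoff && last->delalloc);
}

/* xfs_bmap_eof(): does endoff land beyond the last extent? */
static bool past_eof(unsigned long long endoff, const struct irec *last)
{
	return endoff >= last->startoff + last->blockcount;
}

int main(void)
{
	struct irec last = { .startoff = 100, .blockcount = 10,
			     .delalloc = true };
	printf("%d %d\n", at_eof(105, &last), past_eof(105, &last));
	return 0;
}
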
3945
3998/* 3946/*
3999 * Returns the file-relative block number of the first block past eof in 3947 * Returns the file-relative block number of the first block past eof in
4000 * the file. This is not based on i_size, it is based on the extent records. 3948 * the file. This is not based on i_size, it is based on the extent records.
4001 * Returns 0 for local files, as they do not have extent records. 3949 * Returns 0 for local files, as they do not have extent records.
4002 */ 3950 */
4003int /* error */ 3951int
4004xfs_bmap_last_offset( 3952xfs_bmap_last_offset(
4005 xfs_trans_t *tp, /* transaction pointer */ 3953 struct xfs_trans *tp,
4006 xfs_inode_t *ip, /* incore inode */ 3954 struct xfs_inode *ip,
4007 xfs_fileoff_t *last_block, /* last block */ 3955 xfs_fileoff_t *last_block,
4008 int whichfork) /* data or attr fork */ 3956 int whichfork)
4009{ 3957{
4010 xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ 3958 struct xfs_bmbt_irec rec;
4011 int error; /* error return value */ 3959 int is_empty;
4012 xfs_ifork_t *ifp; /* inode fork pointer */ 3960 int error;
4013 xfs_extnum_t nextents; /* number of extent entries */ 3961
3962 *last_block = 0;
3963
3964 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
3965 return 0;
4014 3966
4015 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 3967 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4016 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 3968 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
4017 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
4018 return XFS_ERROR(EIO); 3969 return XFS_ERROR(EIO);
4019 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 3970
4020 *last_block = 0; 3971 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
4021 return 0; 3972 if (error || is_empty)
4022 }
4023 ifp = XFS_IFORK_PTR(ip, whichfork);
4024 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4025 (error = xfs_iread_extents(tp, ip, whichfork)))
4026 return error; 3973 return error;
4027 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3974
4028 if (!nextents) { 3975 *last_block = rec.br_startoff + rec.br_blockcount;
4029 *last_block = 0;
4030 return 0;
4031 }
4032 ep = xfs_iext_get_ext(ifp, nextents - 1);
4033 *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
4034 return 0; 3976 return 0;
4035} 3977}
4036 3978
@@ -4160,7 +4102,6 @@ xfs_bmap_read_extents(
4160 xfs_extnum_t num_recs; 4102 xfs_extnum_t num_recs;
4161 xfs_extnum_t start; 4103 xfs_extnum_t start;
4162 4104
4163
4164 num_recs = xfs_btree_get_numrecs(block); 4105 num_recs = xfs_btree_get_numrecs(block);
4165 if (unlikely(i + num_recs > room)) { 4106 if (unlikely(i + num_recs > room)) {
4166 ASSERT(i + num_recs <= room); 4107 ASSERT(i + num_recs <= room);
@@ -4283,9 +4224,8 @@ xfs_bmap_validate_ret(
4283 ASSERT(i == 0 || 4224 ASSERT(i == 0 ||
4284 mval[i - 1].br_startoff + mval[i - 1].br_blockcount == 4225 mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
4285 mval[i].br_startoff); 4226 mval[i].br_startoff);
4286 if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) 4227 ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
4287 ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && 4228 mval[i].br_startblock != HOLESTARTBLOCK);
4288 mval[i].br_startblock != HOLESTARTBLOCK);
4289 ASSERT(mval[i].br_state == XFS_EXT_NORM || 4229 ASSERT(mval[i].br_state == XFS_EXT_NORM ||
4290 mval[i].br_state == XFS_EXT_UNWRITTEN); 4230 mval[i].br_state == XFS_EXT_UNWRITTEN);
4291 } 4231 }
@@ -4294,66 +4234,609 @@ xfs_bmap_validate_ret(
4294 4234
4295 4235
4296/* 4236/*
4297 * Map file blocks to filesystem blocks. 4237 * Trim the returned map to the required bounds
4298 * File range is given by the bno/len pair. 4238 */
4299 * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) 4239STATIC void
4300 * into a hole or past eof. 4240xfs_bmapi_trim_map(
4301 * Only allocates blocks from a single allocation group, 4241 struct xfs_bmbt_irec *mval,
4302 * to avoid locking problems. 4242 struct xfs_bmbt_irec *got,
4243 xfs_fileoff_t *bno,
4244 xfs_filblks_t len,
4245 xfs_fileoff_t obno,
4246 xfs_fileoff_t end,
4247 int n,
4248 int flags)
4249{
4250 if ((flags & XFS_BMAPI_ENTIRE) ||
4251 got->br_startoff + got->br_blockcount <= obno) {
4252 *mval = *got;
4253 if (isnullstartblock(got->br_startblock))
4254 mval->br_startblock = DELAYSTARTBLOCK;
4255 return;
4256 }
4257
4258 if (obno > *bno)
4259 *bno = obno;
4260 ASSERT((*bno >= obno) || (n == 0));
4261 ASSERT(*bno < end);
4262 mval->br_startoff = *bno;
4263 if (isnullstartblock(got->br_startblock))
4264 mval->br_startblock = DELAYSTARTBLOCK;
4265 else
4266 mval->br_startblock = got->br_startblock +
4267 (*bno - got->br_startoff);
4268 /*
 4269 * Return the minimum of what we got and what we asked for as the
 4270 * length. We can use the len variable here because it is
4271 * modified below and we could have been there before coming
4272 * here if the first part of the allocation didn't overlap what
4273 * was asked for.
4274 */
4275 mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
4276 got->br_blockcount - (*bno - got->br_startoff));
4277 mval->br_state = got->br_state;
4278 ASSERT(mval->br_blockcount <= len);
4279 return;
4280}
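
Stripped of the delalloc and XFS_BMAPI_ENTIRE special cases, the trim step clips the found extent to the requested range and shifts the start block by the same amount. A standalone model, where DELAY is a stand-in for DELAYSTARTBLOCK:

#include <stdio.h>

#define DELAY (~0ULL)

struct irec { unsigned long long startoff, startblock, blockcount; };

static unsigned long long min64(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

static struct irec trim(const struct irec *got, unsigned long long bno,
			unsigned long long end, int delalloc)
{
	struct irec m;

	if (bno < got->startoff)	/* caller ensures bno is inside got */
		bno = got->startoff;
	m.startoff = bno;
	m.startblock = delalloc ? DELAY :
		       got->startblock + (bno - got->startoff);
	m.blockcount = min64(end - bno,
			     got->blockcount - (bno - got->startoff));
	return m;
}

int main(void)
{
	struct irec got = { 10, 1000, 20 };
	struct irec m = trim(&got, 15, 25, 0);

	printf("off=%llu blk=%llu len=%llu\n",
	       m.startoff, m.startblock, m.blockcount);
	return 0;
}
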
4281
4282/*
4283 * Update and validate the extent map to return
4284 */
4285STATIC void
4286xfs_bmapi_update_map(
4287 struct xfs_bmbt_irec **map,
4288 xfs_fileoff_t *bno,
4289 xfs_filblks_t *len,
4290 xfs_fileoff_t obno,
4291 xfs_fileoff_t end,
4292 int *n,
4293 int flags)
4294{
4295 xfs_bmbt_irec_t *mval = *map;
4296
4297 ASSERT((flags & XFS_BMAPI_ENTIRE) ||
4298 ((mval->br_startoff + mval->br_blockcount) <= end));
4299 ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
4300 (mval->br_startoff < obno));
4301
4302 *bno = mval->br_startoff + mval->br_blockcount;
4303 *len = end - *bno;
4304 if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
4305 /* update previous map with new information */
4306 ASSERT(mval->br_startblock == mval[-1].br_startblock);
4307 ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
4308 ASSERT(mval->br_state == mval[-1].br_state);
4309 mval[-1].br_blockcount = mval->br_blockcount;
4310 mval[-1].br_state = mval->br_state;
4311 } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
4312 mval[-1].br_startblock != DELAYSTARTBLOCK &&
4313 mval[-1].br_startblock != HOLESTARTBLOCK &&
4314 mval->br_startblock == mval[-1].br_startblock +
4315 mval[-1].br_blockcount &&
4316 ((flags & XFS_BMAPI_IGSTATE) ||
4317 mval[-1].br_state == mval->br_state)) {
4318 ASSERT(mval->br_startoff ==
4319 mval[-1].br_startoff + mval[-1].br_blockcount);
4320 mval[-1].br_blockcount += mval->br_blockcount;
4321 } else if (*n > 0 &&
4322 mval->br_startblock == DELAYSTARTBLOCK &&
4323 mval[-1].br_startblock == DELAYSTARTBLOCK &&
4324 mval->br_startoff ==
4325 mval[-1].br_startoff + mval[-1].br_blockcount) {
4326 mval[-1].br_blockcount += mval->br_blockcount;
4327 mval[-1].br_state = mval->br_state;
4328 } else if (!((*n == 0) &&
4329 ((mval->br_startoff + mval->br_blockcount) <=
4330 obno))) {
4331 mval++;
4332 (*n)++;
4333 }
4334 *map = mval;
4335}
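
The interesting part of the update step is the merge logic: a new mapping that continues the previous one, either on disk or as adjacent delalloc ranges, extends mval[-1] instead of consuming another slot. A simplified model that omits the same-startoff and XFS_BMAPI_IGSTATE cases handled above:

#include <stdbool.h>
#include <stdio.h>

#define DELAY (~0ULL)

struct irec { unsigned long long startoff, startblock, blockcount; };

static bool mergeable(const struct irec *prev, const struct irec *cur)
{
	if (cur->startoff != prev->startoff + prev->blockcount)
		return false;
	if (prev->startblock == DELAY && cur->startblock == DELAY)
		return true;		/* adjacent delalloc ranges */
	return prev->startblock != DELAY && cur->startblock != DELAY &&
	       cur->startblock == prev->startblock + prev->blockcount;
}

int main(void)
{
	struct irec map[4] = { { 0, 100, 8 } };
	struct irec next = { 8, 108, 4 };
	int n = 1;

	if (mergeable(&map[n - 1], &next))
		map[n - 1].blockcount += next.blockcount;
	else
		map[n++] = next;

	printf("n=%d len0=%llu\n", n, map[0].blockcount);
	return 0;
}
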
4336
4337/*
4338 * Map file blocks to filesystem blocks without allocation.
4339 */
4340int
4341xfs_bmapi_read(
4342 struct xfs_inode *ip,
4343 xfs_fileoff_t bno,
4344 xfs_filblks_t len,
4345 struct xfs_bmbt_irec *mval,
4346 int *nmap,
4347 int flags)
4348{
4349 struct xfs_mount *mp = ip->i_mount;
4350 struct xfs_ifork *ifp;
4351 struct xfs_bmbt_irec got;
4352 struct xfs_bmbt_irec prev;
4353 xfs_fileoff_t obno;
4354 xfs_fileoff_t end;
4355 xfs_extnum_t lastx;
4356 int error;
4357 int eof;
4358 int n = 0;
4359 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4360 XFS_ATTR_FORK : XFS_DATA_FORK;
4361
4362 ASSERT(*nmap >= 1);
4363 ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
4364 XFS_BMAPI_IGSTATE)));
4365
4366 if (unlikely(XFS_TEST_ERROR(
4367 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4368 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4369 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4370 XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
4371 return XFS_ERROR(EFSCORRUPTED);
4372 }
4373
4374 if (XFS_FORCED_SHUTDOWN(mp))
4375 return XFS_ERROR(EIO);
4376
4377 XFS_STATS_INC(xs_blk_mapr);
4378
4379 ifp = XFS_IFORK_PTR(ip, whichfork);
4380 ASSERT(ifp->if_ext_max ==
4381 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4382
4383 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4384 error = xfs_iread_extents(NULL, ip, whichfork);
4385 if (error)
4386 return error;
4387 }
4388
4389 xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
4390 end = bno + len;
4391 obno = bno;
4392
4393 while (bno < end && n < *nmap) {
4394 /* Reading past eof, act as though there's a hole up to end. */
4395 if (eof)
4396 got.br_startoff = end;
4397 if (got.br_startoff > bno) {
4398 /* Reading in a hole. */
4399 mval->br_startoff = bno;
4400 mval->br_startblock = HOLESTARTBLOCK;
4401 mval->br_blockcount =
4402 XFS_FILBLKS_MIN(len, got.br_startoff - bno);
4403 mval->br_state = XFS_EXT_NORM;
4404 bno += mval->br_blockcount;
4405 len -= mval->br_blockcount;
4406 mval++;
4407 n++;
4408 continue;
4409 }
4410
4411 /* set up the extent map to return. */
4412 xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
4413 xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
4414
4415 /* If we're done, stop now. */
4416 if (bno >= end || n >= *nmap)
4417 break;
4418
4419 /* Else go on to the next record. */
4420 if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
4421 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
4422 else
4423 eof = 1;
4424 }
4425 *nmap = n;
4426 return 0;
4427}
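
The read path therefore never allocates: gaps between records, and the region past the last record, come back as synthetic hole mappings. A userspace model of that loop over a sorted extent list, with HOLE standing in for HOLESTARTBLOCK:

#include <stdio.h>

#define HOLE (~0ULL - 1)

struct irec { unsigned long long startoff, startblock, blockcount; };

static unsigned long long min64(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

int main(void)
{
	struct irec fork[] = { { 4, 100, 4 }, { 16, 200, 8 } };
	size_t nextents = 2, i = 0;
	unsigned long long bno = 0, end = 30;

	while (bno < end) {
		unsigned long long next =
			(i < nextents) ? fork[i].startoff : end;

		if (bno < next) {	/* in a hole (possibly past EOF) */
			printf("hole  off=%llu len=%llu\n", bno, next - bno);
			bno = next;
		} else {		/* inside extent i */
			unsigned long long skip = bno - fork[i].startoff;
			unsigned long long len =
				min64(end - bno, fork[i].blockcount - skip);

			printf("blk=%llu off=%llu len=%llu\n",
			       fork[i].startblock + skip, bno, len);
			bno += len;
			i++;
		}
	}
	return 0;
}
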
4428
4429STATIC int
4430xfs_bmapi_reserve_delalloc(
4431 struct xfs_inode *ip,
4432 xfs_fileoff_t aoff,
4433 xfs_filblks_t len,
4434 struct xfs_bmbt_irec *got,
4435 struct xfs_bmbt_irec *prev,
4436 xfs_extnum_t *lastx,
4437 int eof)
4438{
4439 struct xfs_mount *mp = ip->i_mount;
4440 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
4441 xfs_extlen_t alen;
4442 xfs_extlen_t indlen;
4443 char rt = XFS_IS_REALTIME_INODE(ip);
4444 xfs_extlen_t extsz;
4445 int error;
4446
4447 alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
4448 if (!eof)
4449 alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
4450
4451 /* Figure out the extent size, adjust alen */
4452 extsz = xfs_get_extsz_hint(ip);
4453 if (extsz) {
4454 /*
4455 * Make sure we don't exceed a single extent length when we
 4456 * align the extent, by reducing the length we are going to
 4457 * allocate by the maximum amount extent size alignment may
4458 * require.
4459 */
4460 alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
4461 error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
4462 1, 0, &aoff, &alen);
4463 ASSERT(!error);
4464 }
4465
4466 if (rt)
4467 extsz = alen / mp->m_sb.sb_rextsize;
4468
4469 /*
4470 * Make a transaction-less quota reservation for delayed allocation
4471 * blocks. This number gets adjusted later. We return if we haven't
4472 * allocated blocks already inside this loop.
4473 */
4474 error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
4475 rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4476 if (error)
4477 return error;
4478
4479 /*
4480 * Split changing sb for alen and indlen since they could be coming
4481 * from different places.
4482 */
4483 indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
4484 ASSERT(indlen > 0);
4485
4486 if (rt) {
4487 error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
4488 -((int64_t)extsz), 0);
4489 } else {
4490 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
4491 -((int64_t)alen), 0);
4492 }
4493
4494 if (error)
4495 goto out_unreserve_quota;
4496
4497 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
4498 -((int64_t)indlen), 0);
4499 if (error)
4500 goto out_unreserve_blocks;
4501
4502
4503 ip->i_delayed_blks += alen;
4504
4505 got->br_startoff = aoff;
4506 got->br_startblock = nullstartblock(indlen);
4507 got->br_blockcount = alen;
4508 got->br_state = XFS_EXT_NORM;
4509 xfs_bmap_add_extent_hole_delay(ip, lastx, got);
4510
4511 /*
4512 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
4513 * might have merged it into one of the neighbouring ones.
4514 */
4515 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
4516
4517 ASSERT(got->br_startoff <= aoff);
4518 ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
4519 ASSERT(isnullstartblock(got->br_startblock));
4520 ASSERT(got->br_state == XFS_EXT_NORM);
4521 return 0;
4522
4523out_unreserve_blocks:
4524 if (rt)
4525 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
4526 else
4527 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
4528out_unreserve_quota:
4529 if (XFS_IS_QUOTA_ON(mp))
4530 xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ?
4531 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4532 return error;
4533}
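
The reservation sequence above — quota, then data blocks, then worst-case indirect blocks — unwinds in reverse order on failure via the two goto labels. The shape of that pattern, with hypothetical reserve_*/unreserve_* stand-ins for the quota and superblock-counter calls:

#include <stdio.h>

static int reserve_quota(long n)    { (void)n; return 0; }
static int reserve_data(long n)     { (void)n; return 0; }
static int reserve_indirect(long n) { (void)n; return -1; /* fail */ }
static void unreserve_quota(long n) { (void)n; puts("undo quota"); }
static void unreserve_data(long n)  { (void)n; puts("undo data"); }

static int reserve_delalloc(long alen, long indlen)
{
	int error;

	error = reserve_quota(alen);
	if (error)
		return error;
	error = reserve_data(alen);
	if (error)
		goto out_unreserve_quota;
	error = reserve_indirect(indlen);
	if (error)
		goto out_unreserve_data;
	return 0;

out_unreserve_data:
	unreserve_data(alen);
out_unreserve_quota:
	unreserve_quota(alen);
	return error;
}

int main(void)
{
	printf("error = %d\n", reserve_delalloc(8, 2));
	return 0;
}
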
4534
4535/*
4536 * Map file blocks to filesystem blocks, adding delayed allocations as needed.
4537 */
4538int
4539xfs_bmapi_delay(
4540 struct xfs_inode *ip, /* incore inode */
4541 xfs_fileoff_t bno, /* starting file offs. mapped */
4542 xfs_filblks_t len, /* length to map in file */
4543 struct xfs_bmbt_irec *mval, /* output: map values */
4544 int *nmap, /* i/o: mval size/count */
4545 int flags) /* XFS_BMAPI_... */
4546{
4547 struct xfs_mount *mp = ip->i_mount;
4548 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
4549 struct xfs_bmbt_irec got; /* current file extent record */
4550 struct xfs_bmbt_irec prev; /* previous file extent record */
4551 xfs_fileoff_t obno; /* old block number (offset) */
4552 xfs_fileoff_t end; /* end of mapped file region */
4553 xfs_extnum_t lastx; /* last useful extent number */
4554 int eof; /* we've hit the end of extents */
4555 int n = 0; /* current extent index */
4556 int error = 0;
4557
4558 ASSERT(*nmap >= 1);
4559 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4560 ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
4561
4562 if (unlikely(XFS_TEST_ERROR(
4563 (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
4564 XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
4565 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4566 XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
4567 return XFS_ERROR(EFSCORRUPTED);
4568 }
4569
4570 if (XFS_FORCED_SHUTDOWN(mp))
4571 return XFS_ERROR(EIO);
4572
4573 XFS_STATS_INC(xs_blk_mapw);
4574
4575 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4576 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
4577 if (error)
4578 return error;
4579 }
4580
4581 xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
4582 end = bno + len;
4583 obno = bno;
4584
4585 while (bno < end && n < *nmap) {
4586 if (eof || got.br_startoff > bno) {
4587 error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
4588 &prev, &lastx, eof);
4589 if (error) {
4590 if (n == 0) {
4591 *nmap = 0;
4592 return error;
4593 }
4594 break;
4595 }
4596 }
4597
4598 /* set up the extent map to return. */
4599 xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
4600 xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
4601
4602 /* If we're done, stop now. */
4603 if (bno >= end || n >= *nmap)
4604 break;
4605
4606 /* Else go on to the next record. */
4607 prev = got;
4608 if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
4609 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
4610 else
4611 eof = 1;
4612 }
4613
4614 *nmap = n;
4615 return 0;
4616}
4617
4618
4619STATIC int
4620xfs_bmapi_allocate(
4621 struct xfs_bmalloca *bma,
4622 int flags)
4623{
4624 struct xfs_mount *mp = bma->ip->i_mount;
4625 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4626 XFS_ATTR_FORK : XFS_DATA_FORK;
4627 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4628 int tmp_logflags = 0;
4629 int error;
4630 int rt;
4631
4632 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
4633
4634 /*
4635 * For the wasdelay case, we could also just allocate the stuff asked
4636 * for in this bmap call but that wouldn't be as good.
4637 */
4638 if (bma->wasdel) {
4639 bma->length = (xfs_extlen_t)bma->got.br_blockcount;
4640 bma->offset = bma->got.br_startoff;
4641 if (bma->idx != NULLEXTNUM && bma->idx) {
4642 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
4643 &bma->prev);
4644 }
4645 } else {
4646 bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
4647 if (!bma->eof)
4648 bma->length = XFS_FILBLKS_MIN(bma->length,
4649 bma->got.br_startoff - bma->offset);
4650 }
4651
4652 /*
4653 * Indicate if this is the first user data in the file, or just any
4654 * user data.
4655 */
4656 if (!(flags & XFS_BMAPI_METADATA)) {
4657 bma->userdata = (bma->offset == 0) ?
4658 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
4659 }
4660
4661 bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
4662
4663 /*
 4664 * Only want to do the alignment at the eof if it is userdata and the
 4665 * allocation length is larger than a stripe unit.
4666 */
4667 if (mp->m_dalign && bma->length >= mp->m_dalign &&
4668 !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
4669 error = xfs_bmap_isaeof(bma, whichfork);
4670 if (error)
4671 return error;
4672 }
4673
4674 error = xfs_bmap_alloc(bma);
4675 if (error)
4676 return error;
4677
4678 if (bma->flist->xbf_low)
4679 bma->minleft = 0;
4680 if (bma->cur)
4681 bma->cur->bc_private.b.firstblock = *bma->firstblock;
4682 if (bma->blkno == NULLFSBLOCK)
4683 return 0;
4684 if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
4685 bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
4686 bma->cur->bc_private.b.firstblock = *bma->firstblock;
4687 bma->cur->bc_private.b.flist = bma->flist;
4688 }
4689 /*
4690 * Bump the number of extents we've allocated
4691 * in this call.
4692 */
4693 bma->nallocs++;
4694
4695 if (bma->cur)
4696 bma->cur->bc_private.b.flags =
4697 bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
4698
4699 bma->got.br_startoff = bma->offset;
4700 bma->got.br_startblock = bma->blkno;
4701 bma->got.br_blockcount = bma->length;
4702 bma->got.br_state = XFS_EXT_NORM;
4703
4704 /*
4705 * A wasdelay extent has been initialized, so shouldn't be flagged
4706 * as unwritten.
4707 */
4708 if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) &&
4709 xfs_sb_version_hasextflgbit(&mp->m_sb))
4710 bma->got.br_state = XFS_EXT_UNWRITTEN;
4711
4712 if (bma->wasdel)
4713 error = xfs_bmap_add_extent_delay_real(bma);
4714 else
4715 error = xfs_bmap_add_extent_hole_real(bma, whichfork);
4716
4717 bma->logflags |= tmp_logflags;
4718 if (error)
4719 return error;
4720
4721 /*
4722 * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
4723 * or xfs_bmap_add_extent_hole_real might have merged it into one of
4724 * the neighbouring ones.
4725 */
4726 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
4727
4728 ASSERT(bma->got.br_startoff <= bma->offset);
4729 ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
4730 bma->offset + bma->length);
4731 ASSERT(bma->got.br_state == XFS_EXT_NORM ||
4732 bma->got.br_state == XFS_EXT_UNWRITTEN);
4733 return 0;
4734}
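
The length selection at the top of this function is worth spelling out: converting a delayed allocation takes the whole delalloc extent, while a fresh allocation is capped at MAXEXTLEN and, when not at EOF, at the gap before the next extent. A sketch — the MAXEXTLEN value here is only illustrative:

#include <stdio.h>

#define MAXEXTLEN (1ULL << 21)	/* assumption: stand-in value */

static unsigned long long min64(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

static unsigned long long clamp_len(unsigned long long len, int wasdel,
				    int eof,
				    unsigned long long delayed_len,
				    unsigned long long gap_to_next)
{
	if (wasdel)
		return delayed_len;	/* convert the whole delalloc extent */
	len = min64(len, MAXEXTLEN);
	if (!eof)
		len = min64(len, gap_to_next);
	return len;
}

int main(void)
{
	printf("%llu\n", clamp_len(4096, 0, 0, 0, 1024)); /* prints 1024 */
	return 0;
}
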
4735
4736STATIC int
4737xfs_bmapi_convert_unwritten(
4738 struct xfs_bmalloca *bma,
4739 struct xfs_bmbt_irec *mval,
4740 xfs_filblks_t len,
4741 int flags)
4742{
4743 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4744 XFS_ATTR_FORK : XFS_DATA_FORK;
4745 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4746 int tmp_logflags = 0;
4747 int error;
4748
4749 /* check if we need to do unwritten->real conversion */
4750 if (mval->br_state == XFS_EXT_UNWRITTEN &&
4751 (flags & XFS_BMAPI_PREALLOC))
4752 return 0;
4753
4754 /* check if we need to do real->unwritten conversion */
4755 if (mval->br_state == XFS_EXT_NORM &&
4756 (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
4757 (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
4758 return 0;
4759
4760 /*
4761 * Modify (by adding) the state flag, if writing.
4762 */
4763 ASSERT(mval->br_blockcount <= len);
4764 if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
4765 bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
4766 bma->ip, whichfork);
4767 bma->cur->bc_private.b.firstblock = *bma->firstblock;
4768 bma->cur->bc_private.b.flist = bma->flist;
4769 }
4770 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4771 ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
4772
4773 error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
4774 &bma->cur, mval, bma->firstblock, bma->flist,
4775 &tmp_logflags);
4776 bma->logflags |= tmp_logflags;
4777 if (error)
4778 return error;
4779
4780 /*
4781 * Update our extent pointer, given that
4782 * xfs_bmap_add_extent_unwritten_real might have merged it into one
4783 * of the neighbouring ones.
4784 */
4785 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
4786
4787 /*
4788 * We may have combined previously unwritten space with written space,
4789 * so generate another request.
4790 */
4791 if (mval->br_blockcount < len)
4792 return EAGAIN;
4793 return 0;
4794}
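
The two early returns encode the conversion policy: an unwritten extent is left alone under XFS_BMAPI_PREALLOC, and a written extent is only converted back when both PREALLOC and CONVERT are set. A truth-table sketch with arbitrary stand-in flag bits:

#include <stdbool.h>
#include <stdio.h>

#define PREALLOC 0x1
#define CONVERT  0x2

enum state { NORM, UNWRITTEN };

static bool needs_conversion(enum state st, int flags)
{
	if (st == UNWRITTEN && (flags & PREALLOC))
		return false;		/* preallocation stays unwritten */
	if (st == NORM &&
	    (flags & (PREALLOC | CONVERT)) != (PREALLOC | CONVERT))
		return false;		/* no real->unwritten without both */
	return true;
}

int main(void)
{
	printf("%d %d %d\n",
	       needs_conversion(UNWRITTEN, 0),		 /* 1: plain write */
	       needs_conversion(UNWRITTEN, PREALLOC),	 /* 0 */
	       needs_conversion(NORM, PREALLOC | CONVERT)); /* 1 */
	return 0;
}
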
4795
4796/*
4797 * Map file blocks to filesystem blocks, and allocate blocks or convert the
 4798 * extent state if necessary. Detailed behaviour is controlled by the flags
4799 * parameter. Only allocates blocks from a single allocation group, to avoid
4800 * locking problems.
4801 *
4303 * The returned value in "firstblock" from the first call in a transaction 4802 * The returned value in "firstblock" from the first call in a transaction
4304 * must be remembered and presented to subsequent calls in "firstblock". 4803 * must be remembered and presented to subsequent calls in "firstblock".
4305 * An upper bound for the number of blocks to be allocated is supplied to 4804 * An upper bound for the number of blocks to be allocated is supplied to
4306 * the first call in "total"; if no allocation group has that many free 4805 * the first call in "total"; if no allocation group has that many free
4307 * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). 4806 * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
4308 */ 4807 */
4309int /* error */ 4808int
4310xfs_bmapi( 4809xfs_bmapi_write(
4311 xfs_trans_t *tp, /* transaction pointer */ 4810 struct xfs_trans *tp, /* transaction pointer */
4312 xfs_inode_t *ip, /* incore inode */ 4811 struct xfs_inode *ip, /* incore inode */
4313 xfs_fileoff_t bno, /* starting file offs. mapped */ 4812 xfs_fileoff_t bno, /* starting file offs. mapped */
4314 xfs_filblks_t len, /* length to map in file */ 4813 xfs_filblks_t len, /* length to map in file */
4315 int flags, /* XFS_BMAPI_... */ 4814 int flags, /* XFS_BMAPI_... */
4316 xfs_fsblock_t *firstblock, /* first allocated block 4815 xfs_fsblock_t *firstblock, /* first allocated block
4317 controls a.g. for allocs */ 4816 controls a.g. for allocs */
4318 xfs_extlen_t total, /* total blocks needed */ 4817 xfs_extlen_t total, /* total blocks needed */
4319 xfs_bmbt_irec_t *mval, /* output: map values */ 4818 struct xfs_bmbt_irec *mval, /* output: map values */
4320 int *nmap, /* i/o: mval size/count */ 4819 int *nmap, /* i/o: mval size/count */
4321 xfs_bmap_free_t *flist) /* i/o: list extents to free */ 4820 struct xfs_bmap_free *flist) /* i/o: list extents to free */
4322{ 4821{
4323 xfs_fsblock_t abno; /* allocated block number */ 4822 struct xfs_mount *mp = ip->i_mount;
4324 xfs_extlen_t alen; /* allocated extent length */ 4823 struct xfs_ifork *ifp;
4325 xfs_fileoff_t aoff; /* allocated file offset */ 4824 struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */
4326 xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ 4825 xfs_fileoff_t end; /* end of mapped file region */
4327 xfs_btree_cur_t *cur; /* bmap btree cursor */ 4826 int eof; /* after the end of extents */
4328 xfs_fileoff_t end; /* end of mapped file region */ 4827 int error; /* error return */
4329 int eof; /* we've hit the end of extents */ 4828 int n; /* current extent index */
4330 xfs_bmbt_rec_host_t *ep; /* extent record pointer */ 4829 xfs_fileoff_t obno; /* old block number (offset) */
4331 int error; /* error return */ 4830 int whichfork; /* data or attr fork */
4332 xfs_bmbt_irec_t got; /* current file extent record */ 4831 char inhole; /* current location is hole in file */
4333 xfs_ifork_t *ifp; /* inode fork pointer */ 4832 char wasdelay; /* old extent was delayed */
4334 xfs_extlen_t indlen; /* indirect blocks length */ 4833
4335 xfs_extnum_t lastx; /* last useful extent number */
4336 int logflags; /* flags for transaction logging */
4337 xfs_extlen_t minleft; /* min blocks left after allocation */
4338 xfs_extlen_t minlen; /* min allocation size */
4339 xfs_mount_t *mp; /* xfs mount structure */
4340 int n; /* current extent index */
4341 int nallocs; /* number of extents alloc'd */
4342 xfs_extnum_t nextents; /* number of extents in file */
4343 xfs_fileoff_t obno; /* old block number (offset) */
4344 xfs_bmbt_irec_t prev; /* previous file extent record */
4345 int tmp_logflags; /* temp flags holder */
4346 int whichfork; /* data or attr fork */
4347 char inhole; /* current location is hole in file */
4348 char wasdelay; /* old extent was delayed */
4349 char wr; /* this is a write request */
4350 char rt; /* this is a realtime file */
4351#ifdef DEBUG 4834#ifdef DEBUG
4352 xfs_fileoff_t orig_bno; /* original block number value */ 4835 xfs_fileoff_t orig_bno; /* original block number value */
4353 int orig_flags; /* original flags arg value */ 4836 int orig_flags; /* original flags arg value */
4354 xfs_filblks_t orig_len; /* original value of len arg */ 4837 xfs_filblks_t orig_len; /* original value of len arg */
4355 xfs_bmbt_irec_t *orig_mval; /* original value of mval */ 4838 struct xfs_bmbt_irec *orig_mval; /* original value of mval */
4356 int orig_nmap; /* original value of *nmap */ 4839 int orig_nmap; /* original value of *nmap */
4357 4840
4358 orig_bno = bno; 4841 orig_bno = bno;
4359 orig_len = len; 4842 orig_len = len;
@@ -4361,488 +4844,133 @@ xfs_bmapi(
4361 orig_mval = mval; 4844 orig_mval = mval;
4362 orig_nmap = *nmap; 4845 orig_nmap = *nmap;
4363#endif 4846#endif
4847
4364 ASSERT(*nmap >= 1); 4848 ASSERT(*nmap >= 1);
4365 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); 4849 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4850 ASSERT(!(flags & XFS_BMAPI_IGSTATE));
4851 ASSERT(tp != NULL);
4852
4366 whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 4853 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4367 XFS_ATTR_FORK : XFS_DATA_FORK; 4854 XFS_ATTR_FORK : XFS_DATA_FORK;
4368 mp = ip->i_mount; 4855
4369 if (unlikely(XFS_TEST_ERROR( 4856 if (unlikely(XFS_TEST_ERROR(
4370 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 4857 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4371 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 4858 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4372 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), 4859 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
4373 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4860 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4374 XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); 4861 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
4375 return XFS_ERROR(EFSCORRUPTED); 4862 return XFS_ERROR(EFSCORRUPTED);
4376 } 4863 }
4864
4377 if (XFS_FORCED_SHUTDOWN(mp)) 4865 if (XFS_FORCED_SHUTDOWN(mp))
4378 return XFS_ERROR(EIO); 4866 return XFS_ERROR(EIO);
4379 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); 4867
4380 ifp = XFS_IFORK_PTR(ip, whichfork); 4868 ifp = XFS_IFORK_PTR(ip, whichfork);
4381 ASSERT(ifp->if_ext_max == 4869 ASSERT(ifp->if_ext_max ==
4382 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); 4870 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4383 if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) 4871
4384 XFS_STATS_INC(xs_blk_mapw); 4872 XFS_STATS_INC(xs_blk_mapw);
4385 else 4873
4386 XFS_STATS_INC(xs_blk_mapr);
4387 /*
4388 * IGSTATE flag is used to combine extents which
4389 * differ only due to the state of the extents.
4390 * This technique is used from xfs_getbmap()
4391 * when the caller does not wish to see the
4392 * separation (which is the default).
4393 *
4394 * This technique is also used when writing a
4395 * buffer which has been partially written,
4396 * (usually by being flushed during a chunkread),
4397 * to ensure one write takes place. This also
4398 * prevents a change in the xfs inode extents at
4399 * this time, intentionally. This change occurs
4400 * on completion of the write operation, in
4401 * xfs_strat_comp(), where the xfs_bmapi() call
4402 * is transactioned, and the extents combined.
4403 */
4404 if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */
4405 wr = 0; /* no allocations are allowed */
4406 ASSERT(wr || !(flags & XFS_BMAPI_DELAY));
4407 logflags = 0;
4408 nallocs = 0;
4409 cur = NULL;
4410 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 4874 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4411 ASSERT(wr && tp); 4875 error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
4412 if ((error = xfs_bmap_local_to_extents(tp, ip, 4876 &bma.logflags, whichfork);
4413 firstblock, total, &logflags, whichfork))) 4877 if (error)
4414 goto error0; 4878 goto error0;
4415 } 4879 }
4416 if (wr && *firstblock == NULLFSBLOCK) { 4880
4881 if (*firstblock == NULLFSBLOCK) {
4417 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) 4882 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
4418 minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; 4883 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
4419 else 4884 else
4420 minleft = 1; 4885 bma.minleft = 1;
4421 } else 4886 } else {
4422 minleft = 0; 4887 bma.minleft = 0;
4423 if (!(ifp->if_flags & XFS_IFEXTENTS) && 4888 }
4424 (error = xfs_iread_extents(tp, ip, whichfork))) 4889
4425 goto error0; 4890 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4426 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, 4891 error = xfs_iread_extents(tp, ip, whichfork);
4427 &prev); 4892 if (error)
4428 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4893 goto error0;
4894 }
4895
4896 xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
4897 &bma.prev);
4429 n = 0; 4898 n = 0;
4430 end = bno + len; 4899 end = bno + len;
4431 obno = bno; 4900 obno = bno;
4432 bma.ip = NULL; 4901
4902 bma.tp = tp;
4903 bma.ip = ip;
4904 bma.total = total;
4905 bma.userdata = 0;
4906 bma.flist = flist;
4907 bma.firstblock = firstblock;
4433 4908
4434 while (bno < end && n < *nmap) { 4909 while (bno < end && n < *nmap) {
4435 /* 4910 inhole = eof || bma.got.br_startoff > bno;
4436 * Reading past eof, act as though there's a hole 4911 wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
4437 * up to end. 4912
4438 */
4439 if (eof && !wr)
4440 got.br_startoff = end;
4441 inhole = eof || got.br_startoff > bno;
4442 wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
4443 isnullstartblock(got.br_startblock);
4444 /* 4913 /*
4445 * First, deal with the hole before the allocated space 4914 * First, deal with the hole before the allocated space
4446 * that we found, if any. 4915 * that we found, if any.
4447 */ 4916 */
4448 if (wr && (inhole || wasdelay)) { 4917 if (inhole || wasdelay) {
4449 /* 4918 bma.eof = eof;
4450 * For the wasdelay case, we could also just 4919 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4451 * allocate the stuff asked for in this bmap call 4920 bma.wasdel = wasdelay;
4452 * but that wouldn't be as good. 4921 bma.length = len;
4453 */ 4922 bma.offset = bno;
4454 if (wasdelay) { 4923
4455 alen = (xfs_extlen_t)got.br_blockcount; 4924 error = xfs_bmapi_allocate(&bma, flags);
4456 aoff = got.br_startoff;
4457 if (lastx != NULLEXTNUM && lastx) {
4458 ep = xfs_iext_get_ext(ifp, lastx - 1);
4459 xfs_bmbt_get_all(ep, &prev);
4460 }
4461 } else {
4462 alen = (xfs_extlen_t)
4463 XFS_FILBLKS_MIN(len, MAXEXTLEN);
4464 if (!eof)
4465 alen = (xfs_extlen_t)
4466 XFS_FILBLKS_MIN(alen,
4467 got.br_startoff - bno);
4468 aoff = bno;
4469 }
4470 minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1;
4471 if (flags & XFS_BMAPI_DELAY) {
4472 xfs_extlen_t extsz;
4473
4474 /* Figure out the extent size, adjust alen */
4475 extsz = xfs_get_extsz_hint(ip);
4476 if (extsz) {
4477 /*
4478 * make sure we don't exceed a single
4479 * extent length when we align the
4480 * extent by reducing length we are
4481 * going to allocate by the maximum
4482 * amount extent size aligment may
4483 * require.
4484 */
4485 alen = XFS_FILBLKS_MIN(len,
4486 MAXEXTLEN - (2 * extsz - 1));
4487 error = xfs_bmap_extsize_align(mp,
4488 &got, &prev, extsz,
4489 rt, eof,
4490 flags&XFS_BMAPI_DELAY,
4491 flags&XFS_BMAPI_CONVERT,
4492 &aoff, &alen);
4493 ASSERT(!error);
4494 }
4495
4496 if (rt)
4497 extsz = alen / mp->m_sb.sb_rextsize;
4498
4499 /*
4500 * Make a transaction-less quota reservation for
4501 * delayed allocation blocks. This number gets
4502 * adjusted later. We return if we haven't
4503 * allocated blocks already inside this loop.
4504 */
4505 error = xfs_trans_reserve_quota_nblks(
4506 NULL, ip, (long)alen, 0,
4507 rt ? XFS_QMOPT_RES_RTBLKS :
4508 XFS_QMOPT_RES_REGBLKS);
4509 if (error) {
4510 if (n == 0) {
4511 *nmap = 0;
4512 ASSERT(cur == NULL);
4513 return error;
4514 }
4515 break;
4516 }
4517
4518 /*
4519 * Split changing sb for alen and indlen since
4520 * they could be coming from different places.
4521 */
4522 indlen = (xfs_extlen_t)
4523 xfs_bmap_worst_indlen(ip, alen);
4524 ASSERT(indlen > 0);
4525
4526 if (rt) {
4527 error = xfs_mod_incore_sb(mp,
4528 XFS_SBS_FREXTENTS,
4529 -((int64_t)extsz), 0);
4530 } else {
4531 error = xfs_icsb_modify_counters(mp,
4532 XFS_SBS_FDBLOCKS,
4533 -((int64_t)alen), 0);
4534 }
4535 if (!error) {
4536 error = xfs_icsb_modify_counters(mp,
4537 XFS_SBS_FDBLOCKS,
4538 -((int64_t)indlen), 0);
4539 if (error && rt)
4540 xfs_mod_incore_sb(mp,
4541 XFS_SBS_FREXTENTS,
4542 (int64_t)extsz, 0);
4543 else if (error)
4544 xfs_icsb_modify_counters(mp,
4545 XFS_SBS_FDBLOCKS,
4546 (int64_t)alen, 0);
4547 }
4548
4549 if (error) {
4550 if (XFS_IS_QUOTA_ON(mp))
4551 /* unreserve the blocks now */
4552 (void)
4553 xfs_trans_unreserve_quota_nblks(
4554 NULL, ip,
4555 (long)alen, 0, rt ?
4556 XFS_QMOPT_RES_RTBLKS :
4557 XFS_QMOPT_RES_REGBLKS);
4558 break;
4559 }
4560
4561 ip->i_delayed_blks += alen;
4562 abno = nullstartblock(indlen);
4563 } else {
4564 /*
4565 * If first time, allocate and fill in
4566 * once-only bma fields.
4567 */
4568 if (bma.ip == NULL) {
4569 bma.tp = tp;
4570 bma.ip = ip;
4571 bma.prevp = &prev;
4572 bma.gotp = &got;
4573 bma.total = total;
4574 bma.userdata = 0;
4575 }
4576 /* Indicate if this is the first user data
4577 * in the file, or just any user data.
4578 */
4579 if (!(flags & XFS_BMAPI_METADATA)) {
4580 bma.userdata = (aoff == 0) ?
4581 XFS_ALLOC_INITIAL_USER_DATA :
4582 XFS_ALLOC_USERDATA;
4583 }
4584 /*
4585 * Fill in changeable bma fields.
4586 */
4587 bma.eof = eof;
4588 bma.firstblock = *firstblock;
4589 bma.alen = alen;
4590 bma.off = aoff;
4591 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4592 bma.wasdel = wasdelay;
4593 bma.minlen = minlen;
4594 bma.low = flist->xbf_low;
4595 bma.minleft = minleft;
4596 /*
4597 * Only want to do the alignment at the
4598 * eof if it is userdata and allocation length
4599 * is larger than a stripe unit.
4600 */
4601 if (mp->m_dalign && alen >= mp->m_dalign &&
4602 (!(flags & XFS_BMAPI_METADATA)) &&
4603 (whichfork == XFS_DATA_FORK)) {
4604 if ((error = xfs_bmap_isaeof(ip, aoff,
4605 whichfork, &bma.aeof)))
4606 goto error0;
4607 } else
4608 bma.aeof = 0;
4609 /*
4610 * Call allocator.
4611 */
4612 if ((error = xfs_bmap_alloc(&bma)))
4613 goto error0;
4614 /*
4615 * Copy out result fields.
4616 */
4617 abno = bma.rval;
4618 if ((flist->xbf_low = bma.low))
4619 minleft = 0;
4620 alen = bma.alen;
4621 aoff = bma.off;
4622 ASSERT(*firstblock == NULLFSBLOCK ||
4623 XFS_FSB_TO_AGNO(mp, *firstblock) ==
4624 XFS_FSB_TO_AGNO(mp, bma.firstblock) ||
4625 (flist->xbf_low &&
4626 XFS_FSB_TO_AGNO(mp, *firstblock) <
4627 XFS_FSB_TO_AGNO(mp, bma.firstblock)));
4628 *firstblock = bma.firstblock;
4629 if (cur)
4630 cur->bc_private.b.firstblock =
4631 *firstblock;
4632 if (abno == NULLFSBLOCK)
4633 break;
4634 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
4635 cur = xfs_bmbt_init_cursor(mp, tp,
4636 ip, whichfork);
4637 cur->bc_private.b.firstblock =
4638 *firstblock;
4639 cur->bc_private.b.flist = flist;
4640 }
4641 /*
4642 * Bump the number of extents we've allocated
4643 * in this call.
4644 */
4645 nallocs++;
4646 }
4647 if (cur)
4648 cur->bc_private.b.flags =
4649 wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
4650 got.br_startoff = aoff;
4651 got.br_startblock = abno;
4652 got.br_blockcount = alen;
4653 got.br_state = XFS_EXT_NORM; /* assume normal */
4654 /*
4655 * Determine state of extent, and the filesystem.
4656 * A wasdelay extent has been initialized, so
4657 * shouldn't be flagged as unwritten.
4658 */
4659 if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) {
4660 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
4661 got.br_state = XFS_EXT_UNWRITTEN;
4662 }
4663 error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got,
4664 firstblock, flist, &tmp_logflags,
4665 whichfork);
4666 logflags |= tmp_logflags;
4667 if (error) 4925 if (error)
4668 goto error0; 4926 goto error0;
4669 ep = xfs_iext_get_ext(ifp, lastx); 4927 if (bma.blkno == NULLFSBLOCK)
4670 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4928 break;
4671 xfs_bmbt_get_all(ep, &got);
4672 ASSERT(got.br_startoff <= aoff);
4673 ASSERT(got.br_startoff + got.br_blockcount >=
4674 aoff + alen);
4675#ifdef DEBUG
4676 if (flags & XFS_BMAPI_DELAY) {
4677 ASSERT(isnullstartblock(got.br_startblock));
4678 ASSERT(startblockval(got.br_startblock) > 0);
4679 }
4680 ASSERT(got.br_state == XFS_EXT_NORM ||
4681 got.br_state == XFS_EXT_UNWRITTEN);
4682#endif
4683 /*
4684 * Fall down into the found allocated space case.
4685 */
4686 } else if (inhole) {
4687 /*
4688 * Reading in a hole.
4689 */
4690 mval->br_startoff = bno;
4691 mval->br_startblock = HOLESTARTBLOCK;
4692 mval->br_blockcount =
4693 XFS_FILBLKS_MIN(len, got.br_startoff - bno);
4694 mval->br_state = XFS_EXT_NORM;
4695 bno += mval->br_blockcount;
4696 len -= mval->br_blockcount;
4697 mval++;
4698 n++;
4699 continue;
4700 }
4701 /*
4702 * Then deal with the allocated space we found.
4703 */
4704 ASSERT(ep != NULL);
4705 if (!(flags & XFS_BMAPI_ENTIRE) &&
4706 (got.br_startoff + got.br_blockcount > obno)) {
4707 if (obno > bno)
4708 bno = obno;
4709 ASSERT((bno >= obno) || (n == 0));
4710 ASSERT(bno < end);
4711 mval->br_startoff = bno;
4712 if (isnullstartblock(got.br_startblock)) {
4713 ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
4714 mval->br_startblock = DELAYSTARTBLOCK;
4715 } else
4716 mval->br_startblock =
4717 got.br_startblock +
4718 (bno - got.br_startoff);
4719 /*
4720 * Return the minimum of what we got and what we
4721 * asked for for the length. We can use the len
4722 * variable here because it is modified below
4723 * and we could have been there before coming
4724 * here if the first part of the allocation
4725 * didn't overlap what was asked for.
4726 */
4727 mval->br_blockcount =
4728 XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
4729 (bno - got.br_startoff));
4730 mval->br_state = got.br_state;
4731 ASSERT(mval->br_blockcount <= len);
4732 } else {
4733 *mval = got;
4734 if (isnullstartblock(mval->br_startblock)) {
4735 ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
4736 mval->br_startblock = DELAYSTARTBLOCK;
4737 }
 		}

-		/*
-		 * Check if writing previously allocated but
-		 * unwritten extents.
-		 */
-		if (wr &&
-		    ((mval->br_state == XFS_EXT_UNWRITTEN &&
-		      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
-		     (mval->br_state == XFS_EXT_NORM &&
-		      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
-			(XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
-			/*
-			 * Modify (by adding) the state flag, if writing.
-			 */
-			ASSERT(mval->br_blockcount <= len);
-			if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-				cur = xfs_bmbt_init_cursor(mp,
-					tp, ip, whichfork);
-				cur->bc_private.b.firstblock =
-					*firstblock;
-				cur->bc_private.b.flist = flist;
-			}
-			mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
-						? XFS_EXT_NORM
-						: XFS_EXT_UNWRITTEN;
-			error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval,
-				firstblock, flist, &tmp_logflags,
-				whichfork);
-			logflags |= tmp_logflags;
-			if (error)
-				goto error0;
-			ep = xfs_iext_get_ext(ifp, lastx);
-			nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-			xfs_bmbt_get_all(ep, &got);
-			/*
-			 * We may have combined previously unwritten
-			 * space with written space, so generate
-			 * another request.
-			 */
-			if (mval->br_blockcount < len)
-				continue;
-		}
+		/* Deal with the allocated space we found.  */
+		xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+				   end, n, flags);
+
+		/* Execute unwritten extent conversion if necessary */
+		error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+		if (error == EAGAIN)
+			continue;
+		if (error)
+			goto error0;
+
+		/* update the extent map to return */
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);

-		ASSERT((flags & XFS_BMAPI_ENTIRE) ||
-		       ((mval->br_startoff + mval->br_blockcount) <= end));
-		ASSERT((flags & XFS_BMAPI_ENTIRE) ||
-		       (mval->br_blockcount <= len) ||
-		       (mval->br_startoff < obno));
-		bno = mval->br_startoff + mval->br_blockcount;
-		len = end - bno;
-		if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
-			ASSERT(mval->br_startblock == mval[-1].br_startblock);
-			ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
-			ASSERT(mval->br_state == mval[-1].br_state);
-			mval[-1].br_blockcount = mval->br_blockcount;
-			mval[-1].br_state = mval->br_state;
-		} else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
-			   mval[-1].br_startblock != DELAYSTARTBLOCK &&
-			   mval[-1].br_startblock != HOLESTARTBLOCK &&
-			   mval->br_startblock ==
-			   mval[-1].br_startblock + mval[-1].br_blockcount &&
-			   ((flags & XFS_BMAPI_IGSTATE) ||
-				mval[-1].br_state == mval->br_state)) {
-			ASSERT(mval->br_startoff ==
-			       mval[-1].br_startoff + mval[-1].br_blockcount);
-			mval[-1].br_blockcount += mval->br_blockcount;
-		} else if (n > 0 &&
-			   mval->br_startblock == DELAYSTARTBLOCK &&
-			   mval[-1].br_startblock == DELAYSTARTBLOCK &&
-			   mval->br_startoff ==
-			   mval[-1].br_startoff + mval[-1].br_blockcount) {
-			mval[-1].br_blockcount += mval->br_blockcount;
-			mval[-1].br_state = mval->br_state;
-		} else if (!((n == 0) &&
-			     ((mval->br_startoff + mval->br_blockcount) <=
-			      obno))) {
-			mval++;
-			n++;
-		}
 		/*
 		 * If we're done, stop now.  Stop when we've allocated
 		 * XFS_BMAP_MAX_NMAP extents no matter what.  Otherwise
 		 * the transaction may get too big.
 		 */
-		if (bno >= end || n >= *nmap || nallocs >= *nmap)
+		if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
 			break;
-		/*
-		 * Else go on to the next record.
-		 */
-		prev = got;
-		if (++lastx < nextents) {
-			ep = xfs_iext_get_ext(ifp, lastx);
-			xfs_bmbt_get_all(ep, &got);
-		} else {
+
+		/* Else go on to the next record. */
+		bma.prev = bma.got;
+		if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
+					 &bma.got);
+		} else
 			eof = 1;
-		}
 	}
 	*nmap = n;
+
 	/*
 	 * Transform from btree to extents, give it cur.
 	 */
-	if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
 	    XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
-		ASSERT(wr && cur);
-		error = xfs_bmap_btree_to_extents(tp, ip, cur,
+		int		tmp_logflags = 0;
+
+		ASSERT(bma.cur);
+		error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
 			&tmp_logflags, whichfork);
-		logflags |= tmp_logflags;
+		bma.logflags |= tmp_logflags;
 		if (error)
 			goto error0;
 	}
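The interval arithmetic that xfs_bmapi_trim_map() factors out of the loop above is plain clamping of a found extent record to the requested range. A standalone model of that calculation under simplified types (all names here are illustrative, not the kernel's):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for xfs_bmbt_irec; illustrative only. */
struct irec {
	uint64_t br_startoff;	/* file offset of extent, in blocks */
	uint64_t br_startblock;	/* fs block backing br_startoff */
	uint64_t br_blockcount;	/* length in blocks */
};

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Clamp the found record 'got' to the requested range [bno, end). */
static struct irec trim_map(const struct irec *got, uint64_t bno, uint64_t end)
{
	struct irec mval;

	assert(bno >= got->br_startoff && bno < end);
	mval.br_startoff = bno;
	mval.br_startblock = got->br_startblock + (bno - got->br_startoff);
	mval.br_blockcount = MIN(end - bno,
				 got->br_blockcount - (bno - got->br_startoff));
	return mval;
}

int main(void)
{
	struct irec got = { 100, 5000, 50 };	  /* extent covers file blocks 100-149 */
	struct irec m = trim_map(&got, 120, 200); /* caller asked for 120-199 */

	printf("off %llu blk %llu len %llu\n",
	       (unsigned long long)m.br_startoff,
	       (unsigned long long)m.br_startblock,
	       (unsigned long long)m.br_blockcount); /* off 120 blk 5020 len 30 */
	return 0;
}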
@@ -4856,34 +4984,33 @@ error0:
 	 * Log everything.  Do this after conversion, there's no point in
 	 * logging the extent records if we've converted to btree format.
 	 */
-	if ((logflags & xfs_ilog_fext(whichfork)) &&
+	if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		logflags &= ~xfs_ilog_fext(whichfork);
-	else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+		bma.logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
 		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-		logflags &= ~xfs_ilog_fbroot(whichfork);
+		bma.logflags &= ~xfs_ilog_fbroot(whichfork);
 	/*
 	 * Log whatever the flags say, even if error.  Otherwise we might miss
 	 * detecting a case where the data is changed, there's an error,
 	 * and it's not logged so we don't shutdown when we should.
 	 */
-	if (logflags) {
-		ASSERT(tp && wr);
-		xfs_trans_log_inode(tp, ip, logflags);
-	}
-	if (cur) {
+	if (bma.logflags)
+		xfs_trans_log_inode(tp, ip, bma.logflags);
+
+	if (bma.cur) {
 		if (!error) {
 			ASSERT(*firstblock == NULLFSBLOCK ||
 			       XFS_FSB_TO_AGNO(mp, *firstblock) ==
 			       XFS_FSB_TO_AGNO(mp,
-				       cur->bc_private.b.firstblock) ||
+				       bma.cur->bc_private.b.firstblock) ||
 			       (flist->xbf_low &&
 				XFS_FSB_TO_AGNO(mp, *firstblock) <
 				XFS_FSB_TO_AGNO(mp,
-					cur->bc_private.b.firstblock)));
-			*firstblock = cur->bc_private.b.firstblock;
+					bma.cur->bc_private.b.firstblock)));
+			*firstblock = bma.cur->bc_private.b.firstblock;
 		}
-		xfs_btree_del_cursor(cur,
+		xfs_btree_del_cursor(bma.cur,
 			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
 	}
 	if (!error)
@@ -4893,58 +5020,6 @@ error0:
 }

 /*
4896 * Map file blocks to filesystem blocks, simple version.
4897 * One block (extent) only, read-only.
4898 * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
4899 * For the other flag values, the effect is as if XFS_BMAPI_METADATA
4900 * was set and all the others were clear.
4901 */
4902int /* error */
4903xfs_bmapi_single(
4904 xfs_trans_t *tp, /* transaction pointer */
4905 xfs_inode_t *ip, /* incore inode */
4906 int whichfork, /* data or attr fork */
4907 xfs_fsblock_t *fsb, /* output: mapped block */
4908 xfs_fileoff_t bno) /* starting file offs. mapped */
4909{
4910 int eof; /* we've hit the end of extents */
4911 int error; /* error return */
4912 xfs_bmbt_irec_t got; /* current file extent record */
4913 xfs_ifork_t *ifp; /* inode fork pointer */
4914 xfs_extnum_t lastx; /* last useful extent number */
4915 xfs_bmbt_irec_t prev; /* previous file extent record */
4916
4917 ifp = XFS_IFORK_PTR(ip, whichfork);
4918 if (unlikely(
4919 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4920 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) {
4921 XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW,
4922 ip->i_mount);
4923 return XFS_ERROR(EFSCORRUPTED);
4924 }
4925 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
4926 return XFS_ERROR(EIO);
4927 XFS_STATS_INC(xs_blk_mapr);
4928 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4929 (error = xfs_iread_extents(tp, ip, whichfork)))
4930 return error;
4931 (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
4932 &prev);
4933 /*
4934 * Reading past eof, act as though there's a hole
4935 * up to end.
4936 */
4937 if (eof || got.br_startoff > bno) {
4938 *fsb = NULLFSBLOCK;
4939 return 0;
4940 }
4941 ASSERT(!isnullstartblock(got.br_startblock));
4942 ASSERT(bno < got.br_startoff + got.br_blockcount);
4943 *fsb = got.br_startblock + (bno - got.br_startoff);
4944 return 0;
4945}
4946
 /*
  * Unmap (remove) blocks from a file.
  * If nexts is nonzero then the number of extents to remove is limited to
  * that value.  If not all extents in the block range can be removed then
@@ -5115,9 +5190,9 @@ xfs_bunmapi(
 				del.br_blockcount = mod;
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
-			error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del,
-				firstblock, flist, &logflags,
-				XFS_DATA_FORK);
+			error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+					&lastx, &cur, &del, firstblock, flist,
+					&logflags);
 			if (error)
 				goto error0;
 			goto nodelete;
@@ -5173,18 +5248,18 @@ xfs_bunmapi(
 			}
 			prev.br_state = XFS_EXT_UNWRITTEN;
 			lastx--;
-			error = xfs_bmap_add_extent(tp, ip, &lastx,
-				&cur, &prev, firstblock, flist,
-				&logflags, XFS_DATA_FORK);
+			error = xfs_bmap_add_extent_unwritten_real(tp,
+					ip, &lastx, &cur, &prev,
+					firstblock, flist, &logflags);
 			if (error)
 				goto error0;
 			goto nodelete;
 		} else {
 			ASSERT(del.br_state == XFS_EXT_NORM);
 			del.br_state = XFS_EXT_UNWRITTEN;
-			error = xfs_bmap_add_extent(tp, ip, &lastx,
-				&cur, &del, firstblock, flist,
-				&logflags, XFS_DATA_FORK);
+			error = xfs_bmap_add_extent_unwritten_real(tp,
+					ip, &lastx, &cur, &del,
+					firstblock, flist, &logflags);
 			if (error)
 				goto error0;
 			goto nodelete;
@@ -5506,10 +5581,9 @@ xfs_getbmap(

 		do {
 			nmap = (nexleft > subnex) ? subnex : nexleft;
-			error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+			error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
-				       bmapi_flags, NULL, 0, map, &nmap,
-				       NULL);
+				       map, &nmap, bmapi_flags);
 			if (error)
 				goto out_free_map;
 			ASSERT(nmap <= subnex);
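With the WRITE and DELAY flags gone, read-only lookups go through xfs_bmapi_read() directly, as in the converted call above. A hedged sketch of the general call pattern, following the prototype added to xfs_bmap.h below (offset_fsb and count_fsb are placeholder variables, not names from this patch):

	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
	int			nmap = XFS_BMAP_MAX_NMAP;
	int			error;

	/* No transaction, no firstblock/flist: lookups never allocate. */
	error = xfs_bmapi_read(ip, offset_fsb, count_fsb, map, &nmap, 0);
	if (error)
		return error;
	/* nmap now holds the number of mappings actually returned. */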
@@ -5583,89 +5657,6 @@ xfs_getbmap(
 	return error;
 }

5586/*
5587 * Check the last inode extent to determine whether this allocation will result
5588 * in blocks being allocated at the end of the file. When we allocate new data
5589 * blocks at the end of the file which do not start at the previous data block,
5590 * we will try to align the new blocks at stripe unit boundaries.
5591 */
5592STATIC int /* error */
5593xfs_bmap_isaeof(
5594 xfs_inode_t *ip, /* incore inode pointer */
5595 xfs_fileoff_t off, /* file offset in fsblocks */
5596 int whichfork, /* data or attribute fork */
5597 char *aeof) /* return value */
5598{
5599 int error; /* error return value */
5600 xfs_ifork_t *ifp; /* inode fork pointer */
5601 xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */
5602 xfs_extnum_t nextents; /* number of file extents */
5603 xfs_bmbt_irec_t s; /* expanded extent record */
5604
5605 ASSERT(whichfork == XFS_DATA_FORK);
5606 ifp = XFS_IFORK_PTR(ip, whichfork);
5607 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5608 (error = xfs_iread_extents(NULL, ip, whichfork)))
5609 return error;
5610 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5611 if (nextents == 0) {
5612 *aeof = 1;
5613 return 0;
5614 }
5615 /*
5616 * Go to the last extent
5617 */
5618 lastrec = xfs_iext_get_ext(ifp, nextents - 1);
5619 xfs_bmbt_get_all(lastrec, &s);
5620 /*
5621 * Check we are allocating in the last extent (for delayed allocations)
5622 * or past the last extent for non-delayed allocations.
5623 */
5624 *aeof = (off >= s.br_startoff &&
5625 off < s.br_startoff + s.br_blockcount &&
5626 isnullstartblock(s.br_startblock)) ||
5627 off >= s.br_startoff + s.br_blockcount;
5628 return 0;
5629}
5630
5631/*
5632 * Check if the endoff is outside the last extent. If so the caller will grow
5633 * the allocation to a stripe unit boundary.
5634 */
5635int /* error */
5636xfs_bmap_eof(
5637 xfs_inode_t *ip, /* incore inode pointer */
5638 xfs_fileoff_t endoff, /* file offset in fsblocks */
5639 int whichfork, /* data or attribute fork */
5640 int *eof) /* result value */
5641{
5642 xfs_fsblock_t blockcount; /* extent block count */
5643 int error; /* error return value */
5644 xfs_ifork_t *ifp; /* inode fork pointer */
5645 xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */
5646 xfs_extnum_t nextents; /* number of file extents */
5647 xfs_fileoff_t startoff; /* extent starting file offset */
5648
5649 ASSERT(whichfork == XFS_DATA_FORK);
5650 ifp = XFS_IFORK_PTR(ip, whichfork);
5651 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5652 (error = xfs_iread_extents(NULL, ip, whichfork)))
5653 return error;
5654 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5655 if (nextents == 0) {
5656 *eof = 1;
5657 return 0;
5658 }
5659 /*
5660 * Go to the last extent
5661 */
5662 lastrec = xfs_iext_get_ext(ifp, nextents - 1);
5663 startoff = xfs_bmbt_get_startoff(lastrec);
5664 blockcount = xfs_bmbt_get_blockcount(lastrec);
5665 *eof = endoff >= startoff + blockcount;
5666 return 0;
5667}
5668
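Both helpers reduce to comparisons against the last extent record, which is why xfs_bmap_isaeof() can be folded away and xfs_bmap_eof() shrinks to a one-line prototype in the header below. A standalone model of the two predicates, with illustrative names and simplified types:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct irec { uint64_t br_startoff, br_blockcount; bool delalloc; };

/* endoff lands at or past the end of the last extent */
static bool is_eof(const struct irec *last, uint64_t endoff)
{
	return endoff >= last->br_startoff + last->br_blockcount;
}

/* allocating at eof: inside a trailing delalloc extent, or past the end */
static bool is_aeof(const struct irec *last, uint64_t off)
{
	return (off >= last->br_startoff &&
		off < last->br_startoff + last->br_blockcount &&
		last->delalloc) ||
	       off >= last->br_startoff + last->br_blockcount;
}

int main(void)
{
	struct irec last = { 100, 20, false };	/* extent covers 100-119 */

	printf("%d %d\n", is_eof(&last, 120), is_aeof(&last, 110)); /* 1 0 */
	return 0;
}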
 #ifdef DEBUG
 STATIC struct xfs_buf *
 xfs_bmap_get_bp(
@@ -6100,9 +6091,8 @@ xfs_bmap_punch_delalloc_range(
 	 * trying to remove a real extent (which requires a
 	 * transaction) or a hole, which is probably a bad idea...
 	 */
-	error = xfs_bmapi(NULL, ip, start_fsb, 1,
-			  XFS_BMAPI_ENTIRE, NULL, 0, &imap,
-			  &nimaps, NULL);
+	error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
+			       XFS_BMAPI_ENTIRE);

 	if (error) {
 		/* something screwed, just bail */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index c62234bde053..89ee672d378a 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -62,27 +62,23 @@ typedef struct xfs_bmap_free
 #define	XFS_BMAP_MAX_NMAP	4

 /*
- * Flags for xfs_bmapi
+ * Flags for xfs_bmapi_*
  */
-#define	XFS_BMAPI_WRITE		0x001	/* write operation: allocate space */
-#define XFS_BMAPI_DELAY		0x002	/* delayed write operation */
-#define XFS_BMAPI_ENTIRE	0x004	/* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA	0x008	/* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK	0x010	/* use attribute fork not data */
-#define XFS_BMAPI_PREALLOC	0x040	/* preallocation op: unwritten space */
-#define XFS_BMAPI_IGSTATE	0x080	/* Ignore state - */
+#define XFS_BMAPI_ENTIRE	0x001	/* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA	0x002	/* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK	0x004	/* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
 					/* combine contig. space */
-#define XFS_BMAPI_CONTIG	0x100	/* must allocate only one extent */
+#define XFS_BMAPI_CONTIG	0x020	/* must allocate only one extent */
 /*
  * unwritten extent conversion - this needs write cache flushing and no additional
  * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
  * from written to unwritten, otherwise convert from unwritten to written.
  */
-#define XFS_BMAPI_CONVERT	0x200
+#define XFS_BMAPI_CONVERT	0x040

 #define XFS_BMAPI_FLAGS \
-	{ XFS_BMAPI_WRITE,	"WRITE" }, \
-	{ XFS_BMAPI_DELAY,	"DELAY" }, \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
 	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
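With WRITE and DELAY removed, the surviving flags compact down to consecutive low bits. A quick standalone check of the new values (copied from the right-hand side of this hunk); the PREALLOC|CONVERT combination is the written-to-unwritten conversion described in the comment above:

#include <stdio.h>

#define XFS_BMAPI_ENTIRE	0x001
#define XFS_BMAPI_METADATA	0x002
#define XFS_BMAPI_ATTRFORK	0x004
#define XFS_BMAPI_PREALLOC	0x008
#define XFS_BMAPI_IGSTATE	0x010
#define XFS_BMAPI_CONTIG	0x020
#define XFS_BMAPI_CONVERT	0x040

int main(void)
{
	/* PREALLOC|CONVERT selects written -> unwritten conversion */
	int flags = XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT;

	printf("0x%03x\n", flags);	/* 0x048 */
	return 0;
}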
@@ -113,21 +109,28 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
  * Argument structure for xfs_bmap_alloc.
  */
 typedef struct xfs_bmalloca {
-	xfs_fsblock_t		firstblock; /* i/o first block allocated */
-	xfs_fsblock_t		rval;	/* starting block of new extent */
-	xfs_fileoff_t		off;	/* offset in file filling in */
+	xfs_fsblock_t		*firstblock; /* i/o first block allocated */
+	struct xfs_bmap_free	*flist;	/* bmap freelist */
 	struct xfs_trans	*tp;	/* transaction pointer */
 	struct xfs_inode	*ip;	/* incore inode pointer */
-	struct xfs_bmbt_irec	*prevp;	/* extent before the new one */
-	struct xfs_bmbt_irec	*gotp;	/* extent after, or delayed */
-	xfs_extlen_t		alen;	/* i/o length asked/allocated */
+	struct xfs_bmbt_irec	prev;	/* extent before the new one */
+	struct xfs_bmbt_irec	got;	/* extent after, or delayed */
+
+	xfs_fileoff_t		offset;	/* offset in file filling in */
+	xfs_extlen_t		length;	/* i/o length asked/allocated */
+	xfs_fsblock_t		blkno;	/* starting block of new extent */
+
+	struct xfs_btree_cur	*cur;	/* btree cursor */
+	xfs_extnum_t		idx;	/* current extent index */
+	int			nallocs;/* number of extents alloc'd */
+	int			logflags;/* flags for transaction logging */
+
 	xfs_extlen_t		total;	/* total blocks needed for xaction */
 	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
 	xfs_extlen_t		minleft; /* amount must be left after alloc */
 	char			eof;	/* set if allocating past last extent */
 	char			wasdel;	/* replacing a delayed allocation */
 	char			userdata;/* set if is user data */
-	char			low;	/* low on space, using seq'l ags */
 	char			aeof;	/* allocated space at eof */
 	char			conv;	/* overwriting unwritten extents */
 } xfs_bmalloca_t;
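The restructured xfs_bmalloca now carries the whole per-call state of xfs_bmapi_write() (cursor, extent index, allocation count, log flags), so one stack object can be handed to every helper. A hedged sketch of the caller-side setup this presumably implies (field names from the struct above; the surrounding variables are illustrative, not code from this patch):

	struct xfs_bmalloca	bma = { 0 };	/* zeroes cur, nallocs, logflags */

	bma.tp = tp;
	bma.ip = ip;
	bma.firstblock = firstblock;	/* now a pointer into the caller */
	bma.flist = flist;
	bma.offset = bno;
	bma.length = len;
	bma.total = total;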
@@ -152,251 +155,62 @@ typedef struct xfs_bmalloca {
 	{ BMAP_RIGHT_FILLING,	"RF" }, \
 	{ BMAP_ATTRFORK,	"ATTR" }

-/*
- * Add bmap trace insert entries for all the contents of the extent list.
- *
- * Quite excessive tracing.  Only do this for debug builds.
- */
 #if defined(__KERNEL) && defined(DEBUG)
-void
-xfs_bmap_trace_exlist(
-	struct xfs_inode	*ip,		/* incore inode pointer */
-	xfs_extnum_t		cnt,		/* count of entries in list */
-	int			whichfork,
-	unsigned long		caller_ip);	/* data or attr fork */
+void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
+		int whichfork, unsigned long caller_ip);
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)	\
 	xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
 #else
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)
 #endif

-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int					/* error code */
-xfs_bmap_add_attrfork(
-	struct xfs_inode	*ip,	/* incore inode pointer */
-	int			size,	/* space needed for new attribute */
-	int			rsvd);	/* flag for reserved block allocation */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-xfs_bmap_add_free(
-	xfs_fsblock_t		bno,	/* fs block number of extent */
-	xfs_filblks_t		len,	/* length of extent */
-	xfs_bmap_free_t		*flist,	/* list of extents */
-	struct xfs_mount	*mp);	/* mount point structure */
-
-/*
- * Routine to clean up the free list data structure when
- * an error occurs during a transaction.
- */
-void
-xfs_bmap_cancel(
-	xfs_bmap_free_t		*flist);	/* free list to clean up */
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem.  Done once, during mount.
- */
+int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
+		struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
+void	xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int	xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int	xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t *last_block, int whichfork);
+int	xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t *unused, int whichfork);
+int	xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
+int	xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+		int whichfork);
+int	xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+		int *nmap, int flags);
+int	xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
+		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+		int *nmap, int flags);
+int	xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+		xfs_fsblock_t *firstblock, xfs_extlen_t total,
+		struct xfs_bmbt_irec *mval, int *nmap,
+		struct xfs_bmap_free *flist);
+int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+		xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+		struct xfs_bmap_free *flist, int *done);
+int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
+		xfs_extnum_t num);
+uint	xfs_default_attroffset(struct xfs_inode *ip);
205 */
206void
207xfs_bmap_compute_maxlevels(
208 struct xfs_mount *mp, /* file system mount structure */
209 int whichfork); /* data or attr fork */
210
211/*
212 * Returns the file-relative block number of the first unused block in the file.
213 * This is the lowest-address hole if the file has holes, else the first block
214 * past the end of file.
215 */
216int /* error */
217xfs_bmap_first_unused(
218 struct xfs_trans *tp, /* transaction pointer */
219 struct xfs_inode *ip, /* incore inode */
220 xfs_extlen_t len, /* size of hole to find */
221 xfs_fileoff_t *unused, /* unused block num */
222 int whichfork); /* data or attr fork */
223
224/*
225 * Returns the file-relative block number of the last block + 1 before
226 * last_block (input value) in the file.
227 * This is not based on i_size, it is based on the extent list.
228 * Returns 0 for local files, as they do not have an extent list.
229 */
230int /* error */
231xfs_bmap_last_before(
232 struct xfs_trans *tp, /* transaction pointer */
233 struct xfs_inode *ip, /* incore inode */
234 xfs_fileoff_t *last_block, /* last block */
235 int whichfork); /* data or attr fork */
236
237/*
238 * Returns the file-relative block number of the first block past eof in
239 * the file. This is not based on i_size, it is based on the extent list.
240 * Returns 0 for local files, as they do not have an extent list.
241 */
242int /* error */
243xfs_bmap_last_offset(
244 struct xfs_trans *tp, /* transaction pointer */
245 struct xfs_inode *ip, /* incore inode */
246 xfs_fileoff_t *unused, /* last block num */
247 int whichfork); /* data or attr fork */
248
249/*
250 * Returns whether the selected fork of the inode has exactly one
251 * block or not. For the data fork we check this matches di_size,
252 * implying the file's range is 0..bsize-1.
253 */
254int
255xfs_bmap_one_block(
256 struct xfs_inode *ip, /* incore inode */
257 int whichfork); /* data or attr fork */
258
259/*
260 * Read in the extents to iu_extents.
261 * All inode fields are set up by caller, we just traverse the btree
262 * and copy the records in.
263 */
264int /* error */
265xfs_bmap_read_extents(
266 struct xfs_trans *tp, /* transaction pointer */
267 struct xfs_inode *ip, /* incore inode */
268 int whichfork); /* data or attr fork */
269
270/*
271 * Map file blocks to filesystem blocks.
272 * File range is given by the bno/len pair.
273 * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
274 * into a hole or past eof.
275 * Only allocates blocks from a single allocation group,
276 * to avoid locking problems.
277 * The returned value in "firstblock" from the first call in a transaction
278 * must be remembered and presented to subsequent calls in "firstblock".
279 * An upper bound for the number of blocks to be allocated is supplied to
280 * the first call in "total"; if no allocation group has that many free
281 * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
282 */
283int /* error */
284xfs_bmapi(
285 struct xfs_trans *tp, /* transaction pointer */
286 struct xfs_inode *ip, /* incore inode */
287 xfs_fileoff_t bno, /* starting file offs. mapped */
288 xfs_filblks_t len, /* length to map in file */
289 int flags, /* XFS_BMAPI_... */
290 xfs_fsblock_t *firstblock, /* first allocated block
291 controls a.g. for allocs */
292 xfs_extlen_t total, /* total blocks needed */
293 struct xfs_bmbt_irec *mval, /* output: map values */
294 int *nmap, /* i/o: mval size/count */
295 xfs_bmap_free_t *flist); /* i/o: list extents to free */
296
297/*
298 * Map file blocks to filesystem blocks, simple version.
299 * One block only, read-only.
300 * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
301 * For the other flag values, the effect is as if XFS_BMAPI_METADATA
302 * was set and all the others were clear.
303 */
304int /* error */
305xfs_bmapi_single(
306 struct xfs_trans *tp, /* transaction pointer */
307 struct xfs_inode *ip, /* incore inode */
308 int whichfork, /* data or attr fork */
309 xfs_fsblock_t *fsb, /* output: mapped block */
310 xfs_fileoff_t bno); /* starting file offs. mapped */
311
312/*
313 * Unmap (remove) blocks from a file.
314 * If nexts is nonzero then the number of extents to remove is limited to
315 * that value. If not all extents in the block range can be removed then
316 * *done is set.
317 */
318int /* error */
319xfs_bunmapi(
320 struct xfs_trans *tp, /* transaction pointer */
321 struct xfs_inode *ip, /* incore inode */
322 xfs_fileoff_t bno, /* starting offset to unmap */
323 xfs_filblks_t len, /* length to unmap in file */
324 int flags, /* XFS_BMAPI_... */
325 xfs_extnum_t nexts, /* number of extents max */
326 xfs_fsblock_t *firstblock, /* first allocated block
327 controls a.g. for allocs */
328 xfs_bmap_free_t *flist, /* i/o: list extents to free */
329 int *done); /* set if not done yet */
330
331/*
332 * Check an extent list, which has just been read, for
333 * any bit in the extent flag field.
334 */
335int
336xfs_check_nostate_extents(
337 struct xfs_ifork *ifp,
338 xfs_extnum_t idx,
339 xfs_extnum_t num);
340
341uint
342xfs_default_attroffset(
343 struct xfs_inode *ip);

 #ifdef __KERNEL__
-
347/*
348 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
349 * caller. Frees all the extents that need freeing, which must be done
350 * last due to locking considerations.
351 *
352 * Return 1 if the given transaction was committed and a new one allocated,
353 * and 0 otherwise.
354 */
355int /* error */
356xfs_bmap_finish(
357 struct xfs_trans **tp, /* transaction pointer addr */
358 xfs_bmap_free_t *flist, /* i/o: list extents to free */
359 int *committed); /* xact committed or not */
360
 /* bmap to userspace formatter - copy to user & advance pointer */
 typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);

-/*
- * Get inode's extents as described in bmv, and format for output.
- */
-int					/* error code */
-xfs_getbmap(
-	xfs_inode_t		*ip,
-	struct getbmapx		*bmv,	/* user bmap structure */
-	xfs_bmap_format_t	formatter, /* format to user */
-	void			*arg);	/* formatter arg */
-
+int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+		int *committed);
+int	xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
+		xfs_bmap_format_t formatter, void *arg);
+int	xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
+		int whichfork, int *eof);
+int	xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+		int whichfork, int *count);
+int	xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+		xfs_fileoff_t start_fsb, xfs_fileoff_t length);
374/*
375 * Check if the endoff is outside the last extent. If so the caller will grow
376 * the allocation to a stripe unit boundary
377 */
378int
379xfs_bmap_eof(
380 struct xfs_inode *ip,
381 xfs_fileoff_t endoff,
382 int whichfork,
383 int *eof);
384
385/*
386 * Count fsblocks of the given fork.
387 */
388int
389xfs_bmap_count_blocks(
390 xfs_trans_t *tp,
391 struct xfs_inode *ip,
392 int whichfork,
393 int *count);
394
395int
396xfs_bmap_punch_delalloc_range(
397 struct xfs_inode *ip,
398 xfs_fileoff_t start_fsb,
399 xfs_fileoff_t length);
 #endif	/* __KERNEL__ */

 #endif	/* __XFS_BMAP_H__ */
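Taken together, these header changes split the old do-everything xfs_bmapi() into purpose-specific entry points. Based on the call-site conversions elsewhere in this patch, the translation looks roughly like this (placeholder arguments, shown for orientation only):

	/* before: read-only lookup through the multiplexed entry point */
	error = xfs_bmapi(NULL, ip, bno, len, flags, NULL, 0, mval, &nmap, NULL);

	/* after: the read path drops the transaction, firstblock and flist */
	error = xfs_bmapi_read(ip, bno, len, mval, &nmap, flags);

	/*
	 * Allocating writes now go through xfs_bmapi_write() instead of
	 * passing XFS_BMAPI_WRITE, and delayed allocation through
	 * xfs_bmapi_delay() instead of XFS_BMAPI_DELAY.
	 */
	error = xfs_bmapi_write(tp, ip, bno, len, flags, firstblock, total,
				mval, &nmap, flist);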
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index cabf4b5604aa..1f19f03af9d3 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -275,8 +275,7 @@ xfs_btree_dup_cursor(
 				return error;
 			}
 			new->bc_bufs[i] = bp;
-			ASSERT(bp);
-			ASSERT(!XFS_BUF_GETERROR(bp));
+			ASSERT(!xfs_buf_geterror(bp));
 		} else
 			new->bc_bufs[i] = NULL;
 	}
@@ -467,8 +466,7 @@ xfs_btree_get_bufl(
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
 	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
-	ASSERT(bp);
-	ASSERT(!XFS_BUF_GETERROR(bp));
+	ASSERT(!xfs_buf_geterror(bp));
 	return bp;
 }

@@ -491,8 +489,7 @@ xfs_btree_get_bufs(
 	ASSERT(agbno != NULLAGBLOCK);
 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
-	ASSERT(bp);
-	ASSERT(!XFS_BUF_GETERROR(bp));
+	ASSERT(!xfs_buf_geterror(bp));
 	return bp;
 }

@@ -632,9 +629,9 @@ xfs_btree_read_bufl(
 			mp->m_bsize, lock, &bp))) {
 		return error;
 	}
-	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
+	ASSERT(!xfs_buf_geterror(bp));
 	if (bp)
-		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
+		xfs_buf_set_ref(bp, refval);
 	*bpp = bp;
 	return 0;
 }
@@ -942,13 +939,13 @@ xfs_btree_set_refs(
 	switch (cur->bc_btnum) {
 	case XFS_BTNUM_BNO:
 	case XFS_BTNUM_CNT:
-		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
 		break;
 	case XFS_BTNUM_INO:
-		XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
 		break;
 	case XFS_BTNUM_BMAP:
-		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
 		break;
 	default:
 		ASSERT(0);
@@ -973,8 +970,8 @@ xfs_btree_get_buf_block(
 	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
 				 mp->m_bsize, flags);

-	ASSERT(*bpp);
-	ASSERT(!XFS_BUF_GETERROR(*bpp));
+	if (!*bpp)
+		return ENOMEM;

 	*block = XFS_BUF_TO_BLOCK(*bpp);
 	return 0;
@@ -1006,8 +1003,7 @@ xfs_btree_read_buf_block(
 	if (error)
 		return error;

-	ASSERT(*bpp != NULL);
-	ASSERT(!XFS_BUF_GETERROR(*bpp));
+	ASSERT(!xfs_buf_geterror(*bpp));

 	xfs_btree_set_refs(cur, *bpp);
 	*block = XFS_BUF_TO_BLOCK(*bpp);
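Each converted assertion folds the NULL check and the error check into one call, so xfs_buf_geterror() presumably treats a NULL buffer as an error as well. A plausible definition, assuming an inline helper in xfs_buf.h that this diff does not show:

	static inline int xfs_buf_geterror(xfs_buf_t *bp)
	{
		/* assumption: a NULL buffer reads as an allocation failure */
		return bp ? bp->b_error : ENOMEM;
	}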
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 8d05a6a46ce3..5b240de104c0 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -262,7 +262,7 @@ typedef struct xfs_btree_cur
 /*
  * Convert from buffer to btree block header.
  */
-#define	XFS_BUF_TO_BLOCK(bp)	((struct xfs_btree_block *)XFS_BUF_PTR(bp))
+#define	XFS_BUF_TO_BLOCK(bp)	((struct xfs_btree_block *)((bp)->b_addr))


 /*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/xfs_buf.c
index d1fe74506c4c..cf0ac056815f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -43,7 +43,6 @@

 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);

 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
49struct workqueue_struct *xfsdatad_workqueue; 48struct workqueue_struct *xfsdatad_workqueue;
@@ -66,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue;
 #define xb_to_km(flags) \
 	 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)

-#define xfs_buf_allocate(flags) \
-	kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
-#define xfs_buf_deallocate(bp) \
-	kmem_zone_free(xfs_buf_zone, (bp));

 static inline int
 xfs_buf_is_vmapped(
@@ -152,6 +147,7 @@ xfs_buf_stale(
 	struct xfs_buf	*bp)
 {
 	bp->b_flags |= XBF_STALE;
+	xfs_buf_delwri_dequeue(bp);
 	atomic_set(&(bp)->b_lru_ref, 0);
 	if (!list_empty(&bp->b_lru)) {
 		struct xfs_buftarg *btp = bp->b_target;
@@ -167,14 +163,19 @@ xfs_buf_stale(
 	ASSERT(atomic_read(&bp->b_hold) >= 1);
 }

-STATIC void
-_xfs_buf_initialize(
-	xfs_buf_t		*bp,
-	xfs_buftarg_t		*target,
+struct xfs_buf *
+xfs_buf_alloc(
+	struct xfs_buftarg	*target,
 	xfs_off_t		range_base,
 	size_t			range_length,
 	xfs_buf_flags_t		flags)
 {
+	struct xfs_buf		*bp;
+
+	bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags));
+	if (unlikely(!bp))
+		return NULL;
+
 	/*
 	 * We don't want certain flags to appear in b_flags.
 	 */
@@ -203,8 +204,9 @@ _xfs_buf_initialize(
 	init_waitqueue_head(&bp->b_waiters);

 	XFS_STATS_INC(xb_create);
-
 	trace_xfs_buf_init(bp, _RET_IP_);
+
+	return bp;
 }

 /*
@@ -277,7 +279,7 @@ xfs_buf_free(
 	} else if (bp->b_flags & _XBF_KMEM)
 		kmem_free(bp->b_addr);
 	_xfs_buf_free_pages(bp);
-	xfs_buf_deallocate(bp);
+	kmem_zone_free(xfs_buf_zone, bp);
 }

 /*
@@ -416,10 +418,7 @@ _xfs_buf_map_pages(
 /*
  * Look up, and creates if absent, a lockable buffer for
  * a given range of an inode.  The buffer is returned
- * locked.  If other overlapping buffers exist, they are
- * released before the new buffer is created and locked,
- * which may imply that this call will block until those buffers
- * are unlocked.  No I/O is implied by this call.
+ * locked.  No I/O is implied by this call.
  */
 xfs_buf_t *
 _xfs_buf_find(
@@ -481,8 +480,6 @@ _xfs_buf_find(

 	/* No match found */
 	if (new_bp) {
-		_xfs_buf_initialize(new_bp, btp, range_base,
-				range_length, flags);
 		rb_link_node(&new_bp->b_rbnode, parent, rbp);
 		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
 		/* the buffer keeps the perag reference until it is freed */
@@ -525,35 +522,51 @@ found:
 }

 /*
- * Assembles a buffer covering the specified range.
- * Storage in memory for all portions of the buffer will be allocated,
- * although backing storage may not be.
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
  */
-xfs_buf_t *
+struct xfs_buf *
 xfs_buf_get(
 	xfs_buftarg_t		*target,/* target for buffer		*/
 	xfs_off_t		ioff,	/* starting offset of range	*/
 	size_t			isize,	/* length of range		*/
 	xfs_buf_flags_t		flags)
 {
-	xfs_buf_t		*bp, *new_bp;
+	struct xfs_buf		*bp;
+	struct xfs_buf		*new_bp;
 	int			error = 0;

-	new_bp = xfs_buf_allocate(flags);
+	bp = _xfs_buf_find(target, ioff, isize, flags, NULL);
+	if (likely(bp))
+		goto found;
+
+	new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT,
+			       flags);
 	if (unlikely(!new_bp))
 		return NULL;

 	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
+	if (!bp) {
+		kmem_zone_free(xfs_buf_zone, new_bp);
+		return NULL;
+	}
+
 	if (bp == new_bp) {
 		error = xfs_buf_allocate_memory(bp, flags);
 		if (error)
 			goto no_buffer;
-	} else {
-		xfs_buf_deallocate(new_bp);
-		if (unlikely(bp == NULL))
-			return NULL;
-	}
+	} else
+		kmem_zone_free(xfs_buf_zone, new_bp);

+	/*
+	 * Now we have a workable buffer, fill in the block number so
+	 * that we can do IO on it.
+	 */
+	bp->b_bn = ioff;
+	bp->b_count_desired = bp->b_buffer_length;
+
+found:
 	if (!(bp->b_flags & XBF_MAPPED)) {
 		error = _xfs_buf_map_pages(bp, flags);
 		if (unlikely(error)) {
@@ -564,18 +577,10 @@ xfs_buf_get(
 	}

 	XFS_STATS_INC(xb_get);
-
-	/*
-	 * Always fill in the block number now, the mapped cases can do
-	 * their own overlay of this later.
-	 */
-	bp->b_bn = ioff;
-	bp->b_count_desired = bp->b_buffer_length;
-
 	trace_xfs_buf_get(bp, flags, _RET_IP_);
 	return bp;

- no_buffer:
+no_buffer:
 	if (flags & (XBF_LOCK | XBF_TRYLOCK))
 		xfs_buf_unlock(bp);
 	xfs_buf_rele(bp);
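The reordered xfs_buf_get() is a classic optimise-for-hits cache shape: probe first, allocate only on a miss, and discard the preallocated object if the insert finds the cache already populated. A standalone model of that control flow (the names and the trivial cache stub are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct buf { long key; };

static struct buf *cache_find(long key, struct buf *new_buf)
{
	/* stand-in: a real implementation would probe an rbtree and,
	 * when new_buf is non-NULL, insert it on a miss */
	(void)key;
	return new_buf;
}

static struct buf *buf_get(long key)
{
	struct buf *bp, *new_bp;

	bp = cache_find(key, NULL);		/* fast path: cache hit */
	if (bp)
		return bp;

	new_bp = malloc(sizeof(*new_bp));	/* slow path: miss */
	if (!new_bp)
		return NULL;
	new_bp->key = key;

	bp = cache_find(key, new_bp);		/* insert, or find a racer */
	if (bp != new_bp)
		free(new_bp);			/* lost the race: discard */
	return bp;
}

int main(void)
{
	struct buf *bp = buf_get(42);

	printf("%ld\n", bp ? bp->key : -1);
	free(bp);
	return 0;
}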
@@ -596,7 +601,7 @@ _xfs_buf_read(
 	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

 	status = xfs_buf_iorequest(bp);
-	if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC))
+	if (status || bp->b_error || (flags & XBF_ASYNC))
 		return status;
 	return xfs_buf_iowait(bp);
 }
@@ -679,7 +684,6 @@ xfs_buf_read_uncached(
 	/* set up the buffer for a read IO */
 	XFS_BUF_SET_ADDR(bp, daddr);
 	XFS_BUF_READ(bp);
-	XFS_BUF_BUSY(bp);

 	xfsbdstrat(mp, bp);
 	error = xfs_buf_iowait(bp);
@@ -690,19 +694,6 @@ xfs_buf_read_uncached(
 	return bp;
 }

-xfs_buf_t *
-xfs_buf_get_empty(
-	size_t			len,
-	xfs_buftarg_t		*target)
-{
-	xfs_buf_t		*bp;
-
-	bp = xfs_buf_allocate(0);
-	if (bp)
-		_xfs_buf_initialize(bp, target, 0, len, 0);
-	return bp;
-}
-
 /*
  * Return a buffer allocated as an empty buffer and associated to external
  * memory via xfs_buf_associate_memory() back to it's empty state.
@@ -788,10 +779,9 @@ xfs_buf_get_uncached(
 	int			error, i;
 	xfs_buf_t		*bp;

-	bp = xfs_buf_allocate(0);
+	bp = xfs_buf_alloc(target, 0, len, 0);
 	if (unlikely(bp == NULL))
 		goto fail;
-	_xfs_buf_initialize(bp, target, 0, len, 0);

 	error = _xfs_buf_get_pages(bp, page_count, 0);
 	if (error)
@@ -819,7 +809,7 @@ xfs_buf_get_uncached(
 		__free_page(bp->b_pages[i]);
 	_xfs_buf_free_pages(bp);
  fail_free_buf:
-	xfs_buf_deallocate(bp);
+	kmem_zone_free(xfs_buf_zone, bp);
  fail:
 	return NULL;
 }
@@ -938,12 +928,6 @@ void
 xfs_buf_unlock(
 	struct xfs_buf		*bp)
 {
-	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
-		atomic_inc(&bp->b_hold);
-		bp->b_flags |= XBF_ASYNC;
-		xfs_buf_delwri_queue(bp, 0);
-	}
-
 	XB_CLEAR_OWNER(bp);
 	up(&bp->b_sema);

@@ -1020,9 +1004,19 @@ xfs_buf_ioerror(
 	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
 }

+void
+xfs_buf_ioerror_alert(
+	struct xfs_buf		*bp,
+	const char		*func)
+{
+	xfs_alert(bp->b_target->bt_mount,
+"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
+		(__uint64_t)XFS_BUF_ADDR(bp), func,
+		bp->b_error, XFS_BUF_COUNT(bp));
+}
+
 int
 xfs_bwrite(
-	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
 	int			error;
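xfs_buf_ioerror_alert() centralises the metadata I/O error message that callers used to format by hand. Callers converted later in this series presumably report synchronous failures along these lines (a hedged sketch, not a call site from this patch):

	error = xfs_buf_iowait(bp);
	if (error)
		xfs_buf_ioerror_alert(bp, __func__);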
@@ -1034,25 +1028,13 @@ xfs_bwrite(
 	xfs_bdstrat_cb(bp);

 	error = xfs_buf_iowait(bp);
-	if (error)
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-	xfs_buf_relse(bp);
+	if (error) {
+		xfs_force_shutdown(bp->b_target->bt_mount,
+				   SHUTDOWN_META_IO_ERROR);
+	}
 	return error;
 }

-void
-xfs_bdwrite(
-	void			*mp,
-	struct xfs_buf		*bp)
-{
-	trace_xfs_buf_bdwrite(bp, _RET_IP_);
-
-	bp->b_flags &= ~XBF_READ;
-	bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
-
-	xfs_buf_delwri_queue(bp, 1);
-}
-
 /*
  * Called when we want to stop a buffer from getting written or read.
  * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
@@ -1069,15 +1051,14 @@ xfs_bioerror(
 	/*
 	 * No need to wait until the buffer is unpinned, we aren't flushing it.
 	 */
-	XFS_BUF_ERROR(bp, EIO);
+	xfs_buf_ioerror(bp, EIO);

 	/*
 	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
 	 */
 	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_UNDONE(bp);
-	XFS_BUF_STALE(bp);
+	xfs_buf_stale(bp);

 	xfs_buf_ioend(bp, 0);

@@ -1094,7 +1075,7 @@ STATIC int
 xfs_bioerror_relse(
 	struct xfs_buf	*bp)
 {
-	int64_t		fl = XFS_BUF_BFLAGS(bp);
+	int64_t		fl = bp->b_flags;
 	/*
 	 * No need to wait until the buffer is unpinned.
 	 * We aren't flushing it.
@@ -1104,9 +1085,8 @@ xfs_bioerror_relse(
 	 * change that interface.
 	 */
 	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_DONE(bp);
-	XFS_BUF_STALE(bp);
+	xfs_buf_stale(bp);
 	bp->b_iodone = NULL;
 	if (!(fl & XBF_ASYNC)) {
1112 /* 1092 /*
@@ -1115,8 +1095,8 @@ xfs_bioerror_relse(
 		 * There's no reason to mark error for
 		 * ASYNC buffers.
 		 */
-		XFS_BUF_ERROR(bp, EIO);
-		XFS_BUF_FINISH_IOWAIT(bp);
+		xfs_buf_ioerror(bp, EIO);
+		complete(&bp->b_iowait);
 	} else {
 		xfs_buf_relse(bp);
 	}
@@ -1276,15 +1256,10 @@ xfs_buf_iorequest(
 {
 	trace_xfs_buf_iorequest(bp, _RET_IP_);

-	if (bp->b_flags & XBF_DELWRI) {
-		xfs_buf_delwri_queue(bp, 1);
-		return 0;
-	}
+	ASSERT(!(bp->b_flags & XBF_DELWRI));

-	if (bp->b_flags & XBF_WRITE) {
+	if (bp->b_flags & XBF_WRITE)
 		xfs_buf_wait_unpin(bp);
-	}
-
 	xfs_buf_hold(bp);

 	/* Set the count to 1 initially, this will stop an I/O
@@ -1324,7 +1299,7 @@ xfs_buf_offset(
 	struct page		*page;

 	if (bp->b_flags & XBF_MAPPED)
-		return XFS_BUF_PTR(bp) + offset;
+		return bp->b_addr + offset;

 	offset += bp->b_offset;
 	page = bp->b_pages[offset >> PAGE_SHIFT];
@@ -1482,9 +1457,13 @@ xfs_setsize_buftarg_flags(
 	btp->bt_smask = sectorsize - 1;

 	if (set_blocksize(btp->bt_bdev, sectorsize)) {
+		char name[BDEVNAME_SIZE];
+
+		bdevname(btp->bt_bdev, name);
+
 		xfs_warn(btp->bt_mount,
 			"Cannot set_blocksize to %u on device %s\n",
-			sectorsize, XFS_BUFTARG_NAME(btp));
+			sectorsize, name);
 		return EINVAL;
 	}

@@ -1515,12 +1494,12 @@ xfs_setsize_buftarg(
 }

 STATIC int
-xfs_alloc_delwrite_queue(
+xfs_alloc_delwri_queue(
 	xfs_buftarg_t		*btp,
 	const char		*fsname)
 {
-	INIT_LIST_HEAD(&btp->bt_delwrite_queue);
-	spin_lock_init(&btp->bt_delwrite_lock);
+	INIT_LIST_HEAD(&btp->bt_delwri_queue);
+	spin_lock_init(&btp->bt_delwri_lock);
 	btp->bt_flags = 0;
 	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
 	if (IS_ERR(btp->bt_task))
@@ -1550,7 +1529,7 @@ xfs_alloc_buftarg(
 	spin_lock_init(&btp->bt_lru_lock);
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
-	if (xfs_alloc_delwrite_queue(btp, fsname))
+	if (xfs_alloc_delwri_queue(btp, fsname))
 		goto error;
 	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1566,56 +1545,48 @@ error:
 /*
  * Delayed write buffer handling
  */
-STATIC void
+void
 xfs_buf_delwri_queue(
-	xfs_buf_t		*bp,
-	int			unlock)
+	xfs_buf_t		*bp)
 {
-	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
-	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
+	struct xfs_buftarg	*btp = bp->b_target;

 	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

-	ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
+	ASSERT(!(bp->b_flags & XBF_READ));

-	spin_lock(dwlk);
-	/* If already in the queue, dequeue and place at tail */
+	spin_lock(&btp->bt_delwri_lock);
 	if (!list_empty(&bp->b_list)) {
+		/* if already in the queue, move it to the tail */
 		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-		if (unlock)
-			atomic_dec(&bp->b_hold);
-		list_del(&bp->b_list);
-	}
-
-	if (list_empty(dwq)) {
+		list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
+	} else {
 		/* start xfsbufd as it is about to have something to do */
-		wake_up_process(bp->b_target->bt_task);
-	}
+		if (list_empty(&btp->bt_delwri_queue))
+			wake_up_process(bp->b_target->bt_task);

-	bp->b_flags |= _XBF_DELWRI_Q;
-	list_add_tail(&bp->b_list, dwq);
+		atomic_inc(&bp->b_hold);
+		bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
+		list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
+	}
 	bp->b_queuetime = jiffies;
-	spin_unlock(dwlk);
-
-	if (unlock)
-		xfs_buf_unlock(bp);
+	spin_unlock(&btp->bt_delwri_lock);
 }
1603 1575
 void
 xfs_buf_delwri_dequeue(
 	xfs_buf_t		*bp)
 {
-	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
 	int			dequeued = 0;

-	spin_lock(dwlk);
+	spin_lock(&bp->b_target->bt_delwri_lock);
 	if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
 		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
 		list_del_init(&bp->b_list);
 		dequeued = 1;
 	}
 	bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-	spin_unlock(dwlk);
+	spin_unlock(&bp->b_target->bt_delwri_lock);

 	if (dequeued)
 		xfs_buf_rele(bp);
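After this rework the delwri queue owns its reference and flag manipulation: queueing takes a hold and sets XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC itself, re-queueing merely moves the buffer to the tail, and dequeueing drops the hold. A standalone model of that life cycle (illustrative types, no locking):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct buf {
	int hold;	/* reference count */
	bool queued;	/* models _XBF_DELWRI_Q */
};

static void delwri_queue(struct buf *bp)
{
	if (bp->queued)
		return;		/* real code: move to tail of the list */
	bp->hold++;		/* the queue owns one reference */
	bp->queued = true;
}

static void delwri_dequeue(struct buf *bp)
{
	if (!bp->queued)
		return;
	bp->queued = false;
	bp->hold--;		/* drop the queue's reference */
}

int main(void)
{
	struct buf b = { 1, false };

	delwri_queue(&b);
	delwri_queue(&b);	/* second queue: no extra hold taken */
	assert(b.hold == 2);
	delwri_dequeue(&b);
	assert(b.hold == 1);
	printf("hold %d queued %d\n", b.hold, b.queued);
	return 0;
}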
@@ -1647,16 +1618,9 @@ xfs_buf_delwri_promote(
 	if (bp->b_queuetime < jiffies - age)
 		return;
 	bp->b_queuetime = jiffies - age;
-	spin_lock(&btp->bt_delwrite_lock);
-	list_move(&bp->b_list, &btp->bt_delwrite_queue);
-	spin_unlock(&btp->bt_delwrite_lock);
-}
-
-STATIC void
-xfs_buf_runall_queues(
-	struct workqueue_struct	*queue)
-{
-	flush_workqueue(queue);
+	spin_lock(&btp->bt_delwri_lock);
+	list_move(&bp->b_list, &btp->bt_delwri_queue);
+	spin_unlock(&btp->bt_delwri_lock);
 }

 /*
@@ -1670,18 +1634,16 @@ xfs_buf_delwri_split(
 	unsigned long	age)
 {
 	xfs_buf_t	*bp, *n;
-	struct list_head *dwq = &target->bt_delwrite_queue;
-	spinlock_t	*dwlk = &target->bt_delwrite_lock;
 	int		skipped = 0;
 	int		force;

 	force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
 	INIT_LIST_HEAD(list);
-	spin_lock(dwlk);
-	list_for_each_entry_safe(bp, n, dwq, b_list) {
+	spin_lock(&target->bt_delwri_lock);
+	list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
 		ASSERT(bp->b_flags & XBF_DELWRI);

-		if (!XFS_BUF_ISPINNED(bp) && xfs_buf_trylock(bp)) {
+		if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
 			if (!force &&
 			    time_before(jiffies, bp->b_queuetime + age)) {
 				xfs_buf_unlock(bp);
@@ -1695,10 +1657,9 @@ xfs_buf_delwri_split(
 		} else
 			skipped++;
 	}
-	spin_unlock(dwlk);

+	spin_unlock(&target->bt_delwri_lock);
 	return skipped;
-
 }

 /*
@@ -1748,7 +1709,7 @@ xfsbufd(
 		}

 		/* sleep for a long time if there is nothing to do. */
-		if (list_empty(&target->bt_delwrite_queue))
+		if (list_empty(&target->bt_delwri_queue))
 			tout = MAX_SCHEDULE_TIMEOUT;
 		schedule_timeout_interruptible(tout);

@@ -1784,9 +1745,7 @@ xfs_flush_buftarg(
1784 LIST_HEAD(wait_list); 1745 LIST_HEAD(wait_list);
1785 struct blk_plug plug; 1746 struct blk_plug plug;
1786 1747
1787 xfs_buf_runall_queues(xfsconvertd_workqueue); 1748 flush_workqueue(xfslogd_workqueue);
1788 xfs_buf_runall_queues(xfsdatad_workqueue);
1789 xfs_buf_runall_queues(xfslogd_workqueue);
1790 1749
1791 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1750 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1792 pincount = xfs_buf_delwri_split(target, &tmp_list, 0); 1751 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
@@ -1867,11 +1826,3 @@ xfs_buf_terminate(void)
1867 destroy_workqueue(xfslogd_workqueue); 1826 destroy_workqueue(xfslogd_workqueue);
1868 kmem_zone_destroy(xfs_buf_zone); 1827 kmem_zone_destroy(xfs_buf_zone);
1869} 1828}
1870
1871#ifdef CONFIG_KDB_MODULES
1872struct list_head *
1873xfs_get_buftarg_list(void)
1874{
1875 return &xfs_buftarg_list;
1876}
1877#endif
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/xfs_buf.h
index 6a83b46b4bcf..5bab046e859f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -105,8 +105,8 @@ typedef struct xfs_buftarg {
105 105
106 /* per device delwri queue */ 106 /* per device delwri queue */
107 struct task_struct *bt_task; 107 struct task_struct *bt_task;
108 struct list_head bt_delwrite_queue; 108 struct list_head bt_delwri_queue;
109 spinlock_t bt_delwrite_lock; 109 spinlock_t bt_delwri_lock;
110 unsigned long bt_flags; 110 unsigned long bt_flags;
111 111
112 /* LRU control structures */ 112 /* LRU control structures */
@@ -175,7 +175,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
175extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, 175extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
176 xfs_buf_flags_t); 176 xfs_buf_flags_t);
177 177
178extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 178struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t,
179 xfs_buf_flags_t);
179extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); 180extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
180extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); 181extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
181extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 182extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
@@ -197,14 +198,14 @@ extern void xfs_buf_unlock(xfs_buf_t *);
197 ((bp)->b_sema.count <= 0) 198 ((bp)->b_sema.count <= 0)
198 199
199/* Buffer Read and Write Routines */ 200/* Buffer Read and Write Routines */
200extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); 201extern int xfs_bwrite(struct xfs_buf *bp);
201extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
202 202
203extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); 203extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
204extern int xfs_bdstrat_cb(struct xfs_buf *); 204extern int xfs_bdstrat_cb(struct xfs_buf *);
205 205
206extern void xfs_buf_ioend(xfs_buf_t *, int); 206extern void xfs_buf_ioend(xfs_buf_t *, int);
207extern void xfs_buf_ioerror(xfs_buf_t *, int); 207extern void xfs_buf_ioerror(xfs_buf_t *, int);
208extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
208extern int xfs_buf_iorequest(xfs_buf_t *); 209extern int xfs_buf_iorequest(xfs_buf_t *);
209extern int xfs_buf_iowait(xfs_buf_t *); 210extern int xfs_buf_iowait(xfs_buf_t *);
210extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 211extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
@@ -221,53 +222,32 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
221extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 222extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
222 223
223/* Delayed Write Buffer Routines */ 224/* Delayed Write Buffer Routines */
224extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 225extern void xfs_buf_delwri_queue(struct xfs_buf *);
225extern void xfs_buf_delwri_promote(xfs_buf_t *); 226extern void xfs_buf_delwri_dequeue(struct xfs_buf *);
227extern void xfs_buf_delwri_promote(struct xfs_buf *);
226 228
227/* Buffer Daemon Setup Routines */ 229/* Buffer Daemon Setup Routines */
228extern int xfs_buf_init(void); 230extern int xfs_buf_init(void);
229extern void xfs_buf_terminate(void); 231extern void xfs_buf_terminate(void);
230 232
231#define xfs_buf_target_name(target) \
232 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
233
234
235#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
236#define XFS_BUF_ZEROFLAGS(bp) \ 233#define XFS_BUF_ZEROFLAGS(bp) \
237 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ 234 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
238 XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) 235 XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
239 236
240void xfs_buf_stale(struct xfs_buf *bp); 237void xfs_buf_stale(struct xfs_buf *bp);
241#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
242#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 238#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
243#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 239#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
244#define XFS_BUF_SUPER_STALE(bp) do { \
245 XFS_BUF_STALE(bp); \
246 xfs_buf_delwri_dequeue(bp); \
247 XFS_BUF_DONE(bp); \
248 } while (0)
249
250#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
251#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
252#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
253 240
254#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) 241#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
255#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp)
256#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0)
257 242
258#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) 243#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
259#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) 244#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
260#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) 245#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
261 246
262#define XFS_BUF_BUSY(bp) do { } while (0)
263#define XFS_BUF_UNBUSY(bp) do { } while (0)
264#define XFS_BUF_ISBUSY(bp) (1)
265
266#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) 247#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
267#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) 248#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
268#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) 249#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
269 250
270#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
271#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) 251#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
272#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) 252#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
273#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) 253#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
@@ -276,10 +256,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
276#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) 256#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
277#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) 257#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
278 258
279#define XFS_BUF_SET_START(bp) do { } while (0)
280
281#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
282#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
283#define XFS_BUF_ADDR(bp) ((bp)->b_bn) 259#define XFS_BUF_ADDR(bp) ((bp)->b_bn)
284#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) 260#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
285#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) 261#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
@@ -289,23 +265,15 @@ void xfs_buf_stale(struct xfs_buf *bp);
289#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 265#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
290#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 266#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
291 267
292static inline void 268static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
293xfs_buf_set_ref(
294 struct xfs_buf *bp,
295 int lru_ref)
296{ 269{
297 atomic_set(&bp->b_lru_ref, lru_ref); 270 atomic_set(&bp->b_lru_ref, lru_ref);
298} 271}
299#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
300#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
301
302#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
303 272
304#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); 273static inline int xfs_buf_ispinned(struct xfs_buf *bp)
305 274{
306#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) 275 return atomic_read(&bp->b_pin_count);
307#define XFS_BUF_TARGET(bp) ((bp)->b_target) 276}
308#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
309 277
310static inline void xfs_buf_relse(xfs_buf_t *bp) 278static inline void xfs_buf_relse(xfs_buf_t *bp)
311{ 279{
@@ -323,14 +291,7 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *);
323extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 291extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
324extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 292extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
325 293
326#ifdef CONFIG_KDB_MODULES
327extern struct list_head *xfs_get_buftarg_list(void);
328#endif
329
330#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 294#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
331#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 295#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
332 296
333#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1)
334#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1)
335
336#endif /* __XFS_BUF_H__ */ 297#endif /* __XFS_BUF_H__ */
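
Note: the header changes above replace flag-peeking macros with typed helpers. A minimal sketch of the difference, assuming the struct layout shown in the diff (the _OLD name is hypothetical; this is not the in-tree header):

	/* the old macro accepts anything with a b_pin_count member */
	#define XFS_BUF_ISPINNED_OLD(bp)	atomic_read(&((bp)->b_pin_count))

	/* the inline helper type-checks bp and shows up in debuggers */
	static inline int xfs_buf_ispinned_sketch(struct xfs_buf *bp)
	{
		return atomic_read(&bp->b_pin_count);
	}
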
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 88492916c3dc..1a3513881bce 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -124,9 +124,9 @@ xfs_buf_item_log_check(
124 124
125 bp = bip->bli_buf; 125 bp = bip->bli_buf;
126 ASSERT(XFS_BUF_COUNT(bp) > 0); 126 ASSERT(XFS_BUF_COUNT(bp) > 0);
127 ASSERT(XFS_BUF_PTR(bp) != NULL); 127 ASSERT(bp->b_addr != NULL);
128 orig = bip->bli_orig; 128 orig = bip->bli_orig;
129 buffer = XFS_BUF_PTR(bp); 129 buffer = bp->b_addr;
130 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 130 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
131 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { 131 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
132 xfs_emerg(bp->b_mount, 132 xfs_emerg(bp->b_mount,
@@ -371,7 +371,6 @@ xfs_buf_item_pin(
371{ 371{
372 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 372 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
373 373
374 ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
375 ASSERT(atomic_read(&bip->bli_refcount) > 0); 374 ASSERT(atomic_read(&bip->bli_refcount) > 0);
376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 375 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
377 (bip->bli_flags & XFS_BLI_STALE)); 376 (bip->bli_flags & XFS_BLI_STALE));
@@ -479,13 +478,13 @@ xfs_buf_item_trylock(
479 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 478 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
480 struct xfs_buf *bp = bip->bli_buf; 479 struct xfs_buf *bp = bip->bli_buf;
481 480
482 if (XFS_BUF_ISPINNED(bp)) 481 if (xfs_buf_ispinned(bp))
483 return XFS_ITEM_PINNED; 482 return XFS_ITEM_PINNED;
484 if (!xfs_buf_trylock(bp)) 483 if (!xfs_buf_trylock(bp))
485 return XFS_ITEM_LOCKED; 484 return XFS_ITEM_LOCKED;
486 485
487 /* take a reference to the buffer. */ 486 /* take a reference to the buffer. */
488 XFS_BUF_HOLD(bp); 487 xfs_buf_hold(bp);
489 488
490 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 489 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
491 trace_xfs_buf_item_trylock(bip); 490 trace_xfs_buf_item_trylock(bip);
@@ -630,7 +629,7 @@ xfs_buf_item_push(
630 * the xfsbufd to get this buffer written. We have to unlock the buffer 629 * the xfsbufd to get this buffer written. We have to unlock the buffer
631 * to allow the xfsbufd to write it, too. 630 * to allow the xfsbufd to write it, too.
632 */ 631 */
633STATIC void 632STATIC bool
634xfs_buf_item_pushbuf( 633xfs_buf_item_pushbuf(
635 struct xfs_log_item *lip) 634 struct xfs_log_item *lip)
636{ 635{
@@ -644,6 +643,7 @@ xfs_buf_item_pushbuf(
644 643
645 xfs_buf_delwri_promote(bp); 644 xfs_buf_delwri_promote(bp);
646 xfs_buf_relse(bp); 645 xfs_buf_relse(bp);
646 return true;
647} 647}
648 648
649STATIC void 649STATIC void
@@ -726,7 +726,7 @@ xfs_buf_item_init(
726 * to have logged. 726 * to have logged.
727 */ 727 */
728 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); 728 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
729 memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); 729 memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp));
730 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); 730 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
731#endif 731#endif
732 732
@@ -895,7 +895,6 @@ xfs_buf_attach_iodone(
895{ 895{
896 xfs_log_item_t *head_lip; 896 xfs_log_item_t *head_lip;
897 897
898 ASSERT(XFS_BUF_ISBUSY(bp));
899 ASSERT(xfs_buf_islocked(bp)); 898 ASSERT(xfs_buf_islocked(bp));
900 899
901 lip->li_cb = cb; 900 lip->li_cb = cb;
@@ -960,7 +959,7 @@ xfs_buf_iodone_callbacks(
960 static ulong lasttime; 959 static ulong lasttime;
961 static xfs_buftarg_t *lasttarg; 960 static xfs_buftarg_t *lasttarg;
962 961
963 if (likely(!XFS_BUF_GETERROR(bp))) 962 if (likely(!xfs_buf_geterror(bp)))
964 goto do_callbacks; 963 goto do_callbacks;
965 964
966 /* 965 /*
@@ -968,19 +967,18 @@ xfs_buf_iodone_callbacks(
968 * I/O errors, there's no point in giving this a retry. 967 * I/O errors, there's no point in giving this a retry.
969 */ 968 */
970 if (XFS_FORCED_SHUTDOWN(mp)) { 969 if (XFS_FORCED_SHUTDOWN(mp)) {
971 XFS_BUF_SUPER_STALE(bp); 970 xfs_buf_stale(bp);
971 XFS_BUF_DONE(bp);
972 trace_xfs_buf_item_iodone(bp, _RET_IP_); 972 trace_xfs_buf_item_iodone(bp, _RET_IP_);
973 goto do_callbacks; 973 goto do_callbacks;
974 } 974 }
975 975
976 if (XFS_BUF_TARGET(bp) != lasttarg || 976 if (bp->b_target != lasttarg ||
977 time_after(jiffies, (lasttime + 5*HZ))) { 977 time_after(jiffies, (lasttime + 5*HZ))) {
978 lasttime = jiffies; 978 lasttime = jiffies;
979 xfs_alert(mp, "Device %s: metadata write error block 0x%llx", 979 xfs_buf_ioerror_alert(bp, __func__);
980 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
981 (__uint64_t)XFS_BUF_ADDR(bp));
982 } 980 }
983 lasttarg = XFS_BUF_TARGET(bp); 981 lasttarg = bp->b_target;
984 982
985 /* 983 /*
986 * If the write was asynchronous then no one will be looking for the 984 * If the write was asynchronous then no one will be looking for the
@@ -991,12 +989,11 @@ xfs_buf_iodone_callbacks(
991 * around. 989 * around.
992 */ 990 */
993 if (XFS_BUF_ISASYNC(bp)) { 991 if (XFS_BUF_ISASYNC(bp)) {
994 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ 992 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
995 993
996 if (!XFS_BUF_ISSTALE(bp)) { 994 if (!XFS_BUF_ISSTALE(bp)) {
997 XFS_BUF_DELAYWRITE(bp); 995 xfs_buf_delwri_queue(bp);
998 XFS_BUF_DONE(bp); 996 XFS_BUF_DONE(bp);
999 XFS_BUF_SET_START(bp);
1000 } 997 }
1001 ASSERT(bp->b_iodone != NULL); 998 ASSERT(bp->b_iodone != NULL);
1002 trace_xfs_buf_item_iodone_async(bp, _RET_IP_); 999 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
@@ -1008,12 +1005,10 @@ xfs_buf_iodone_callbacks(
1008 * If the write of the buffer was synchronous, we want to make 1005 * If the write of the buffer was synchronous, we want to make
1009 * sure to return the error to the caller of xfs_bwrite(). 1006 * sure to return the error to the caller of xfs_bwrite().
1010 */ 1007 */
1011 XFS_BUF_STALE(bp); 1008 xfs_buf_stale(bp);
1012 XFS_BUF_DONE(bp); 1009 XFS_BUF_DONE(bp);
1013 XFS_BUF_UNDELAYWRITE(bp);
1014 1010
1015 trace_xfs_buf_error_relse(bp, _RET_IP_); 1011 trace_xfs_buf_error_relse(bp, _RET_IP_);
1016 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1017 1012
1018do_callbacks: 1013do_callbacks:
1019 xfs_buf_do_callbacks(bp); 1014 xfs_buf_do_callbacks(bp);
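
Note: the shutdown path above open-codes the removed XFS_BUF_SUPER_STALE() macro. A sketch of the resulting shape, assuming (per the dropped XFS_BUF_UNDELAYWRITE call) that xfs_buf_stale() now handles the delwri dequeue itself:

	static void iodone_shutdown_sketch(struct xfs_buf *bp)
	{
		xfs_buf_stale(bp);	/* mark stale; assumed to also dequeue delwri */
		XFS_BUF_DONE(bp);	/* complete the buffer so waiters make progress */
	}
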
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 5bfcb8779f9f..77c74257c2a3 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1578,9 +1578,8 @@ xfs_da_grow_inode_int(
1578 */ 1578 */
1579 nmap = 1; 1579 nmap = 1;
1580 ASSERT(args->firstblock != NULL); 1580 ASSERT(args->firstblock != NULL);
1581 error = xfs_bmapi(tp, dp, *bno, count, 1581 error = xfs_bmapi_write(tp, dp, *bno, count,
1582 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| 1582 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
1583 XFS_BMAPI_CONTIG,
1584 args->firstblock, args->total, &map, &nmap, 1583 args->firstblock, args->total, &map, &nmap,
1585 args->flist); 1584 args->flist);
1586 if (error) 1585 if (error)
@@ -1602,9 +1601,8 @@ xfs_da_grow_inode_int(
1602 for (b = *bno, mapi = 0; b < *bno + count; ) { 1601 for (b = *bno, mapi = 0; b < *bno + count; ) {
1603 nmap = MIN(XFS_BMAP_MAX_NMAP, count); 1602 nmap = MIN(XFS_BMAP_MAX_NMAP, count);
1604 c = (int)(*bno + count - b); 1603 c = (int)(*bno + count - b);
1605 error = xfs_bmapi(tp, dp, b, c, 1604 error = xfs_bmapi_write(tp, dp, b, c,
1606 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| 1605 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
1607 XFS_BMAPI_METADATA,
1608 args->firstblock, args->total, 1606 args->firstblock, args->total,
1609 &mapp[mapi], &nmap, args->flist); 1607 &mapp[mapi], &nmap, args->flist);
1610 if (error) 1608 if (error)
@@ -1975,33 +1973,16 @@ xfs_da_do_buf(
1975 /* 1973 /*
1976 * Optimize the one-block case. 1974 * Optimize the one-block case.
1977 */ 1975 */
1978 if (nfsb == 1) { 1976 if (nfsb == 1)
1979 xfs_fsblock_t fsb;
1980
1981 if ((error =
1982 xfs_bmapi_single(trans, dp, whichfork, &fsb,
1983 (xfs_fileoff_t)bno))) {
1984 return error;
1985 }
1986 mapp = &map; 1977 mapp = &map;
1987 if (fsb == NULLFSBLOCK) { 1978 else
1988 nmap = 0;
1989 } else {
1990 map.br_startblock = fsb;
1991 map.br_startoff = (xfs_fileoff_t)bno;
1992 map.br_blockcount = 1;
1993 nmap = 1;
1994 }
1995 } else {
1996 mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); 1979 mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
1997 nmap = nfsb; 1980
1998 if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, 1981 nmap = nfsb;
1999 nfsb, 1982 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp,
2000 XFS_BMAPI_METADATA | 1983 &nmap, xfs_bmapi_aflag(whichfork));
2001 xfs_bmapi_aflag(whichfork), 1984 if (error)
2002 NULL, 0, mapp, &nmap, NULL))) 1985 goto exit0;
2003 goto exit0;
2004 }
2005 } else { 1986 } else {
2006 map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); 1987 map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
2007 map.br_startoff = (xfs_fileoff_t)bno; 1988 map.br_startoff = (xfs_fileoff_t)bno;
@@ -2050,7 +2031,7 @@ xfs_da_do_buf(
2050 case 0: 2031 case 0:
2051 bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, 2032 bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
2052 mappedbno, nmapped, 0); 2033 mappedbno, nmapped, 0);
2053 error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); 2034 error = bp ? bp->b_error : XFS_ERROR(EIO);
2054 break; 2035 break;
2055 case 1: 2036 case 1:
2056 case 2: 2037 case 2:
@@ -2072,13 +2053,10 @@ xfs_da_do_buf(
2072 if (!bp) 2053 if (!bp)
2073 continue; 2054 continue;
2074 if (caller == 1) { 2055 if (caller == 1) {
2075 if (whichfork == XFS_ATTR_FORK) { 2056 if (whichfork == XFS_ATTR_FORK)
2076 XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, 2057 xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
2077 XFS_ATTR_BTREE_REF); 2058 else
2078 } else { 2059 xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
2079 XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
2080 XFS_DIR_BTREE_REF);
2081 }
2082 } 2060 }
2083 if (bplist) { 2061 if (bplist) {
2084 bplist[nbplist++] = bp; 2062 bplist[nbplist++] = bp;
@@ -2268,7 +2246,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
2268 dabuf->nbuf = 1; 2246 dabuf->nbuf = 1;
2269 bp = bps[0]; 2247 bp = bps[0];
2270 dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); 2248 dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
2271 dabuf->data = XFS_BUF_PTR(bp); 2249 dabuf->data = bp->b_addr;
2272 dabuf->bps[0] = bp; 2250 dabuf->bps[0] = bp;
2273 } else { 2251 } else {
2274 dabuf->nbuf = nbuf; 2252 dabuf->nbuf = nbuf;
@@ -2279,7 +2257,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
2279 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); 2257 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
2280 for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { 2258 for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
2281 bp = bps[i]; 2259 bp = bps[i];
2282 memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), 2260 memcpy((char *)dabuf->data + off, bp->b_addr,
2283 XFS_BUF_COUNT(bp)); 2261 XFS_BUF_COUNT(bp));
2284 } 2262 }
2285 } 2263 }
@@ -2302,8 +2280,8 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf)
2302 for (i = off = 0; i < dabuf->nbuf; 2280 for (i = off = 0; i < dabuf->nbuf;
2303 i++, off += XFS_BUF_COUNT(bp)) { 2281 i++, off += XFS_BUF_COUNT(bp)) {
2304 bp = dabuf->bps[i]; 2282 bp = dabuf->bps[i];
2305 memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, 2283 memcpy(bp->b_addr, dabuf->data + off,
2306 XFS_BUF_COUNT(bp)); 2284 XFS_BUF_COUNT(bp));
2307 } 2285 }
2308 } 2286 }
2309} 2287}
@@ -2340,7 +2318,7 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
2340 2318
2341 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); 2319 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2342 if (dabuf->nbuf == 1) { 2320 if (dabuf->nbuf == 1) {
2343 ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); 2321 ASSERT(dabuf->data == dabuf->bps[0]->b_addr);
2344 xfs_trans_log_buf(tp, dabuf->bps[0], first, last); 2322 xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
2345 return; 2323 return;
2346 } 2324 }
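
Note: read-only mapping lookups in this file now go through xfs_bmapi_read(), which drops the transaction, firstblock and free-list arguments of the old xfs_bmapi(). A sketch built from the call sites above (the function name is illustrative):

	static int da_map_read_sketch(struct xfs_inode *dp, xfs_dablk_t bno,
				      int whichfork)
	{
		struct xfs_bmbt_irec	map;
		int			nmap = 1;

		/* no tp, no firstblock/flist: a lookup cannot allocate */
		return xfs_bmapi_read(dp, (xfs_fileoff_t)bno, 1, &map, &nmap,
				      xfs_bmapi_aflag(whichfork));
	}
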
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 9a84a85c03b1..654dc6f05bac 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -425,8 +425,8 @@ xfs_swap_extents(
425 } 425 }
426 426
427 427
428 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 428 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
429 xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 429 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
430 430
431 xfs_trans_log_inode(tp, ip, ilf_fields); 431 xfs_trans_log_inode(tp, ip, ilf_fields);
432 xfs_trans_log_inode(tp, tip, tilf_fields); 432 xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -438,7 +438,7 @@ xfs_swap_extents(
438 if (mp->m_flags & XFS_MOUNT_WSYNC) 438 if (mp->m_flags & XFS_MOUNT_WSYNC)
439 xfs_trans_set_sync(tp); 439 xfs_trans_set_sync(tp);
440 440
441 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 441 error = xfs_trans_commit(tp, 0);
442 442
443 trace_xfs_swap_extent_after(ip, 0); 443 trace_xfs_swap_extent_after(ip, 0);
444 trace_xfs_swap_extent_after(tip, 1); 444 trace_xfs_swap_extent_after(tip, 1);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index dffba9ba0db6..a3721633abc8 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -148,7 +148,7 @@ typedef enum xfs_dinode_fmt {
148 be32_to_cpu((dip)->di_nextents) : \ 148 be32_to_cpu((dip)->di_nextents) : \
149 be16_to_cpu((dip)->di_anextents)) 149 be16_to_cpu((dip)->di_anextents))
150 150
151#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) 151#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr))
152 152
153/* 153/*
154 * For block and character special files the 32bit dev_t is stored at the 154 * For block and character special files the 32bit dev_t is stored at the
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ca2386d82cdf..66e108f561a3 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents(
888 * we already have in the table. 888 * we already have in the table.
889 */ 889 */
890 nmap = map_size - map_valid; 890 nmap = map_size - map_valid;
891 error = xfs_bmapi(NULL, dp, 891 error = xfs_bmapi_read(dp, map_off,
892 map_off,
893 xfs_dir2_byte_to_da(mp, 892 xfs_dir2_byte_to_da(mp,
894 XFS_DIR2_LEAF_OFFSET) - map_off, 893 XFS_DIR2_LEAF_OFFSET) - map_off,
895 XFS_BMAPI_METADATA, NULL, 0, 894 &map[map_valid], &nmap, 0);
896 &map[map_valid], &nmap, NULL);
897 /* 895 /*
898 * Don't know if we should ignore this or 896 * Don't know if we should ignore this or
899 * try to return an error. 897 * try to return an error.
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/xfs_discard.c
index 244e797dae32..8a24f0c6c860 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -38,7 +38,7 @@ xfs_trim_extents(
38 struct xfs_mount *mp, 38 struct xfs_mount *mp,
39 xfs_agnumber_t agno, 39 xfs_agnumber_t agno,
40 xfs_fsblock_t start, 40 xfs_fsblock_t start,
41 xfs_fsblock_t len, 41 xfs_fsblock_t end,
42 xfs_fsblock_t minlen, 42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed) 43 __uint64_t *blocks_trimmed)
44{ 44{
@@ -100,7 +100,7 @@ xfs_trim_extents(
100 * down partially overlapping ranges for now. 100 * down partially overlapping ranges for now.
101 */ 101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || 102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { 103 XFS_AGB_TO_FSB(mp, agno, fbno) > end) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen); 104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent; 105 goto next_extent;
106 } 106 }
@@ -145,7 +145,7 @@ xfs_ioc_trim(
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; 145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity; 146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range; 147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen; 148 xfs_fsblock_t start, end, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno; 149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0; 150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0; 151 int error, last_error = 0;
@@ -165,19 +165,19 @@ xfs_ioc_trim(
165 * matter as trimming blocks is an advisory interface. 165 * matter as trimming blocks is an advisory interface.
166 */ 166 */
167 start = XFS_B_TO_FSBT(mp, range.start); 167 start = XFS_B_TO_FSBT(mp, range.start);
168 len = XFS_B_TO_FSBT(mp, range.len); 168 end = start + XFS_B_TO_FSBT(mp, range.len) - 1;
169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); 169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
170 170
171 start_agno = XFS_FSB_TO_AGNO(mp, start); 171 if (start >= mp->m_sb.sb_dblocks)
172 if (start_agno >= mp->m_sb.sb_agcount)
173 return -XFS_ERROR(EINVAL); 172 return -XFS_ERROR(EINVAL);
173 if (end > mp->m_sb.sb_dblocks - 1)
174 end = mp->m_sb.sb_dblocks - 1;
174 175
175 end_agno = XFS_FSB_TO_AGNO(mp, start + len); 176 start_agno = XFS_FSB_TO_AGNO(mp, start);
176 if (end_agno >= mp->m_sb.sb_agcount) 177 end_agno = XFS_FSB_TO_AGNO(mp, end);
177 end_agno = mp->m_sb.sb_agcount - 1;
178 178
179 for (agno = start_agno; agno <= end_agno; agno++) { 179 for (agno = start_agno; agno <= end_agno; agno++) {
180 error = -xfs_trim_extents(mp, agno, start, len, minlen, 180 error = -xfs_trim_extents(mp, agno, start, end, minlen,
181 &blocks_trimmed); 181 &blocks_trimmed);
182 if (error) 182 if (error)
183 last_error = error; 183 last_error = error;
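
Note: the FITRIM conversion above switches from a (start, len) pair to an inclusive end block, which makes the clamping explicit: an out-of-range start is an error, while a range running past the end of the filesystem is silently clamped because trimming is advisory. A sketch of just that computation (function name illustrative):

	static int trim_clamp_sketch(struct xfs_mount *mp, struct fstrim_range *range)
	{
		xfs_fsblock_t	start, end;

		start = XFS_B_TO_FSBT(mp, range->start);
		end = start + XFS_B_TO_FSBT(mp, range->len) - 1;	/* inclusive */

		if (start >= mp->m_sb.sb_dblocks)
			return -EINVAL;			/* starts past the fs */
		if (end > mp->m_sb.sb_dblocks - 1)
			end = mp->m_sb.sb_dblocks - 1;	/* clamp, don't fail */

		/* trim each AG from XFS_FSB_TO_AGNO(mp, start) to XFS_FSB_TO_AGNO(mp, end) */
		return 0;
	}
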
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/xfs_discard.h
index 344879aea646..344879aea646 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 837f31158d43..25d7280e9f6b 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -318,10 +318,9 @@ xfs_qm_init_dquot_blk(
318 int curid, i; 318 int curid, i;
319 319
320 ASSERT(tp); 320 ASSERT(tp);
321 ASSERT(XFS_BUF_ISBUSY(bp));
322 ASSERT(xfs_buf_islocked(bp)); 321 ASSERT(xfs_buf_islocked(bp));
323 322
324 d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); 323 d = bp->b_addr;
325 324
326 /* 325 /*
327 * ID of the first dquot in the block - id's are zero based. 326 * ID of the first dquot in the block - id's are zero based.
@@ -378,16 +377,14 @@ xfs_qm_dqalloc(
378 return (ESRCH); 377 return (ESRCH);
379 } 378 }
380 379
381 xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); 380 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
382 nmaps = 1; 381 nmaps = 1;
383 if ((error = xfs_bmapi(tp, quotip, 382 error = xfs_bmapi_write(tp, quotip, offset_fsb,
384 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, 383 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
385 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, 384 &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
386 &firstblock, 385 &map, &nmaps, &flist);
387 XFS_QM_DQALLOC_SPACE_RES(mp), 386 if (error)
388 &map, &nmaps, &flist))) {
389 goto error0; 387 goto error0;
390 }
391 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); 388 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
392 ASSERT(nmaps == 1); 389 ASSERT(nmaps == 1);
393 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 390 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -403,8 +400,11 @@ xfs_qm_dqalloc(
403 dqp->q_blkno, 400 dqp->q_blkno,
404 mp->m_quotainfo->qi_dqchunklen, 401 mp->m_quotainfo->qi_dqchunklen,
405 0); 402 0);
406 if (!bp || (error = XFS_BUF_GETERROR(bp))) 403
404 error = xfs_buf_geterror(bp);
405 if (error)
407 goto error1; 406 goto error1;
407
408 /* 408 /*
409 * Make a chunk of dquots out of this buffer and log 409 * Make a chunk of dquots out of this buffer and log
410 * the entire thing. 410 * the entire thing.
@@ -486,9 +486,8 @@ xfs_qm_dqtobp(
486 /* 486 /*
487 * Find the block map; no allocations yet 487 * Find the block map; no allocations yet
488 */ 488 */
489 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 489 error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
490 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 490 XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
491 NULL, 0, &map, &nmaps, NULL);
492 491
493 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 492 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
494 if (error) 493 if (error)
@@ -534,13 +533,12 @@ xfs_qm_dqtobp(
534 return XFS_ERROR(error); 533 return XFS_ERROR(error);
535 } 534 }
536 535
537 ASSERT(XFS_BUF_ISBUSY(bp));
538 ASSERT(xfs_buf_islocked(bp)); 536 ASSERT(xfs_buf_islocked(bp));
539 537
540 /* 538 /*
541 * calculate the location of the dquot inside the buffer. 539 * calculate the location of the dquot inside the buffer.
542 */ 540 */
543 ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); 541 ddq = bp->b_addr + dqp->q_bufoffset;
544 542
545 /* 543 /*
546 * A simple sanity check in case we got a corrupted dquot... 544 * A simple sanity check in case we got a corrupted dquot...
@@ -553,7 +551,6 @@ xfs_qm_dqtobp(
553 xfs_trans_brelse(tp, bp); 551 xfs_trans_brelse(tp, bp);
554 return XFS_ERROR(EIO); 552 return XFS_ERROR(EIO);
555 } 553 }
556 XFS_BUF_BUSY(bp); /* We dirtied this */
557 } 554 }
558 555
559 *O_bpp = bp; 556 *O_bpp = bp;
@@ -608,7 +605,7 @@ xfs_qm_dqread(
608 dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); 605 dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
609 606
610 /* Mark the buf so that this will stay incore a little longer */ 607 /* Mark the buf so that this will stay incore a little longer */
611 XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); 608 xfs_buf_set_ref(bp, XFS_DQUOT_REF);
612 609
613 /* 610 /*
614 * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) 611 * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
@@ -622,7 +619,6 @@ xfs_qm_dqread(
622 * this particular dquot was repaired. We still aren't afraid to 619 * this particular dquot was repaired. We still aren't afraid to
623 * brelse it because we have the changes incore. 620 * brelse it because we have the changes incore.
624 */ 621 */
625 ASSERT(XFS_BUF_ISBUSY(bp));
626 ASSERT(xfs_buf_islocked(bp)); 622 ASSERT(xfs_buf_islocked(bp));
627 xfs_trans_brelse(tp, bp); 623 xfs_trans_brelse(tp, bp);
628 624
@@ -1204,7 +1200,7 @@ xfs_qm_dqflush(
1204 /* 1200 /*
1205 * Calculate the location of the dquot inside the buffer. 1201 * Calculate the location of the dquot inside the buffer.
1206 */ 1202 */
1207 ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); 1203 ddqp = bp->b_addr + dqp->q_bufoffset;
1208 1204
1209 /* 1205 /*
1210 * A simple sanity check in case we got a corrupted dquot.. 1206 * A simple sanity check in case we got a corrupted dquot..
@@ -1240,15 +1236,17 @@ xfs_qm_dqflush(
1240 * If the buffer is pinned then push on the log so we won't 1236 * If the buffer is pinned then push on the log so we won't
1241 * get stuck waiting in the write for too long. 1237 * get stuck waiting in the write for too long.
1242 */ 1238 */
1243 if (XFS_BUF_ISPINNED(bp)) { 1239 if (xfs_buf_ispinned(bp)) {
1244 trace_xfs_dqflush_force(dqp); 1240 trace_xfs_dqflush_force(dqp);
1245 xfs_log_force(mp, 0); 1241 xfs_log_force(mp, 0);
1246 } 1242 }
1247 1243
1248 if (flags & SYNC_WAIT) 1244 if (flags & SYNC_WAIT)
1249 error = xfs_bwrite(mp, bp); 1245 error = xfs_bwrite(bp);
1250 else 1246 else
1251 xfs_bdwrite(mp, bp); 1247 xfs_buf_delwri_queue(bp);
1248
1249 xfs_buf_relse(bp);
1252 1250
1253 trace_xfs_dqflush_done(dqp); 1251 trace_xfs_dqflush_done(dqp);
1254 1252
@@ -1447,7 +1445,7 @@ xfs_qm_dqflock_pushbuf_wait(
1447 goto out_lock; 1445 goto out_lock;
1448 1446
1449 if (XFS_BUF_ISDELAYWRITE(bp)) { 1447 if (XFS_BUF_ISDELAYWRITE(bp)) {
1450 if (XFS_BUF_ISPINNED(bp)) 1448 if (xfs_buf_ispinned(bp))
1451 xfs_log_force(mp, 0); 1449 xfs_log_force(mp, 0);
1452 xfs_buf_delwri_promote(bp); 1450 xfs_buf_delwri_promote(bp);
1453 wake_up_process(bp->b_target->bt_task); 1451 wake_up_process(bp->b_target->bt_task);
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 34b7e945dbfa..34b7e945dbfa 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 9e0e2fa3f2c8..bb3f71d236d2 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait(
183 * search the buffer cache can be a time consuming thing, and AIL lock is a 183 * search the buffer cache can be a time consuming thing, and AIL lock is a
184 * spinlock. 184 * spinlock.
185 */ 185 */
186STATIC void 186STATIC bool
187xfs_qm_dquot_logitem_pushbuf( 187xfs_qm_dquot_logitem_pushbuf(
188 struct xfs_log_item *lip) 188 struct xfs_log_item *lip)
189{ 189{
190 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); 190 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
191 struct xfs_dquot *dqp = qlip->qli_dquot; 191 struct xfs_dquot *dqp = qlip->qli_dquot;
192 struct xfs_buf *bp; 192 struct xfs_buf *bp;
193 bool ret = true;
193 194
194 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 195 ASSERT(XFS_DQ_IS_LOCKED(dqp));
195 196
@@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf(
201 if (completion_done(&dqp->q_flush) || 202 if (completion_done(&dqp->q_flush) ||
202 !(lip->li_flags & XFS_LI_IN_AIL)) { 203 !(lip->li_flags & XFS_LI_IN_AIL)) {
203 xfs_dqunlock(dqp); 204 xfs_dqunlock(dqp);
204 return; 205 return true;
205 } 206 }
206 207
207 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, 208 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
208 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); 209 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
209 xfs_dqunlock(dqp); 210 xfs_dqunlock(dqp);
210 if (!bp) 211 if (!bp)
211 return; 212 return true;
212 if (XFS_BUF_ISDELAYWRITE(bp)) 213 if (XFS_BUF_ISDELAYWRITE(bp))
213 xfs_buf_delwri_promote(bp); 214 xfs_buf_delwri_promote(bp);
215 if (xfs_buf_ispinned(bp))
216 ret = false;
214 xfs_buf_relse(bp); 217 xfs_buf_relse(bp);
218 return ret;
215} 219}
216 220
217/* 221/*
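
Note: the ->iop_pushbuf() handlers in this series now return bool. The diff does not spell out the contract, but from the xfs_buf_ispinned() check it appears that false tells the AIL the backing buffer is pinned and a log force is needed before the push can make progress. A sketch of that shape:

	static bool pushbuf_result_sketch(struct xfs_buf *bp)
	{
		bool	ret = true;

		if (XFS_BUF_ISDELAYWRITE(bp))
			xfs_buf_delwri_promote(bp);	/* jump the delwri queue */
		if (xfs_buf_ispinned(bp))
			ret = false;	/* pinned: caller should force the log */
		xfs_buf_relse(bp);
		return ret;
	}
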
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5acae2ada70b..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/xfs_export.c
index 75e5d322e48f..da108977b21f 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata(
229{ 229{
230 struct xfs_inode *ip = XFS_I(inode); 230 struct xfs_inode *ip = XFS_I(inode);
231 struct xfs_mount *mp = ip->i_mount; 231 struct xfs_mount *mp = ip->i_mount;
232 int error = 0; 232 xfs_lsn_t lsn = 0;
233 233
234 xfs_ilock(ip, XFS_ILOCK_SHARED); 234 xfs_ilock(ip, XFS_ILOCK_SHARED);
235 if (xfs_ipincount(ip)) { 235 if (xfs_ipincount(ip))
236 error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, 236 lsn = ip->i_itemp->ili_last_lsn;
237 XFS_LOG_SYNC, NULL);
238 }
239 xfs_iunlock(ip, XFS_ILOCK_SHARED); 237 xfs_iunlock(ip, XFS_ILOCK_SHARED);
240 238
241 return error; 239 if (!lsn)
240 return 0;
241 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
242} 242}
243 243
244const struct export_operations xfs_export_operations = { 244const struct export_operations xfs_export_operations = {
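
Note: the same pattern recurs in xfs_dir_fsync() and xfs_file_fsync() further down: sample the inode's last logged LSN under ILOCK_SHARED, then force the log to that LSN after dropping the lock. A sketch, using only the fields visible in the hunk above:

	static int force_inode_lsn_sketch(struct xfs_inode *ip, struct xfs_mount *mp)
	{
		xfs_lsn_t	lsn = 0;

		xfs_ilock(ip, XFS_ILOCK_SHARED);
		if (xfs_ipincount(ip))			/* still pinned in the log? */
			lsn = ip->i_itemp->ili_last_lsn;
		xfs_iunlock(ip, XFS_ILOCK_SHARED);

		if (!lsn)		/* nothing pinned: metadata already on disk */
			return 0;
		return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
	}

Forcing the log outside the ilock keeps the lock hold time down to a couple of loads.
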
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/xfs_export.h
index 3272b6ae7a35..3272b6ae7a35 100644
--- a/fs/xfs/linux-2.6/xfs_export.h
+++ b/fs/xfs/xfs_export.h
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/xfs_file.c
index 7f7b42469ea7..753ed9b5c70b 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -124,6 +124,35 @@ xfs_iozero(
124 return (-status); 124 return (-status);
125} 125}
126 126
127/*
128 * Fsync operations on directories are much simpler than on regular files,
129 * as there is no file data to flush, and thus also no need for explicit
130 * cache flush operations, and there are no non-transaction metadata updates
131 * on directories either.
132 */
133STATIC int
134xfs_dir_fsync(
135 struct file *file,
136 loff_t start,
137 loff_t end,
138 int datasync)
139{
140 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
141 struct xfs_mount *mp = ip->i_mount;
142 xfs_lsn_t lsn = 0;
143
144 trace_xfs_dir_fsync(ip);
145
146 xfs_ilock(ip, XFS_ILOCK_SHARED);
147 if (xfs_ipincount(ip))
148 lsn = ip->i_itemp->ili_last_lsn;
149 xfs_iunlock(ip, XFS_ILOCK_SHARED);
150
151 if (!lsn)
152 return 0;
153 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
154}
155
127STATIC int 156STATIC int
128xfs_file_fsync( 157xfs_file_fsync(
129 struct file *file, 158 struct file *file,
@@ -137,6 +166,7 @@ xfs_file_fsync(
137 struct xfs_trans *tp; 166 struct xfs_trans *tp;
138 int error = 0; 167 int error = 0;
139 int log_flushed = 0; 168 int log_flushed = 0;
169 xfs_lsn_t lsn = 0;
140 170
141 trace_xfs_file_fsync(ip); 171 trace_xfs_file_fsync(ip);
142 172
@@ -149,10 +179,6 @@ xfs_file_fsync(
149 179
150 xfs_iflags_clear(ip, XFS_ITRUNCATED); 180 xfs_iflags_clear(ip, XFS_ITRUNCATED);
151 181
152 xfs_ilock(ip, XFS_IOLOCK_SHARED);
153 xfs_ioend_wait(ip);
154 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
155
156 if (mp->m_flags & XFS_MOUNT_BARRIER) { 182 if (mp->m_flags & XFS_MOUNT_BARRIER) {
157 /* 183 /*
158 * If we have an RT and/or log subvolume we need to make sure 184 * If we have an RT and/or log subvolume we need to make sure
@@ -216,11 +242,11 @@ xfs_file_fsync(
216 * transaction. So we play it safe and fire off the 242 * transaction. So we play it safe and fire off the
217 * transaction anyway. 243 * transaction anyway.
218 */ 244 */
219 xfs_trans_ijoin(tp, ip); 245 xfs_trans_ijoin(tp, ip, 0);
220 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 246 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
221 xfs_trans_set_sync(tp); 247 error = xfs_trans_commit(tp, 0);
222 error = _xfs_trans_commit(tp, 0, &log_flushed);
223 248
249 lsn = ip->i_itemp->ili_last_lsn;
224 xfs_iunlock(ip, XFS_ILOCK_EXCL); 250 xfs_iunlock(ip, XFS_ILOCK_EXCL);
225 } else { 251 } else {
226 /* 252 /*
@@ -231,14 +257,14 @@ xfs_file_fsync(
231 * disk yet, the inode will still be pinned. If it is, 257 * disk yet, the inode will still be pinned. If it is,
232 * force the log. 258 * force the log.
233 */ 259 */
234 if (xfs_ipincount(ip)) { 260 if (xfs_ipincount(ip))
235 error = _xfs_log_force_lsn(mp, 261 lsn = ip->i_itemp->ili_last_lsn;
236 ip->i_itemp->ili_last_lsn,
237 XFS_LOG_SYNC, &log_flushed);
238 }
239 xfs_iunlock(ip, XFS_ILOCK_SHARED); 262 xfs_iunlock(ip, XFS_ILOCK_SHARED);
240 } 263 }
241 264
265 if (!error && lsn)
266 error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
267
242 /* 268 /*
243 * If we only have a single device, and the log force above was 269 * If we only have a single device, and the log force above was
244 * a no-op, we might have to flush the data device cache here. 270 * a no-op, we might have to flush the data device cache here.
@@ -317,7 +343,19 @@ xfs_file_aio_read(
317 if (XFS_FORCED_SHUTDOWN(mp)) 343 if (XFS_FORCED_SHUTDOWN(mp))
318 return -EIO; 344 return -EIO;
319 345
320 if (unlikely(ioflags & IO_ISDIRECT)) { 346 /*
347 * Locking is a bit tricky here. If we take an exclusive lock
348 * for direct IO, we effectively serialise all new concurrent
349 * read IO to this file and block it behind IO that is currently in
350 * progress because IO in progress holds the IO lock shared. We only
351 * need to hold the lock exclusive to blow away the page cache, so
352 * only take lock exclusively if the page cache needs invalidation.
353 * This allows the normal direct IO case of no page cache pages to
354 * proceed concurrently without serialisation.
355 */
356 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
357 if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
358 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 359 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
322 360
323 if (inode->i_mapping->nrpages) { 361 if (inode->i_mapping->nrpages) {
@@ -330,8 +368,7 @@ xfs_file_aio_read(
330 } 368 }
331 } 369 }
332 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 370 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
333 } else 371 }
334 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
335 372
336 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 373 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
337 374
@@ -407,11 +444,13 @@ xfs_aio_write_isize_update(
407 */ 444 */
408STATIC void 445STATIC void
409xfs_aio_write_newsize_update( 446xfs_aio_write_newsize_update(
410 struct xfs_inode *ip) 447 struct xfs_inode *ip,
448 xfs_fsize_t new_size)
411{ 449{
412 if (ip->i_new_size) { 450 if (new_size == ip->i_new_size) {
413 xfs_rw_ilock(ip, XFS_ILOCK_EXCL); 451 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
414 ip->i_new_size = 0; 452 if (new_size == ip->i_new_size)
453 ip->i_new_size = 0;
415 if (ip->i_d.di_size > ip->i_size) 454 if (ip->i_d.di_size > ip->i_size)
416 ip->i_d.di_size = ip->i_size; 455 ip->i_d.di_size = ip->i_size;
417 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); 456 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
@@ -462,7 +501,7 @@ xfs_file_splice_write(
462 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 501 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
463 502
464 xfs_aio_write_isize_update(inode, ppos, ret); 503 xfs_aio_write_isize_update(inode, ppos, ret);
465 xfs_aio_write_newsize_update(ip); 504 xfs_aio_write_newsize_update(ip, new_size);
466 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 505 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
467 return ret; 506 return ret;
468} 507}
@@ -500,11 +539,9 @@ xfs_zero_last_block(
500 539
501 last_fsb = XFS_B_TO_FSBT(mp, isize); 540 last_fsb = XFS_B_TO_FSBT(mp, isize);
502 nimaps = 1; 541 nimaps = 1;
503 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, 542 error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
504 &nimaps, NULL); 543 if (error)
505 if (error) {
506 return error; 544 return error;
507 }
508 ASSERT(nimaps > 0); 545 ASSERT(nimaps > 0);
509 /* 546 /*
510 * If the block underlying isize is just a hole, then there 547 * If the block underlying isize is just a hole, then there
@@ -595,8 +632,8 @@ xfs_zero_eof(
595 while (start_zero_fsb <= end_zero_fsb) { 632 while (start_zero_fsb <= end_zero_fsb) {
596 nimaps = 1; 633 nimaps = 1;
597 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 634 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
598 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, 635 error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
599 0, NULL, 0, &imap, &nimaps, NULL); 636 &imap, &nimaps, 0);
600 if (error) { 637 if (error) {
601 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 638 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
602 return error; 639 return error;
@@ -659,6 +696,7 @@ xfs_file_aio_write_checks(
659 struct file *file, 696 struct file *file,
660 loff_t *pos, 697 loff_t *pos,
661 size_t *count, 698 size_t *count,
699 xfs_fsize_t *new_sizep,
662 int *iolock) 700 int *iolock)
663{ 701{
664 struct inode *inode = file->f_mapping->host; 702 struct inode *inode = file->f_mapping->host;
@@ -666,6 +704,9 @@ xfs_file_aio_write_checks(
666 xfs_fsize_t new_size; 704 xfs_fsize_t new_size;
667 int error = 0; 705 int error = 0;
668 706
707 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
708 *new_sizep = 0;
709restart:
669 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); 710 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
670 if (error) { 711 if (error) {
671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 712 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
@@ -673,20 +714,41 @@ xfs_file_aio_write_checks(
673 return error; 714 return error;
674 } 715 }
675 716
676 new_size = *pos + *count;
677 if (new_size > ip->i_size)
678 ip->i_new_size = new_size;
679
680 if (likely(!(file->f_mode & FMODE_NOCMTIME))) 717 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
681 file_update_time(file); 718 file_update_time(file);
682 719
683 /* 720 /*
684 * If the offset is beyond the size of the file, we need to zero any 721 * If the offset is beyond the size of the file, we need to zero any
685 * blocks that fall between the existing EOF and the start of this 722 * blocks that fall between the existing EOF and the start of this
686 * write. 723 * write. There is no need to issue zeroing if another in-flght IO ends
724 * at or before this one If zeronig is needed and we are currently
725 * holding the iolock shared, we need to update it to exclusive which
726 * involves dropping all locks and relocking to maintain correct locking
727 * order. If we do this, restart the function to ensure all checks and
728 * order. If we do this, restart the function to ensure all checks and values are still valid.
687 */ 729 */
688 if (*pos > ip->i_size) 730 if ((ip->i_new_size && *pos > ip->i_new_size) ||
731 (!ip->i_new_size && *pos > ip->i_size)) {
732 if (*iolock == XFS_IOLOCK_SHARED) {
733 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
734 *iolock = XFS_IOLOCK_EXCL;
735 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
736 goto restart;
737 }
689 error = -xfs_zero_eof(ip, *pos, ip->i_size); 738 error = -xfs_zero_eof(ip, *pos, ip->i_size);
739 }
740
741 /*
742 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
743 * We have already zeroed space beyond EOF (if necessary). Only update
744 * ip->i_new_size if this IO ends beyond any other in-flight writes.
745 */
746 new_size = *pos + *count;
747 if (new_size > ip->i_size) {
748 if (new_size > ip->i_new_size)
749 ip->i_new_size = new_size;
750 *new_sizep = new_size;
751 }
690 752
691 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); 753 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
692 if (error) 754 if (error)
@@ -721,7 +783,7 @@ xfs_file_aio_write_checks(
721 * the dio layer. To avoid the problem with aio, we also need to wait for 783 * the dio layer. To avoid the problem with aio, we also need to wait for
722 * outstanding IOs to complete so that unwritten extent conversion is completed 784 * outstanding IOs to complete so that unwritten extent conversion is completed
723 * before we try to map the overlapping block. This is currently implemented by 785 * before we try to map the overlapping block. This is currently implemented by
724 * hitting it with a big hammer (i.e. xfs_ioend_wait()). 786 * hitting it with a big hammer (i.e. inode_dio_wait()).
725 * 787 *
726 * Returns with locks held indicated by @iolock and errors indicated by 788 * Returns with locks held indicated by @iolock and errors indicated by
727 * negative return values. 789 * negative return values.
@@ -733,6 +795,7 @@ xfs_file_dio_aio_write(
733 unsigned long nr_segs, 795 unsigned long nr_segs,
734 loff_t pos, 796 loff_t pos,
735 size_t ocount, 797 size_t ocount,
798 xfs_fsize_t *new_size,
736 int *iolock) 799 int *iolock)
737{ 800{
738 struct file *file = iocb->ki_filp; 801 struct file *file = iocb->ki_filp;
@@ -753,18 +816,35 @@ xfs_file_dio_aio_write(
753 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) 816 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
754 unaligned_io = 1; 817 unaligned_io = 1;
755 818
756 if (unaligned_io || mapping->nrpages || pos > ip->i_size) 819 /*
820 * We don't need to take an exclusive lock unless the page cache needs
821 * to be invalidated or unaligned IO is being executed. We don't need to
822 * consider the EOF extension case here because
823 * xfs_file_aio_write_checks() will relock the inode as necessary for
824 * EOF zeroing cases and fill out the new inode size as appropriate.
825 */
826 if (unaligned_io || mapping->nrpages)
757 *iolock = XFS_IOLOCK_EXCL; 827 *iolock = XFS_IOLOCK_EXCL;
758 else 828 else
759 *iolock = XFS_IOLOCK_SHARED; 829 *iolock = XFS_IOLOCK_SHARED;
760 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); 830 xfs_rw_ilock(ip, *iolock);
761 831
762 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); 832 /*
833 * Recheck if there are cached pages that need invalidating after we got
834 * the iolock to protect against other threads adding new pages while
835 * we were waiting for the iolock.
836 */
837 if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
838 xfs_rw_iunlock(ip, *iolock);
839 *iolock = XFS_IOLOCK_EXCL;
840 xfs_rw_ilock(ip, *iolock);
841 }
842
843 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
763 if (ret) 844 if (ret)
764 return ret; 845 return ret;
765 846
766 if (mapping->nrpages) { 847 if (mapping->nrpages) {
767 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
768 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, 848 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
769 FI_REMAPF_LOCKED); 849 FI_REMAPF_LOCKED);
770 if (ret) 850 if (ret)
@@ -776,7 +856,7 @@ xfs_file_dio_aio_write(
776 * otherwise demote the lock if we had to flush cached pages 856 * otherwise demote the lock if we had to flush cached pages
777 */ 857 */
778 if (unaligned_io) 858 if (unaligned_io)
779 xfs_ioend_wait(ip); 859 inode_dio_wait(inode);
780 else if (*iolock == XFS_IOLOCK_EXCL) { 860 else if (*iolock == XFS_IOLOCK_EXCL) {
781 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 861 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
782 *iolock = XFS_IOLOCK_SHARED; 862 *iolock = XFS_IOLOCK_SHARED;
@@ -798,6 +878,7 @@ xfs_file_buffered_aio_write(
798 unsigned long nr_segs, 878 unsigned long nr_segs,
799 loff_t pos, 879 loff_t pos,
800 size_t ocount, 880 size_t ocount,
881 xfs_fsize_t *new_size,
801 int *iolock) 882 int *iolock)
802{ 883{
803 struct file *file = iocb->ki_filp; 884 struct file *file = iocb->ki_filp;
@@ -809,9 +890,9 @@ xfs_file_buffered_aio_write(
809 size_t count = ocount; 890 size_t count = ocount;
810 891
811 *iolock = XFS_IOLOCK_EXCL; 892 *iolock = XFS_IOLOCK_EXCL;
812 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); 893 xfs_rw_ilock(ip, *iolock);
813 894
814 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); 895 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
815 if (ret) 896 if (ret)
816 return ret; 897 return ret;
817 898
@@ -851,6 +932,7 @@ xfs_file_aio_write(
851 ssize_t ret; 932 ssize_t ret;
852 int iolock; 933 int iolock;
853 size_t ocount = 0; 934 size_t ocount = 0;
935 xfs_fsize_t new_size = 0;
854 936
855 XFS_STATS_INC(xs_write_calls); 937 XFS_STATS_INC(xs_write_calls);
856 938
@@ -870,10 +952,10 @@ xfs_file_aio_write(
870 952
871 if (unlikely(file->f_flags & O_DIRECT)) 953 if (unlikely(file->f_flags & O_DIRECT))
872 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, 954 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
873 ocount, &iolock); 955 ocount, &new_size, &iolock);
874 else 956 else
875 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, 957 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
876 ocount, &iolock); 958 ocount, &new_size, &iolock);
877 959
878 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); 960 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
879 961
@@ -894,7 +976,7 @@ xfs_file_aio_write(
894 } 976 }
895 977
896out_unlock: 978out_unlock:
897 xfs_aio_write_newsize_update(ip); 979 xfs_aio_write_newsize_update(ip, new_size);
898 xfs_rw_iunlock(ip, iolock); 980 xfs_rw_iunlock(ip, iolock);
899 return ret; 981 return ret;
900} 982}
@@ -1087,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = {
1087#ifdef CONFIG_COMPAT 1169#ifdef CONFIG_COMPAT
1088 .compat_ioctl = xfs_file_compat_ioctl, 1170 .compat_ioctl = xfs_file_compat_ioctl,
1089#endif 1171#endif
1090 .fsync = xfs_file_fsync, 1172 .fsync = xfs_dir_fsync,
1091}; 1173};
1092 1174
1093static const struct vm_operations_struct xfs_file_vm_ops = { 1175static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 3ff3d9e23ded..5170306a1009 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -682,7 +682,7 @@ xfs_filestream_new_ag(
682 ip = ap->ip; 682 ip = ap->ip;
683 mp = ip->i_mount; 683 mp = ip->i_mount;
684 cache = mp->m_filestream; 684 cache = mp->m_filestream;
685 minlen = ap->alen; 685 minlen = ap->length;
686 *agp = NULLAGNUMBER; 686 *agp = NULLAGNUMBER;
687 687
688 /* 688 /*
@@ -761,7 +761,7 @@ xfs_filestream_new_ag(
761 */ 761 */
762 ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; 762 ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
763 flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | 763 flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
764 (ap->low ? XFS_PICK_LOWSPACE : 0); 764 (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
765 765
766 err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); 766 err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
767 if (err || *agp == NULLAGNUMBER) 767 if (err || *agp == NULLAGNUMBER)
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811c..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 9153d2c77caf..1c6fdeb702ff 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -194,6 +194,10 @@ xfs_growfs_data_private(
194 bp = xfs_buf_get(mp->m_ddev_targp, 194 bp = xfs_buf_get(mp->m_ddev_targp,
195 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 195 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
196 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); 196 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
197 if (!bp) {
198 error = ENOMEM;
199 goto error0;
200 }
197 agf = XFS_BUF_TO_AGF(bp); 201 agf = XFS_BUF_TO_AGF(bp);
198 memset(agf, 0, mp->m_sb.sb_sectsize); 202 memset(agf, 0, mp->m_sb.sb_sectsize);
199 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); 203 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -216,16 +220,21 @@ xfs_growfs_data_private(
216 tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); 220 tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
217 agf->agf_freeblks = cpu_to_be32(tmpsize); 221 agf->agf_freeblks = cpu_to_be32(tmpsize);
218 agf->agf_longest = cpu_to_be32(tmpsize); 222 agf->agf_longest = cpu_to_be32(tmpsize);
219 error = xfs_bwrite(mp, bp); 223 error = xfs_bwrite(bp);
220 if (error) { 224 xfs_buf_relse(bp);
225 if (error)
221 goto error0; 226 goto error0;
222 } 227
223 /* 228 /*
224 * AG inode header block 229 * AG inode header block
225 */ 230 */
226 bp = xfs_buf_get(mp->m_ddev_targp, 231 bp = xfs_buf_get(mp->m_ddev_targp,
227 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 232 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
228 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); 233 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
234 if (!bp) {
235 error = ENOMEM;
236 goto error0;
237 }
229 agi = XFS_BUF_TO_AGI(bp); 238 agi = XFS_BUF_TO_AGI(bp);
230 memset(agi, 0, mp->m_sb.sb_sectsize); 239 memset(agi, 0, mp->m_sb.sb_sectsize);
231 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); 240 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -240,10 +249,11 @@ xfs_growfs_data_private(
240 agi->agi_dirino = cpu_to_be32(NULLAGINO); 249 agi->agi_dirino = cpu_to_be32(NULLAGINO);
241 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) 250 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
242 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 251 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
243 error = xfs_bwrite(mp, bp); 252 error = xfs_bwrite(bp);
244 if (error) { 253 xfs_buf_relse(bp);
254 if (error)
245 goto error0; 255 goto error0;
246 } 256
247 /* 257 /*
248 * BNO btree root block 258 * BNO btree root block
249 */ 259 */
@@ -251,6 +261,10 @@ xfs_growfs_data_private(
251 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 261 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
252 BTOBB(mp->m_sb.sb_blocksize), 262 BTOBB(mp->m_sb.sb_blocksize),
253 XBF_LOCK | XBF_MAPPED); 263 XBF_LOCK | XBF_MAPPED);
264 if (!bp) {
265 error = ENOMEM;
266 goto error0;
267 }
254 block = XFS_BUF_TO_BLOCK(bp); 268 block = XFS_BUF_TO_BLOCK(bp);
255 memset(block, 0, mp->m_sb.sb_blocksize); 269 memset(block, 0, mp->m_sb.sb_blocksize);
256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 270 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
@@ -262,10 +276,11 @@ xfs_growfs_data_private(
262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 276 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
263 arec->ar_blockcount = cpu_to_be32( 277 arec->ar_blockcount = cpu_to_be32(
264 agsize - be32_to_cpu(arec->ar_startblock)); 278 agsize - be32_to_cpu(arec->ar_startblock));
265 error = xfs_bwrite(mp, bp); 279 error = xfs_bwrite(bp);
266 if (error) { 280 xfs_buf_relse(bp);
281 if (error)
267 goto error0; 282 goto error0;
268 } 283
269 /* 284 /*
270 * CNT btree root block 285 * CNT btree root block
271 */ 286 */
@@ -273,6 +288,10 @@ xfs_growfs_data_private(
273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 288 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
274 BTOBB(mp->m_sb.sb_blocksize), 289 BTOBB(mp->m_sb.sb_blocksize),
275 XBF_LOCK | XBF_MAPPED); 290 XBF_LOCK | XBF_MAPPED);
291 if (!bp) {
292 error = ENOMEM;
293 goto error0;
294 }
276 block = XFS_BUF_TO_BLOCK(bp); 295 block = XFS_BUF_TO_BLOCK(bp);
277 memset(block, 0, mp->m_sb.sb_blocksize); 296 memset(block, 0, mp->m_sb.sb_blocksize);
278 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 297 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
@@ -285,10 +304,11 @@ xfs_growfs_data_private(
285 arec->ar_blockcount = cpu_to_be32( 304 arec->ar_blockcount = cpu_to_be32(
286 agsize - be32_to_cpu(arec->ar_startblock)); 305 agsize - be32_to_cpu(arec->ar_startblock));
287 nfree += be32_to_cpu(arec->ar_blockcount); 306 nfree += be32_to_cpu(arec->ar_blockcount);
288 error = xfs_bwrite(mp, bp); 307 error = xfs_bwrite(bp);
289 if (error) { 308 xfs_buf_relse(bp);
309 if (error)
290 goto error0; 310 goto error0;
291 } 311
292 /* 312 /*
293 * INO btree root block 313 * INO btree root block
294 */ 314 */
@@ -296,6 +316,10 @@ xfs_growfs_data_private(
296 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 316 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
297 BTOBB(mp->m_sb.sb_blocksize), 317 BTOBB(mp->m_sb.sb_blocksize),
298 XBF_LOCK | XBF_MAPPED); 318 XBF_LOCK | XBF_MAPPED);
319 if (!bp) {
320 error = ENOMEM;
321 goto error0;
322 }
299 block = XFS_BUF_TO_BLOCK(bp); 323 block = XFS_BUF_TO_BLOCK(bp);
300 memset(block, 0, mp->m_sb.sb_blocksize); 324 memset(block, 0, mp->m_sb.sb_blocksize);
301 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 325 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
@@ -303,10 +327,10 @@ xfs_growfs_data_private(
303 block->bb_numrecs = 0; 327 block->bb_numrecs = 0;
304 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); 328 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
305 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); 329 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
306 error = xfs_bwrite(mp, bp); 330 error = xfs_bwrite(bp);
307 if (error) { 331 xfs_buf_relse(bp);
332 if (error)
308 goto error0; 333 goto error0;
309 }
310 } 334 }
311 xfs_trans_agblocks_delta(tp, nfree); 335 xfs_trans_agblocks_delta(tp, nfree);
312 /* 336 /*
@@ -396,9 +420,9 @@ xfs_growfs_data_private(
396 * just issue a warning and continue. The real work is 420 * just issue a warning and continue. The real work is
397 * already done and committed. 421 * already done and committed.
398 */ 422 */
399 if (!(error = xfs_bwrite(mp, bp))) { 423 error = xfs_bwrite(bp);
400 continue; 424 xfs_buf_relse(bp);
401 } else { 425 if (error) {
402 xfs_warn(mp, 426 xfs_warn(mp,
403 "write error %d updating secondary superblock for ag %d", 427 "write error %d updating secondary superblock for ag %d",
404 error, agno); 428 error, agno);
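
Two conventions repeat through every hunk above: xfs_buf_get() is no longer assumed to succeed (a NULL return becomes ENOMEM, in the positive-errno convention XFS uses internally), and xfs_bwrite() has lost its mount argument and no longer consumes the buffer, so each write is now paired with an unconditional xfs_buf_relse(). Condensed, the per-block pattern in the grown-AG initialisation reads as below, where blkno and numblks stand in for the per-structure disk address and length:

	bp = xfs_buf_get(mp->m_ddev_targp, blkno, numblks,
			 XBF_LOCK | XBF_MAPPED);
	if (!bp) {
		error = ENOMEM;		/* allocation may fail now */
		goto error0;
	}

	/* ... fill in the on-disk structure ... */

	error = xfs_bwrite(bp);		/* synchronous, no mp argument */
	xfs_buf_relse(bp);		/* caller drops the buffer either way */
	if (error)
		goto error0;
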
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index dd5628bd8d0b..169380e66057 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -150,7 +150,7 @@ xfs_check_agi_freecount(
150/* 150/*
151 * Initialise a new set of inodes. 151 * Initialise a new set of inodes.
152 */ 152 */
153STATIC void 153STATIC int
154xfs_ialloc_inode_init( 154xfs_ialloc_inode_init(
155 struct xfs_mount *mp, 155 struct xfs_mount *mp,
156 struct xfs_trans *tp, 156 struct xfs_trans *tp,
@@ -202,9 +202,8 @@ xfs_ialloc_inode_init(
202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
203 mp->m_bsize * blks_per_cluster, 203 mp->m_bsize * blks_per_cluster,
204 XBF_LOCK); 204 XBF_LOCK);
205 ASSERT(fbuf); 205 if (!fbuf)
206 ASSERT(!XFS_BUF_GETERROR(fbuf)); 206 return ENOMEM;
207
208 /* 207 /*
209 * Initialize all inodes in this buffer and then log them. 208 * Initialize all inodes in this buffer and then log them.
210 * 209 *
@@ -226,6 +225,7 @@ xfs_ialloc_inode_init(
226 } 225 }
227 xfs_trans_inode_alloc_buf(tp, fbuf); 226 xfs_trans_inode_alloc_buf(tp, fbuf);
228 } 227 }
228 return 0;
229} 229}
230 230
231/* 231/*
@@ -370,9 +370,11 @@ xfs_ialloc_ag_alloc(
370 * rather than a linear progression to prevent the next generation 370 * rather than a linear progression to prevent the next generation
371 * number from being easily guessable. 371 * number from being easily guessable.
372 */ 372 */
373 xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, 373 error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
374 random32()); 374 args.len, random32());
375 375
376 if (error)
377 return error;
376 /* 378 /*
377 * Convert the results. 379 * Convert the results.
378 */ 380 */
@@ -1486,7 +1488,7 @@ xfs_read_agi(
1486 if (error) 1488 if (error)
1487 return error; 1489 return error;
1488 1490
1489 ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); 1491 ASSERT(!xfs_buf_geterror(*bpp));
1490 agi = XFS_BUF_TO_AGI(*bpp); 1492 agi = XFS_BUF_TO_AGI(*bpp);
1491 1493
1492 /* 1494 /*
@@ -1503,7 +1505,7 @@ xfs_read_agi(
1503 return XFS_ERROR(EFSCORRUPTED); 1505 return XFS_ERROR(EFSCORRUPTED);
1504 } 1506 }
1505 1507
1506 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); 1508 xfs_buf_set_ref(*bpp, XFS_AGI_REF);
1507 1509
1508 xfs_check_agi_unlinked(agi); 1510 xfs_check_agi_unlinked(agi);
1509 return 0; 1511 return 0;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 7759812c1bbe..0fa98b1c70ea 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -75,7 +75,6 @@ xfs_inode_alloc(
75 return NULL; 75 return NULL;
76 } 76 }
77 77
78 ASSERT(atomic_read(&ip->i_iocount) == 0);
79 ASSERT(atomic_read(&ip->i_pincount) == 0); 78 ASSERT(atomic_read(&ip->i_pincount) == 0);
80 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 79 ASSERT(!spin_is_locked(&ip->i_flags_lock));
81 ASSERT(completion_done(&ip->i_flush)); 80 ASSERT(completion_done(&ip->i_flush));
@@ -150,7 +149,6 @@ xfs_inode_free(
150 } 149 }
151 150
152 /* asserts to verify all state is correct here */ 151 /* asserts to verify all state is correct here */
153 ASSERT(atomic_read(&ip->i_iocount) == 0);
154 ASSERT(atomic_read(&ip->i_pincount) == 0); 152 ASSERT(atomic_read(&ip->i_pincount) == 0);
155 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 153 ASSERT(!spin_is_locked(&ip->i_flags_lock));
156 ASSERT(completion_done(&ip->i_flush)); 154 ASSERT(completion_done(&ip->i_flush));
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2fcca4b03ed3..c0237c602f11 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -190,12 +190,6 @@ xfs_imap_to_bp(
190 } 190 }
191 191
192 xfs_inobp_check(mp, bp); 192 xfs_inobp_check(mp, bp);
193
194 /*
195 * Mark the buffer as an inode buffer now that it looks good
196 */
197 XFS_BUF_SET_VTYPE(bp, B_FS_INO);
198
199 *bpp = bp; 193 *bpp = bp;
200 return 0; 194 return 0;
201} 195}
@@ -1152,7 +1146,7 @@ xfs_ialloc(
1152 /* 1146 /*
1153 * Log the new values stuffed into the inode. 1147 * Log the new values stuffed into the inode.
1154 */ 1148 */
1155 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 1149 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1156 xfs_trans_log_inode(tp, ip, flags); 1150 xfs_trans_log_inode(tp, ip, flags);
1157 1151
1158 /* now that we have an i_mode we can setup inode ops and unlock */ 1152 /* now that we have an i_mode we can setup inode ops and unlock */
@@ -1187,6 +1181,7 @@ xfs_isize_check(
1187 xfs_fileoff_t map_first; 1181 xfs_fileoff_t map_first;
1188 int nimaps; 1182 int nimaps;
1189 xfs_bmbt_irec_t imaps[2]; 1183 xfs_bmbt_irec_t imaps[2];
1184 int error;
1190 1185
1191 if (!S_ISREG(ip->i_d.di_mode)) 1186 if (!S_ISREG(ip->i_d.di_mode))
1192 return; 1187 return;
@@ -1203,13 +1198,12 @@ xfs_isize_check(
1203 * The filesystem could be shutting down, so bmapi may return 1198 * The filesystem could be shutting down, so bmapi may return
1204 * an error. 1199 * an error.
1205 */ 1200 */
1206 if (xfs_bmapi(NULL, ip, map_first, 1201 error = xfs_bmapi_read(ip, map_first,
1207 (XFS_B_TO_FSB(mp, 1202 (XFS_B_TO_FSB(mp,
1208 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - 1203 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
1209 map_first), 1204 imaps, &nimaps, XFS_BMAPI_ENTIRE);
1210 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, 1205 if (error)
1211 NULL)) 1206 return;
1212 return;
1213 ASSERT(nimaps == 1); 1207 ASSERT(nimaps == 1);
1214 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); 1208 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1215} 1209}
@@ -1297,7 +1291,7 @@ xfs_itruncate_extents(
1297 */ 1291 */
1298 error = xfs_bmap_finish(&tp, &free_list, &committed); 1292 error = xfs_bmap_finish(&tp, &free_list, &committed);
1299 if (committed) 1293 if (committed)
1300 xfs_trans_ijoin(tp, ip); 1294 xfs_trans_ijoin(tp, ip, 0);
1301 if (error) 1295 if (error)
1302 goto out_bmap_cancel; 1296 goto out_bmap_cancel;
1303 1297
@@ -1313,7 +1307,7 @@ xfs_itruncate_extents(
1313 error = xfs_trans_commit(tp, 0); 1307 error = xfs_trans_commit(tp, 0);
1314 tp = ntp; 1308 tp = ntp;
1315 1309
1316 xfs_trans_ijoin(tp, ip); 1310 xfs_trans_ijoin(tp, ip, 0);
1317 1311
1318 if (error) 1312 if (error)
1319 goto out; 1313 goto out;
@@ -1644,7 +1638,7 @@ xfs_iunlink_remove(
1644 * inodes that are in memory - they all must be marked stale and attached to 1638 * inodes that are in memory - they all must be marked stale and attached to
1645 * the cluster buffer. 1639 * the cluster buffer.
1646 */ 1640 */
1647STATIC void 1641STATIC int
1648xfs_ifree_cluster( 1642xfs_ifree_cluster(
1649 xfs_inode_t *free_ip, 1643 xfs_inode_t *free_ip,
1650 xfs_trans_t *tp, 1644 xfs_trans_t *tp,
@@ -1690,6 +1684,8 @@ xfs_ifree_cluster(
1690 mp->m_bsize * blks_per_cluster, 1684 mp->m_bsize * blks_per_cluster,
1691 XBF_LOCK); 1685 XBF_LOCK);
1692 1686
1687 if (!bp)
1688 return ENOMEM;
1693 /* 1689 /*
1694 * Walk the inodes already attached to the buffer and mark them 1690 * Walk the inodes already attached to the buffer and mark them
1695 * stale. These will all have the flush locks held, so an 1691 * stale. These will all have the flush locks held, so an
@@ -1799,6 +1795,7 @@ retry:
1799 } 1795 }
1800 1796
1801 xfs_perag_put(pag); 1797 xfs_perag_put(pag);
1798 return 0;
1802} 1799}
1803 1800
1804/* 1801/*
@@ -1878,10 +1875,10 @@ xfs_ifree(
1878 dip->di_mode = 0; 1875 dip->di_mode = 0;
1879 1876
1880 if (delete) { 1877 if (delete) {
1881 xfs_ifree_cluster(ip, tp, first_ino); 1878 error = xfs_ifree_cluster(ip, tp, first_ino);
1882 } 1879 }
1883 1880
1884 return 0; 1881 return error;
1885} 1882}
1886 1883
1887/* 1884/*
@@ -2472,11 +2469,11 @@ cluster_corrupt_out:
2472 */ 2469 */
2473 if (bp->b_iodone) { 2470 if (bp->b_iodone) {
2474 XFS_BUF_UNDONE(bp); 2471 XFS_BUF_UNDONE(bp);
2475 XFS_BUF_STALE(bp); 2472 xfs_buf_stale(bp);
2476 XFS_BUF_ERROR(bp,EIO); 2473 xfs_buf_ioerror(bp, EIO);
2477 xfs_buf_ioend(bp, 0); 2474 xfs_buf_ioend(bp, 0);
2478 } else { 2475 } else {
2479 XFS_BUF_STALE(bp); 2476 xfs_buf_stale(bp);
2480 xfs_buf_relse(bp); 2477 xfs_buf_relse(bp);
2481 } 2478 }
2482 } 2479 }
@@ -2585,7 +2582,7 @@ xfs_iflush(
2585 * If the buffer is pinned then push on the log now so we won't 2582 * If the buffer is pinned then push on the log now so we won't
2586 * get stuck waiting in the write for too long. 2583 * get stuck waiting in the write for too long.
2587 */ 2584 */
2588 if (XFS_BUF_ISPINNED(bp)) 2585 if (xfs_buf_ispinned(bp))
2589 xfs_log_force(mp, 0); 2586 xfs_log_force(mp, 0);
2590 2587
2591 /* 2588 /*
@@ -2597,9 +2594,11 @@ xfs_iflush(
2597 goto cluster_corrupt_out; 2594 goto cluster_corrupt_out;
2598 2595
2599 if (flags & SYNC_WAIT) 2596 if (flags & SYNC_WAIT)
2600 error = xfs_bwrite(mp, bp); 2597 error = xfs_bwrite(bp);
2601 else 2598 else
2602 xfs_bdwrite(mp, bp); 2599 xfs_buf_delwri_queue(bp);
2600
2601 xfs_buf_relse(bp);
2603 return error; 2602 return error;
2604 2603
2605corrupt_out: 2604corrupt_out:
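
The tail of xfs_iflush() shows the other half of the buffer-API rework: xfs_bdwrite(mp, bp), which used to consume the buffer, is replaced by xfs_buf_delwri_queue(bp) plus an explicit xfs_buf_relse(). The unconditional release only works if the delwri queue takes its own hold on the buffer; that is an inference from this hunk, not something the diff states:

	if (flags & SYNC_WAIT)
		error = xfs_bwrite(bp);		/* write and wait for I/O */
	else
		xfs_buf_delwri_queue(bp);	/* queue takes its own hold
						 * (assumed, see above) */
	xfs_buf_relse(bp);			/* unlock and drop our reference */
	return error;
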
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 2380a4bcbece..760140d1dd66 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -257,7 +257,6 @@ typedef struct xfs_inode {
257 257
258 xfs_fsize_t i_size; /* in-memory size */ 258 xfs_fsize_t i_size; /* in-memory size */
259 xfs_fsize_t i_new_size; /* size when write completes */ 259 xfs_fsize_t i_new_size; /* size when write completes */
260 atomic_t i_iocount; /* outstanding I/O count */
261 260
262 /* VFS inode */ 261 /* VFS inode */
263 struct inode i_vnode; /* embedded VFS inode */ 262 struct inode i_vnode; /* embedded VFS inode */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 588406dc6a35..b7cf21ba240f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -658,10 +658,8 @@ xfs_inode_item_unlock(
658 658
659 lock_flags = iip->ili_lock_flags; 659 lock_flags = iip->ili_lock_flags;
660 iip->ili_lock_flags = 0; 660 iip->ili_lock_flags = 0;
661 if (lock_flags) { 661 if (lock_flags)
662 xfs_iunlock(ip, lock_flags); 662 xfs_iunlock(ip, lock_flags);
663 IRELE(ip);
664 }
665} 663}
666 664
667/* 665/*
@@ -708,13 +706,14 @@ xfs_inode_item_committed(
708 * marked delayed write. If that's the case, we'll promote it and that will 706 * marked delayed write. If that's the case, we'll promote it and that will
709 * allow the caller to write the buffer by triggering the xfsbufd to run. 707 * allow the caller to write the buffer by triggering the xfsbufd to run.
710 */ 708 */
711STATIC void 709STATIC bool
712xfs_inode_item_pushbuf( 710xfs_inode_item_pushbuf(
713 struct xfs_log_item *lip) 711 struct xfs_log_item *lip)
714{ 712{
715 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 713 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
716 struct xfs_inode *ip = iip->ili_inode; 714 struct xfs_inode *ip = iip->ili_inode;
717 struct xfs_buf *bp; 715 struct xfs_buf *bp;
716 bool ret = true;
718 717
719 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 718 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
720 719
@@ -725,7 +724,7 @@ xfs_inode_item_pushbuf(
725 if (completion_done(&ip->i_flush) || 724 if (completion_done(&ip->i_flush) ||
726 !(lip->li_flags & XFS_LI_IN_AIL)) { 725 !(lip->li_flags & XFS_LI_IN_AIL)) {
727 xfs_iunlock(ip, XFS_ILOCK_SHARED); 726 xfs_iunlock(ip, XFS_ILOCK_SHARED);
728 return; 727 return true;
729 } 728 }
730 729
731 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, 730 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
@@ -733,10 +732,13 @@ xfs_inode_item_pushbuf(
733 732
734 xfs_iunlock(ip, XFS_ILOCK_SHARED); 733 xfs_iunlock(ip, XFS_ILOCK_SHARED);
735 if (!bp) 734 if (!bp)
736 return; 735 return true;
737 if (XFS_BUF_ISDELAYWRITE(bp)) 736 if (XFS_BUF_ISDELAYWRITE(bp))
738 xfs_buf_delwri_promote(bp); 737 xfs_buf_delwri_promote(bp);
738 if (xfs_buf_ispinned(bp))
739 ret = false;
739 xfs_buf_relse(bp); 740 xfs_buf_relse(bp);
741 return ret;
740} 742}
741 743
742/* 744/*
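
xfs_inode_item_pushbuf() changes from void to bool: true means the push made whatever progress it could, false means the backing buffer is pinned and only a log force will unpin it. A hypothetical caller-side sketch of that contract; the surrounding AIL code is not part of this diff, so the shape below is an assumption:

	if (!xfs_inode_item_pushbuf(lip)) {
		/*
		 * Buffer is pinned in the log: writing it back is futile
		 * until the log forces it out, so do that and retry the
		 * item on a later AIL push.
		 */
		xfs_log_force(ip->i_mount, 0);
	}
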
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f7ce7debe14c..d99a90518909 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1069,7 +1069,7 @@ xfs_ioctl_setattr(
1069 } 1069 }
1070 } 1070 }
1071 1071
1072 xfs_trans_ijoin(tp, ip); 1072 xfs_trans_ijoin(tp, ip, 0);
1073 1073
1074 /* 1074 /*
1075 * Change file ownership. Must be the owner or privileged. 1075 * Change file ownership. Must be the owner or privileged.
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index d56173b34a2a..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 54e623bfbb85..54e623bfbb85 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 80f4060e8970..80f4060e8970 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 091d82b94c4d..9afa282aa937 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -208,22 +208,20 @@ xfs_iomap_write_direct(
208 if (error) 208 if (error)
209 goto error1; 209 goto error1;
210 210
211 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip, 0);
212 212
213 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = 0;
214 if (offset < ip->i_size || extsz) 214 if (offset < ip->i_size || extsz)
215 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
216 216
217 /* 217 /*
218 * Issue the xfs_bmapi() call to allocate the blocks.
219 *
220 * From this point onwards we overwrite the imap pointer that the 218 * From this point onwards we overwrite the imap pointer that the
221 * caller gave to us. 219 * caller gave to us.
222 */ 220 */
223 xfs_bmap_init(&free_list, &firstfsb); 221 xfs_bmap_init(&free_list, &firstfsb);
224 nimaps = 1; 222 nimaps = 1;
225 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, 223 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
226 &firstfsb, 0, imap, &nimaps, &free_list); 224 &firstfsb, 0, imap, &nimaps, &free_list);
227 if (error) 225 if (error)
228 goto error0; 226 goto error0;
229 227
@@ -300,8 +298,8 @@ xfs_iomap_eof_want_preallocate(
300 while (count_fsb > 0) { 298 while (count_fsb > 0) {
301 imaps = nimaps; 299 imaps = nimaps;
302 firstblock = NULLFSBLOCK; 300 firstblock = NULLFSBLOCK;
303 error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, 301 error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
304 &firstblock, 0, imap, &imaps, NULL); 302 0);
305 if (error) 303 if (error)
306 return error; 304 return error;
307 for (n = 0; n < imaps; n++) { 305 for (n = 0; n < imaps; n++) {
@@ -381,7 +379,6 @@ xfs_iomap_write_delay(
381 xfs_fileoff_t last_fsb; 379 xfs_fileoff_t last_fsb;
382 xfs_off_t aligned_offset; 380 xfs_off_t aligned_offset;
383 xfs_fileoff_t ioalign; 381 xfs_fileoff_t ioalign;
384 xfs_fsblock_t firstblock;
385 xfs_extlen_t extsz; 382 xfs_extlen_t extsz;
386 int nimaps; 383 int nimaps;
387 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 384 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
@@ -425,12 +422,8 @@ retry:
425 } 422 }
426 423
427 nimaps = XFS_WRITE_IMAPS; 424 nimaps = XFS_WRITE_IMAPS;
428 firstblock = NULLFSBLOCK; 425 error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
429 error = xfs_bmapi(NULL, ip, offset_fsb, 426 imap, &nimaps, XFS_BMAPI_ENTIRE);
430 (xfs_filblks_t)(last_fsb - offset_fsb),
431 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
432 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
433 &nimaps, NULL);
434 switch (error) { 427 switch (error) {
435 case 0: 428 case 0:
436 case ENOSPC: 429 case ENOSPC:
@@ -535,7 +528,7 @@ xfs_iomap_write_allocate(
535 return XFS_ERROR(error); 528 return XFS_ERROR(error);
536 } 529 }
537 xfs_ilock(ip, XFS_ILOCK_EXCL); 530 xfs_ilock(ip, XFS_ILOCK_EXCL);
538 xfs_trans_ijoin(tp, ip); 531 xfs_trans_ijoin(tp, ip, 0);
539 532
540 xfs_bmap_init(&free_list, &first_block); 533 xfs_bmap_init(&free_list, &first_block);
541 534
@@ -587,14 +580,12 @@ xfs_iomap_write_allocate(
587 } 580 }
588 581
589 /* 582 /*
590 * Go get the actual blocks.
591 *
592 * From this point onwards we overwrite the imap 583 * From this point onwards we overwrite the imap
593 * pointer that the caller gave to us. 584 * pointer that the caller gave to us.
594 */ 585 */
595 error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, 586 error = xfs_bmapi_write(tp, ip, map_start_fsb,
596 XFS_BMAPI_WRITE, &first_block, 1, 587 count_fsb, 0, &first_block, 1,
597 imap, &nimaps, &free_list); 588 imap, &nimaps, &free_list);
598 if (error) 589 if (error)
599 goto trans_cancel; 590 goto trans_cancel;
600 591
@@ -701,15 +692,15 @@ xfs_iomap_write_unwritten(
701 } 692 }
702 693
703 xfs_ilock(ip, XFS_ILOCK_EXCL); 694 xfs_ilock(ip, XFS_ILOCK_EXCL);
704 xfs_trans_ijoin(tp, ip); 695 xfs_trans_ijoin(tp, ip, 0);
705 696
706 /* 697 /*
707 * Modify the unwritten extent state of the buffer. 698 * Modify the unwritten extent state of the buffer.
708 */ 699 */
709 xfs_bmap_init(&free_list, &firstfsb); 700 xfs_bmap_init(&free_list, &firstfsb);
710 nimaps = 1; 701 nimaps = 1;
711 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, 702 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
712 XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, 703 XFS_BMAPI_CONVERT, &firstfsb,
713 1, &imap, &nimaps, &free_list); 704 1, &imap, &nimaps, &free_list);
714 if (error) 705 if (error)
715 goto error_on_bmapi_transaction; 706 goto error_on_bmapi_transaction;
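
Every xfs_bmapi() call in this file is rewritten against the split block-mapping API. The mode flags XFS_BMAPI_WRITE and XFS_BMAPI_DELAY disappear because the operation is now encoded in the function name. The signatures below are condensed from the call sites in this hunk; the reservation argument of xfs_bmapi_write() (here "total") is the slot that 0, 1 and resblks occupy above:

	/* lookup only: no transaction, no firstblock, no free list */
	error = xfs_bmapi_read(ip, offset_fsb, count_fsb,
			       imap, &nimaps, flags);

	/* delayed allocation for buffered writes */
	error = xfs_bmapi_delay(ip, offset_fsb, count_fsb,
				imap, &nimaps, XFS_BMAPI_ENTIRE);

	/* real allocation, inside a transaction */
	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
				&firstfsb, total, imap, &nimaps, &free_list);
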
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/xfs_iops.c
index b9c172b3fbbe..9ba2a07b7343 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -70,9 +70,8 @@ xfs_synchronize_times(
70} 70}
71 71
72/* 72/*
73 * If the linux inode is valid, mark it dirty. 73 * If the linux inode is valid, mark it dirty, else mark the dirty state
74 * Used when committing a dirty inode into a transaction so that 74 * in the XFS inode to make sure we pick it up when reclaiming the inode.
75 * the inode will get written back by the linux code
76 */ 75 */
77void 76void
78xfs_mark_inode_dirty_sync( 77xfs_mark_inode_dirty_sync(
@@ -82,6 +81,10 @@ xfs_mark_inode_dirty_sync(
82 81
83 if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) 82 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
84 mark_inode_dirty_sync(inode); 83 mark_inode_dirty_sync(inode);
84 else {
85 barrier();
86 ip->i_update_core = 1;
87 }
85} 88}
86 89
87void 90void
@@ -92,6 +95,28 @@ xfs_mark_inode_dirty(
92 95
93 if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) 96 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
94 mark_inode_dirty(inode); 97 mark_inode_dirty(inode);
98 else {
99 barrier();
100 ip->i_update_core = 1;
101 }
102
103}
104
105
106int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
107 void *fs_info)
108{
109 const struct xattr *xattr;
110 struct xfs_inode *ip = XFS_I(inode);
111 int error = 0;
112
113 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
114 error = xfs_attr_set(ip, xattr->name, xattr->value,
115 xattr->value_len, ATTR_SECURE);
116 if (error < 0)
117 break;
118 }
119 return error;
95} 120}
96 121
97/* 122/*
@@ -100,31 +125,15 @@ xfs_mark_inode_dirty(
100 * these attrs can be journalled at inode creation time (along with the 125 * these attrs can be journalled at inode creation time (along with the
101 * inode, of course, such that log replay can't cause these to be lost). 126 * inode, of course, such that log replay can't cause these to be lost).
102 */ 127 */
128
103STATIC int 129STATIC int
104xfs_init_security( 130xfs_init_security(
105 struct inode *inode, 131 struct inode *inode,
106 struct inode *dir, 132 struct inode *dir,
107 const struct qstr *qstr) 133 const struct qstr *qstr)
108{ 134{
109 struct xfs_inode *ip = XFS_I(inode); 135 return security_inode_init_security(inode, dir, qstr,
110 size_t length; 136 &xfs_initxattrs, NULL);
111 void *value;
112 unsigned char *name;
113 int error;
114
115 error = security_inode_init_security(inode, dir, qstr, (char **)&name,
116 &value, &length);
117 if (error) {
118 if (error == -EOPNOTSUPP)
119 return 0;
120 return -error;
121 }
122
123 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
124
125 kfree(name);
126 kfree(value);
127 return error;
128} 137}
129 138
130static void 139static void
@@ -457,7 +466,7 @@ xfs_vn_getattr(
457 trace_xfs_getattr(ip); 466 trace_xfs_getattr(ip);
458 467
459 if (XFS_FORCED_SHUTDOWN(mp)) 468 if (XFS_FORCED_SHUTDOWN(mp))
460 return XFS_ERROR(EIO); 469 return -XFS_ERROR(EIO);
461 470
462 stat->size = XFS_ISIZE(ip); 471 stat->size = XFS_ISIZE(ip);
463 stat->dev = inode->i_sb->s_dev; 472 stat->dev = inode->i_sb->s_dev;
@@ -603,7 +612,7 @@ xfs_setattr_nonsize(
603 } 612 }
604 } 613 }
605 614
606 xfs_trans_ijoin(tp, ip); 615 xfs_trans_ijoin(tp, ip, 0);
607 616
608 /* 617 /*
609 * Change file ownership. Must be the owner or privileged. 618 * Change file ownership. Must be the owner or privileged.
@@ -825,16 +834,16 @@ xfs_setattr_size(
825 * care about here. 834 * care about here.
826 */ 835 */
827 if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { 836 if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
828 error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 837 error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
829 XBF_ASYNC, FI_NONE); 838 FI_NONE);
830 if (error) 839 if (error)
831 goto out_unlock; 840 goto out_unlock;
832 } 841 }
833 842
834 /* 843 /*
835 * Wait for all I/O to complete. 844 * Wait for all direct I/O to complete.
836 */ 845 */
837 xfs_ioend_wait(ip); 846 inode_dio_wait(inode);
838 847
839 error = -block_truncate_page(inode->i_mapping, iattr->ia_size, 848 error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
840 xfs_get_blocks); 849 xfs_get_blocks);
@@ -855,7 +864,7 @@ xfs_setattr_size(
855 864
856 xfs_ilock(ip, XFS_ILOCK_EXCL); 865 xfs_ilock(ip, XFS_ILOCK_EXCL);
857 866
858 xfs_trans_ijoin(tp, ip); 867 xfs_trans_ijoin(tp, ip, 0);
859 868
860 /* 869 /*
861 * Only change the c/mtime if we are changing the size or we are 870 * Only change the c/mtime if we are changing the size or we are
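
xfs_init_security() used to pull a single name/value pair out of the LSM and free it by hand; it now passes security_inode_init_security() the xfs_initxattrs callback defined earlier in this hunk, which receives a NULL-terminated array of xattrs and can set them all in one pass. For another filesystem the same contract would look like the sketch below; "foofs" and foofs_xattr_set are hypothetical, the callback signature is the one shown above:

	static int foofs_initxattrs(struct inode *inode,
				    const struct xattr *xattr_array,
				    void *fs_info)
	{
		const struct xattr *xattr;
		int error = 0;

		/* the array is terminated by an entry with a NULL name */
		for (xattr = xattr_array; xattr->name != NULL; xattr++) {
			error = foofs_xattr_set(inode, xattr->name,
						xattr->value,
						xattr->value_len);
			if (error < 0)
				break;
		}
		return error;
	}

	static int foofs_init_security(struct inode *inode, struct inode *dir,
				       const struct qstr *qstr)
	{
		return security_inode_init_security(inode, dir, qstr,
						    &foofs_initxattrs, NULL);
	}
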
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/xfs_iops.h
index ef41c92ce66e..ef41c92ce66e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/xfs_linux.h
index d42f814e4d35..828662f70d64 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,13 +32,12 @@
32# define XFS_BIG_INUMS 0 32# define XFS_BIG_INUMS 0
33#endif 33#endif
34 34
35#include <xfs_types.h> 35#include "xfs_types.h"
36 36
37#include <kmem.h> 37#include "kmem.h"
38#include <mrlock.h> 38#include "mrlock.h"
39#include <time.h> 39#include "time.h"
40 40#include "uuid.h"
41#include <support/uuid.h>
42 41
43#include <linux/semaphore.h> 42#include <linux/semaphore.h>
44#include <linux/mm.h> 43#include <linux/mm.h>
@@ -69,6 +68,8 @@
69#include <linux/ctype.h> 68#include <linux/ctype.h>
70#include <linux/writeback.h> 69#include <linux/writeback.h>
71#include <linux/capability.h> 70#include <linux/capability.h>
71#include <linux/kthread.h>
72#include <linux/freezer.h>
72#include <linux/list_sort.h> 73#include <linux/list_sort.h>
73 74
74#include <asm/page.h> 75#include <asm/page.h>
@@ -78,14 +79,14 @@
78#include <asm/byteorder.h> 79#include <asm/byteorder.h>
79#include <asm/unaligned.h> 80#include <asm/unaligned.h>
80 81
81#include <xfs_vnode.h> 82#include "xfs_vnode.h"
82#include <xfs_stats.h> 83#include "xfs_stats.h"
83#include <xfs_sysctl.h> 84#include "xfs_sysctl.h"
84#include <xfs_iops.h> 85#include "xfs_iops.h"
85#include <xfs_aops.h> 86#include "xfs_aops.h"
86#include <xfs_super.h> 87#include "xfs_super.h"
87#include <xfs_buf.h> 88#include "xfs_buf.h"
88#include <xfs_message.h> 89#include "xfs_message.h"
89 90
90#ifdef __BIG_ENDIAN 91#ifdef __BIG_ENDIAN
91#define XFS_NATIVE_HOST 1 92#define XFS_NATIVE_HOST 1
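
The include changes fall out of the directory flattening visible in the renames throughout this series: fs/xfs/linux-2.6/ and fs/xfs/quota/ are folded into fs/xfs/, so headers that used to be found through extra include search paths (an assumption about the old Makefile setup) are now reached with ordinary quoted includes relative to the including file:

	/* before: resolved via an -I fs/xfs/linux-2.6 style search path */
	#include <xfs_vnode.h>

	/* after: same directory as the including file */
	#include "xfs_vnode.h"
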
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 06ff8437ed8e..2758a6277c52 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -878,10 +878,10 @@ xlog_iodone(xfs_buf_t *bp)
878 /* 878 /*
879 * Race to shutdown the filesystem if we see an error. 879 * Race to shutdown the filesystem if we see an error.
880 */ 880 */
881 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, 881 if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
882 XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { 882 XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
883 xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); 883 xfs_buf_ioerror_alert(bp, __func__);
884 XFS_BUF_STALE(bp); 884 xfs_buf_stale(bp);
885 xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); 885 xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
886 /* 886 /*
887 * This flag will be propagated to the trans-committed 887 * This flag will be propagated to the trans-committed
@@ -1047,11 +1047,10 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 xlog_get_iclog_buffer_size(mp, log); 1047 xlog_get_iclog_buffer_size(mp, log);
1048 1048
1049 error = ENOMEM; 1049 error = ENOMEM;
1050 bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); 1050 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0);
1051 if (!bp) 1051 if (!bp)
1052 goto out_free_log; 1052 goto out_free_log;
1053 bp->b_iodone = xlog_iodone; 1053 bp->b_iodone = xlog_iodone;
1054 ASSERT(XFS_BUF_ISBUSY(bp));
1055 ASSERT(xfs_buf_islocked(bp)); 1054 ASSERT(xfs_buf_islocked(bp));
1056 log->l_xbuf = bp; 1055 log->l_xbuf = bp;
1057 1056
@@ -1108,7 +1107,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1108 iclog->ic_callback_tail = &(iclog->ic_callback); 1107 iclog->ic_callback_tail = &(iclog->ic_callback);
1109 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; 1108 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1110 1109
1111 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1112 ASSERT(xfs_buf_islocked(iclog->ic_bp)); 1110 ASSERT(xfs_buf_islocked(iclog->ic_bp));
1113 init_waitqueue_head(&iclog->ic_force_wait); 1111 init_waitqueue_head(&iclog->ic_force_wait);
1114 init_waitqueue_head(&iclog->ic_write_wait); 1112 init_waitqueue_head(&iclog->ic_write_wait);
@@ -1248,8 +1246,8 @@ xlog_bdstrat(
1248 struct xlog_in_core *iclog = bp->b_fspriv; 1246 struct xlog_in_core *iclog = bp->b_fspriv;
1249 1247
1250 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1248 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1251 XFS_BUF_ERROR(bp, EIO); 1249 xfs_buf_ioerror(bp, EIO);
1252 XFS_BUF_STALE(bp); 1250 xfs_buf_stale(bp);
1253 xfs_buf_ioend(bp, 0); 1251 xfs_buf_ioend(bp, 0);
1254 /* 1252 /*
1255 * It would seem logical to return EIO here, but we rely on 1253 * It would seem logical to return EIO here, but we rely on
@@ -1355,7 +1353,6 @@ xlog_sync(xlog_t *log,
1355 XFS_BUF_SET_COUNT(bp, count); 1353 XFS_BUF_SET_COUNT(bp, count);
1356 bp->b_fspriv = iclog; 1354 bp->b_fspriv = iclog;
1357 XFS_BUF_ZEROFLAGS(bp); 1355 XFS_BUF_ZEROFLAGS(bp);
1358 XFS_BUF_BUSY(bp);
1359 XFS_BUF_ASYNC(bp); 1356 XFS_BUF_ASYNC(bp);
1360 bp->b_flags |= XBF_SYNCIO; 1357 bp->b_flags |= XBF_SYNCIO;
1361 1358
@@ -1390,24 +1387,23 @@ xlog_sync(xlog_t *log,
1390 */ 1387 */
1391 XFS_BUF_WRITE(bp); 1388 XFS_BUF_WRITE(bp);
1392 1389
1393 if ((error = xlog_bdstrat(bp))) { 1390 error = xlog_bdstrat(bp);
1394 xfs_ioerror_alert("xlog_sync", log->l_mp, bp, 1391 if (error) {
1395 XFS_BUF_ADDR(bp)); 1392 xfs_buf_ioerror_alert(bp, "xlog_sync");
1396 return error; 1393 return error;
1397 } 1394 }
1398 if (split) { 1395 if (split) {
1399 bp = iclog->ic_log->l_xbuf; 1396 bp = iclog->ic_log->l_xbuf;
1400 XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ 1397 XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */
1401 XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ 1398 xfs_buf_associate_memory(bp,
1402 (__psint_t)count), split); 1399 (char *)&iclog->ic_header + count, split);
1403 bp->b_fspriv = iclog; 1400 bp->b_fspriv = iclog;
1404 XFS_BUF_ZEROFLAGS(bp); 1401 XFS_BUF_ZEROFLAGS(bp);
1405 XFS_BUF_BUSY(bp);
1406 XFS_BUF_ASYNC(bp); 1402 XFS_BUF_ASYNC(bp);
1407 bp->b_flags |= XBF_SYNCIO; 1403 bp->b_flags |= XBF_SYNCIO;
1408 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1404 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1409 bp->b_flags |= XBF_FUA; 1405 bp->b_flags |= XBF_FUA;
1410 dptr = XFS_BUF_PTR(bp); 1406 dptr = bp->b_addr;
1411 /* 1407 /*
1412 * Bump the cycle numbers at the start of each block 1408 * Bump the cycle numbers at the start of each block
1413 * since this part of the buffer is at the start of 1409 * since this part of the buffer is at the start of
@@ -1427,9 +1423,9 @@ xlog_sync(xlog_t *log,
1427 /* account for internal log which doesn't start at block #0 */ 1423 /* account for internal log which doesn't start at block #0 */
1428 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); 1424 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1429 XFS_BUF_WRITE(bp); 1425 XFS_BUF_WRITE(bp);
1430 if ((error = xlog_bdstrat(bp))) { 1426 error = xlog_bdstrat(bp);
1431 xfs_ioerror_alert("xlog_sync (split)", log->l_mp, 1427 if (error) {
1432 bp, XFS_BUF_ADDR(bp)); 1428 xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
1433 return error; 1429 return error;
1434 } 1430 }
1435 } 1431 }
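
The four-argument xfs_ioerror_alert(label, mp, bp, daddr) is replaced by xfs_buf_ioerror_alert(bp, func): the mount and the block address are recoverable from the buffer itself, so callers only supply a label, usually __func__. The resulting error-path idiom, as used twice in this hunk:

	error = xlog_bdstrat(bp);
	if (error) {
		/* mount and daddr are derived from bp internally */
		xfs_buf_ioerror_alert(bp, __func__);
		return error;
	}
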
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 052a2c0ec5fb..541a508adea1 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -147,7 +147,7 @@ xlog_align(
147 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); 147 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
148 148
149 ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); 149 ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
150 return XFS_BUF_PTR(bp) + BBTOB(offset); 150 return bp->b_addr + BBTOB(offset);
151} 151}
152 152
153 153
@@ -178,15 +178,12 @@ xlog_bread_noalign(
178 178
179 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 179 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
180 XFS_BUF_READ(bp); 180 XFS_BUF_READ(bp);
181 XFS_BUF_BUSY(bp);
182 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 181 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
183 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
184 182
185 xfsbdstrat(log->l_mp, bp); 183 xfsbdstrat(log->l_mp, bp);
186 error = xfs_buf_iowait(bp); 184 error = xfs_buf_iowait(bp);
187 if (error) 185 if (error)
188 xfs_ioerror_alert("xlog_bread", log->l_mp, 186 xfs_buf_ioerror_alert(bp, __func__);
189 bp, XFS_BUF_ADDR(bp));
190 return error; 187 return error;
191} 188}
192 189
@@ -220,18 +217,18 @@ xlog_bread_offset(
220 xfs_buf_t *bp, 217 xfs_buf_t *bp,
221 xfs_caddr_t offset) 218 xfs_caddr_t offset)
222{ 219{
223 xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); 220 xfs_caddr_t orig_offset = bp->b_addr;
224 int orig_len = bp->b_buffer_length; 221 int orig_len = bp->b_buffer_length;
225 int error, error2; 222 int error, error2;
226 223
227 error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); 224 error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
228 if (error) 225 if (error)
229 return error; 226 return error;
230 227
231 error = xlog_bread_noalign(log, blk_no, nbblks, bp); 228 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
232 229
233 /* must reset buffer pointer even on error */ 230 /* must reset buffer pointer even on error */
234 error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); 231 error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
235 if (error) 232 if (error)
236 return error; 233 return error;
237 return error2; 234 return error2;
@@ -266,15 +263,14 @@ xlog_bwrite(
266 263
267 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 264 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
268 XFS_BUF_ZEROFLAGS(bp); 265 XFS_BUF_ZEROFLAGS(bp);
269 XFS_BUF_BUSY(bp); 266 xfs_buf_hold(bp);
270 XFS_BUF_HOLD(bp);
271 xfs_buf_lock(bp); 267 xfs_buf_lock(bp);
272 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 268 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
273 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
274 269
275 if ((error = xfs_bwrite(log->l_mp, bp))) 270 error = xfs_bwrite(bp);
276 xfs_ioerror_alert("xlog_bwrite", log->l_mp, 271 if (error)
277 bp, XFS_BUF_ADDR(bp)); 272 xfs_buf_ioerror_alert(bp, __func__);
273 xfs_buf_relse(bp);
278 return error; 274 return error;
279} 275}
280 276
@@ -360,14 +356,12 @@ STATIC void
360xlog_recover_iodone( 356xlog_recover_iodone(
361 struct xfs_buf *bp) 357 struct xfs_buf *bp)
362{ 358{
363 if (XFS_BUF_GETERROR(bp)) { 359 if (bp->b_error) {
364 /* 360 /*
365 * We're not going to bother about retrying 361 * We're not going to bother about retrying
366 * this during recovery. One strike! 362 * this during recovery. One strike!
367 */ 363 */
368 xfs_ioerror_alert("xlog_recover_iodone", 364 xfs_buf_ioerror_alert(bp, __func__);
369 bp->b_target->bt_mount, bp,
370 XFS_BUF_ADDR(bp));
371 xfs_force_shutdown(bp->b_target->bt_mount, 365 xfs_force_shutdown(bp->b_target->bt_mount,
372 SHUTDOWN_META_IO_ERROR); 366 SHUTDOWN_META_IO_ERROR);
373 } 367 }
@@ -1262,7 +1256,7 @@ xlog_write_log_records(
1262 */ 1256 */
1263 ealign = round_down(end_block, sectbb); 1257 ealign = round_down(end_block, sectbb);
1264 if (j == 0 && (start_block + endcount > ealign)) { 1258 if (j == 0 && (start_block + endcount > ealign)) {
1265 offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); 1259 offset = bp->b_addr + BBTOB(ealign - start_block);
1266 error = xlog_bread_offset(log, ealign, sectbb, 1260 error = xlog_bread_offset(log, ealign, sectbb,
1267 bp, offset); 1261 bp, offset);
1268 if (error) 1262 if (error)
@@ -2135,15 +2129,15 @@ xlog_recover_buffer_pass2(
2135 2129
2136 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2130 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2137 buf_flags); 2131 buf_flags);
2138 if (XFS_BUF_ISERROR(bp)) { 2132 if (!bp)
2139 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, 2133 return XFS_ERROR(ENOMEM);
2140 bp, buf_f->blf_blkno); 2134 error = bp->b_error;
2141 error = XFS_BUF_GETERROR(bp); 2135 if (error) {
2136 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2142 xfs_buf_relse(bp); 2137 xfs_buf_relse(bp);
2143 return error; 2138 return error;
2144 } 2139 }
2145 2140
2146 error = 0;
2147 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 2141 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2148 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2142 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2149 } else if (buf_f->blf_flags & 2143 } else if (buf_f->blf_flags &
@@ -2174,15 +2168,16 @@ xlog_recover_buffer_pass2(
2174 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 2168 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2175 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, 2169 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2176 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { 2170 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2177 XFS_BUF_STALE(bp); 2171 xfs_buf_stale(bp);
2178 error = xfs_bwrite(mp, bp); 2172 error = xfs_bwrite(bp);
2179 } else { 2173 } else {
2180 ASSERT(bp->b_target->bt_mount == mp); 2174 ASSERT(bp->b_target->bt_mount == mp);
2181 bp->b_iodone = xlog_recover_iodone; 2175 bp->b_iodone = xlog_recover_iodone;
2182 xfs_bdwrite(mp, bp); 2176 xfs_buf_delwri_queue(bp);
2183 } 2177 }
2184 2178
2185 return (error); 2179 xfs_buf_relse(bp);
2180 return error;
2186} 2181}
2187 2182
2188STATIC int 2183STATIC int
@@ -2227,14 +2222,16 @@ xlog_recover_inode_pass2(
2227 2222
2228 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2223 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2229 XBF_LOCK); 2224 XBF_LOCK);
2230 if (XFS_BUF_ISERROR(bp)) { 2225 if (!bp) {
2231 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2226 error = ENOMEM;
2232 bp, in_f->ilf_blkno); 2227 goto error;
2233 error = XFS_BUF_GETERROR(bp); 2228 }
2229 error = bp->b_error;
2230 if (error) {
2231 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2234 xfs_buf_relse(bp); 2232 xfs_buf_relse(bp);
2235 goto error; 2233 goto error;
2236 } 2234 }
2237 error = 0;
2238 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2235 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2239 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); 2236 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2240 2237
@@ -2439,7 +2436,8 @@ xlog_recover_inode_pass2(
2439write_inode_buffer: 2436write_inode_buffer:
2440 ASSERT(bp->b_target->bt_mount == mp); 2437 ASSERT(bp->b_target->bt_mount == mp);
2441 bp->b_iodone = xlog_recover_iodone; 2438 bp->b_iodone = xlog_recover_iodone;
2442 xfs_bdwrite(mp, bp); 2439 xfs_buf_delwri_queue(bp);
2440 xfs_buf_relse(bp);
2443error: 2441error:
2444 if (need_free) 2442 if (need_free)
2445 kmem_free(in_f); 2443 kmem_free(in_f);
@@ -2537,8 +2535,7 @@ xlog_recover_dquot_pass2(
2537 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 2535 XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2538 0, &bp); 2536 0, &bp);
2539 if (error) { 2537 if (error) {
2540 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, 2538 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)");
2541 bp, dq_f->qlf_blkno);
2542 return error; 2539 return error;
2543 } 2540 }
2544 ASSERT(bp); 2541 ASSERT(bp);
@@ -2561,7 +2558,8 @@ xlog_recover_dquot_pass2(
2561 ASSERT(dq_f->qlf_size == 2); 2558 ASSERT(dq_f->qlf_size == 2);
2562 ASSERT(bp->b_target->bt_mount == mp); 2559 ASSERT(bp->b_target->bt_mount == mp);
2563 bp->b_iodone = xlog_recover_iodone; 2560 bp->b_iodone = xlog_recover_iodone;
2564 xfs_bdwrite(mp, bp); 2561 xfs_buf_delwri_queue(bp);
2562 xfs_buf_relse(bp);
2565 2563
2566 return (0); 2564 return (0);
2567} 2565}
@@ -3437,7 +3435,7 @@ xlog_do_recovery_pass(
3437 /* 3435 /*
3438 * Check for header wrapping around physical end-of-log 3436 * Check for header wrapping around physical end-of-log
3439 */ 3437 */
3440 offset = XFS_BUF_PTR(hbp); 3438 offset = hbp->b_addr;
3441 split_hblks = 0; 3439 split_hblks = 0;
3442 wrapped_hblks = 0; 3440 wrapped_hblks = 0;
3443 if (blk_no + hblks <= log->l_logBBsize) { 3441 if (blk_no + hblks <= log->l_logBBsize) {
@@ -3497,7 +3495,7 @@ xlog_do_recovery_pass(
3497 } else { 3495 } else {
3498 /* This log record is split across the 3496 /* This log record is split across the
3499 * physical end of log */ 3497 * physical end of log */
3500 offset = XFS_BUF_PTR(dbp); 3498 offset = dbp->b_addr;
3501 split_bblks = 0; 3499 split_bblks = 0;
3502 if (blk_no != log->l_logBBsize) { 3500 if (blk_no != log->l_logBBsize) {
3503 /* some data is before the physical 3501 /* some data is before the physical
@@ -3656,7 +3654,7 @@ xlog_do_recover(
3656 return error; 3654 return error;
3657 } 3655 }
3658 3656
3659 XFS_bflush(log->l_mp->m_ddev_targp); 3657 xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
3660 3658
3661 /* 3659 /*
3662 * If IO errors happened during recovery, bail out. 3660 * If IO errors happened during recovery, bail out.
@@ -3689,8 +3687,7 @@ xlog_do_recover(
3689 xfsbdstrat(log->l_mp, bp); 3687 xfsbdstrat(log->l_mp, bp);
3690 error = xfs_buf_iowait(bp); 3688 error = xfs_buf_iowait(bp);
3691 if (error) { 3689 if (error) {
3692 xfs_ioerror_alert("xlog_do_recover", 3690 xfs_buf_ioerror_alert(bp, __func__);
3693 log->l_mp, bp, XFS_BUF_ADDR(bp));
3694 ASSERT(0); 3691 ASSERT(0);
3695 xfs_buf_relse(bp); 3692 xfs_buf_relse(bp);
3696 return error; 3693 return error;
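
Log recovery reads now distinguish two failure modes that the old XFS_BUF_ISERROR/XFS_BUF_GETERROR macros conflated: xfs_buf_read() returning NULL (no buffer could be allocated, mapped to ENOMEM) versus a buffer that came back with bp->b_error set by the I/O. The pattern used by both pass-2 readers above, with blkno, len and flags standing in for the per-item arguments:

	bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, flags);
	if (!bp)
		return XFS_ERROR(ENOMEM);	/* no buffer at all */

	error = bp->b_error;			/* replaces XFS_BUF_GETERROR() */
	if (error) {
		xfs_buf_ioerror_alert(bp, __func__);
		xfs_buf_relse(bp);
		return error;
	}
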
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/xfs_message.c
index bd672def95ac..bd672def95ac 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/xfs_message.c
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/xfs_message.h
index 7fb7ea007672..7fb7ea007672 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/xfs_message.h
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 092e16ae4d9d..d06afbc3540d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -44,9 +44,6 @@
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45 45
46 46
47STATIC void xfs_unmountfs_wait(xfs_mount_t *);
48
49
50#ifdef HAVE_PERCPU_SB 47#ifdef HAVE_PERCPU_SB
51STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, 48STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
52 int); 49 int);
@@ -1484,7 +1481,7 @@ xfs_unmountfs(
1484 * state as much as possible. 1481 * state as much as possible.
1485 */ 1482 */
1486 xfs_reclaim_inodes(mp, 0); 1483 xfs_reclaim_inodes(mp, 0);
1487 XFS_bflush(mp->m_ddev_targp); 1484 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1488 xfs_reclaim_inodes(mp, SYNC_WAIT); 1485 xfs_reclaim_inodes(mp, SYNC_WAIT);
1489 1486
1490 xfs_qm_unmount(mp); 1487 xfs_qm_unmount(mp);
@@ -1496,11 +1493,6 @@ xfs_unmountfs(
1496 */ 1493 */
1497 xfs_log_force(mp, XFS_LOG_SYNC); 1494 xfs_log_force(mp, XFS_LOG_SYNC);
1498 1495
1499 xfs_binval(mp->m_ddev_targp);
1500 if (mp->m_rtdev_targp) {
1501 xfs_binval(mp->m_rtdev_targp);
1502 }
1503
1504 /* 1496 /*
1505 * Unreserve any blocks we have so that when we unmount we don't account 1497 * Unreserve any blocks we have so that when we unmount we don't account
1506 * the reserved free space as used. This is really only necessary for 1498 * the reserved free space as used. This is really only necessary for
@@ -1526,7 +1518,16 @@ xfs_unmountfs(
1526 xfs_warn(mp, "Unable to update superblock counters. " 1518 xfs_warn(mp, "Unable to update superblock counters. "
1527 "Freespace may not be correct on next mount."); 1519 "Freespace may not be correct on next mount.");
1528 xfs_unmountfs_writesb(mp); 1520 xfs_unmountfs_writesb(mp);
1529 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1521
1522 /*
1523 * Make sure all buffers have been flushed and completed before
1524 * unmounting the log.
1525 */
1526 error = xfs_flush_buftarg(mp->m_ddev_targp, 1);
1527 if (error)
1528 xfs_warn(mp, "%d busy buffers during unmount.", error);
1529 xfs_wait_buftarg(mp->m_ddev_targp);
1530
1530 xfs_log_unmount_write(mp); 1531 xfs_log_unmount_write(mp);
1531 xfs_log_unmount(mp); 1532 xfs_log_unmount(mp);
1532 xfs_uuid_unmount(mp); 1533 xfs_uuid_unmount(mp);
@@ -1537,16 +1538,6 @@ xfs_unmountfs(
1537 xfs_free_perag(mp); 1538 xfs_free_perag(mp);
1538} 1539}
1539 1540
1540STATIC void
1541xfs_unmountfs_wait(xfs_mount_t *mp)
1542{
1543 if (mp->m_logdev_targp != mp->m_ddev_targp)
1544 xfs_wait_buftarg(mp->m_logdev_targp);
1545 if (mp->m_rtdev_targp)
1546 xfs_wait_buftarg(mp->m_rtdev_targp);
1547 xfs_wait_buftarg(mp->m_ddev_targp);
1548}
1549
1550int 1541int
1551xfs_fs_writable(xfs_mount_t *mp) 1542xfs_fs_writable(xfs_mount_t *mp)
1552{ 1543{
@@ -1612,15 +1603,14 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1612 1603
1613 XFS_BUF_UNDONE(sbp); 1604 XFS_BUF_UNDONE(sbp);
1614 XFS_BUF_UNREAD(sbp); 1605 XFS_BUF_UNREAD(sbp);
1615 XFS_BUF_UNDELAYWRITE(sbp); 1606 xfs_buf_delwri_dequeue(sbp);
1616 XFS_BUF_WRITE(sbp); 1607 XFS_BUF_WRITE(sbp);
1617 XFS_BUF_UNASYNC(sbp); 1608 XFS_BUF_UNASYNC(sbp);
1618 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); 1609 ASSERT(sbp->b_target == mp->m_ddev_targp);
1619 xfsbdstrat(mp, sbp); 1610 xfsbdstrat(mp, sbp);
1620 error = xfs_buf_iowait(sbp); 1611 error = xfs_buf_iowait(sbp);
1621 if (error) 1612 if (error)
1622 xfs_ioerror_alert("xfs_unmountfs_writesb", 1613 xfs_buf_ioerror_alert(sbp, __func__);
1623 mp, sbp, XFS_BUF_ADDR(sbp));
1624 xfs_buf_relse(sbp); 1614 xfs_buf_relse(sbp);
1625 } 1615 }
1626 return error; 1616 return error;
@@ -1938,7 +1928,7 @@ xfs_getsb(
1938 xfs_buf_lock(bp); 1928 xfs_buf_lock(bp);
1939 } 1929 }
1940 1930
1941 XFS_BUF_HOLD(bp); 1931 xfs_buf_hold(bp);
1942 ASSERT(XFS_BUF_ISDONE(bp)); 1932 ASSERT(XFS_BUF_ISDONE(bp));
1943 return bp; 1933 return bp;
1944} 1934}
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/xfs_qm.c
index 46e54ad9a2dc..5cff443f6cdb 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1240,7 +1240,7 @@ xfs_qm_reset_dqcounts(
1240 do_div(j, sizeof(xfs_dqblk_t)); 1240 do_div(j, sizeof(xfs_dqblk_t));
1241 ASSERT(mp->m_quotainfo->qi_dqperchunk == j); 1241 ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
1242#endif 1242#endif
1243 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); 1243 ddq = bp->b_addr;
1244 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { 1244 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
1245 /* 1245 /*
1246 * Do a sanity check, and if needed, repair the dqblk. Don't 1246 * Do a sanity check, and if needed, repair the dqblk. Don't
@@ -1296,7 +1296,8 @@ xfs_qm_dqiter_bufs(
1296 break; 1296 break;
1297 1297
1298 xfs_qm_reset_dqcounts(mp, bp, firstid, type); 1298 xfs_qm_reset_dqcounts(mp, bp, firstid, type);
1299 xfs_bdwrite(mp, bp); 1299 xfs_buf_delwri_queue(bp);
1300 xfs_buf_relse(bp);
1300 /* 1301 /*
1301 * goto the next block. 1302 * goto the next block.
1302 */ 1303 */
@@ -1346,11 +1347,8 @@ xfs_qm_dqiterate(
1346 * the inode is never added to the transaction. 1347 * the inode is never added to the transaction.
1347 */ 1348 */
1348 xfs_ilock(qip, XFS_ILOCK_SHARED); 1349 xfs_ilock(qip, XFS_ILOCK_SHARED);
1349 error = xfs_bmapi(NULL, qip, lblkno, 1350 error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
1350 maxlblkcnt - lblkno, 1351 map, &nmaps, 0);
1351 XFS_BMAPI_METADATA,
1352 NULL,
1353 0, map, &nmaps, NULL);
1354 xfs_iunlock(qip, XFS_ILOCK_SHARED); 1352 xfs_iunlock(qip, XFS_ILOCK_SHARED);
1355 if (error) 1353 if (error)
1356 break; 1354 break;
@@ -1683,7 +1681,7 @@ xfs_qm_quotacheck(
1683 * quotacheck'd stamp on the superblock. So, here we do a synchronous 1681 * quotacheck'd stamp on the superblock. So, here we do a synchronous
1684 * flush. 1682 * flush.
1685 */ 1683 */
1686 XFS_bflush(mp->m_ddev_targp); 1684 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1687 1685
1688 /* 1686 /*
1689 * If one type of quotas is off, then it will lose its 1687 * If one type of quotas is off, then it will lose its
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/xfs_qm.h
index 43b9abe1052c..43b9abe1052c 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index a0a829addca9..a0a829addca9 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
index 8671a0b32644..8671a0b32644 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/xfs_qm_stats.c
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
index 5b964fc0dc09..5b964fc0dc09 100644
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ b/fs/xfs/xfs_qm_stats.h
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 609246f42e6c..5cc3dde1bc90 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile(
261 } 261 }
262 262
263 xfs_ilock(ip, XFS_ILOCK_EXCL); 263 xfs_ilock(ip, XFS_ILOCK_EXCL);
264 xfs_trans_ijoin(tp, ip); 264 xfs_trans_ijoin(tp, ip, 0);
265 265
266 error = xfs_itruncate_data(&tp, ip, 0); 266 error = xfs_itruncate_data(&tp, ip, 0);
267 if (error) { 267 if (error) {
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
index 94a3d927d716..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/xfs_quota_priv.h
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 29b9d642e93d..7e76f537abb7 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -25,7 +25,7 @@
25#include "xfs_trans.h" 25#include "xfs_trans.h"
26#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
27#include "xfs_inode.h" 27#include "xfs_inode.h"
28#include "quota/xfs_qm.h" 28#include "xfs_qm.h"
29#include <linux/quota.h> 29#include <linux/quota.h>
30 30
31 31
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index df78c297d1a1..866de277079a 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -170,12 +170,12 @@ xfs_rename(
170 * we can rely on either trans_commit or trans_cancel to unlock 170 * we can rely on either trans_commit or trans_cancel to unlock
171 * them. 171 * them.
172 */ 172 */
173 xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); 173 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
174 if (new_parent) 174 if (new_parent)
175 xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); 175 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
176 xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); 176 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
177 if (target_ip) 177 if (target_ip)
178 xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); 178 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
179 179
180 /* 180 /*
181 * If we are using project inheritance, we only allow renames 181 * If we are using project inheritance, we only allow renames
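
xfs_trans_ijoin_ref(tp, ip, lock_flags) and the old two-argument xfs_trans_ijoin(tp, ip) collapse into a single xfs_trans_ijoin(tp, ip, lock_flags). As the comment above notes, passing the lock flags transfers lock ownership to the transaction, which unlocks at commit or cancel; passing 0 keeps the lock with the caller. Both forms as they appear throughout this series:

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	/* transaction owns the lock; trans_commit/cancel unlocks it */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	/* or: join only, the caller unlocks explicitly later */
	xfs_trans_ijoin(tp, ip, 0);
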
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 8f76fdff4f46..87323f1ded64 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -112,7 +112,7 @@ xfs_growfs_rt_alloc(
112 * Lock the inode. 112 * Lock the inode.
113 */ 113 */
114 xfs_ilock(ip, XFS_ILOCK_EXCL); 114 xfs_ilock(ip, XFS_ILOCK_EXCL);
115 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 115 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
116 116
117 xfs_bmap_init(&flist, &firstblock); 117 xfs_bmap_init(&flist, &firstblock);
118 /* 118 /*
@@ -120,9 +120,9 @@ xfs_growfs_rt_alloc(
120 */ 120 */
121 nmap = 1; 121 nmap = 1;
122 cancelflags |= XFS_TRANS_ABORT; 122 cancelflags |= XFS_TRANS_ABORT;
123 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, 123 error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
124 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, 124 XFS_BMAPI_METADATA, &firstblock,
125 resblks, &map, &nmap, &flist); 125 resblks, &map, &nmap, &flist);
126 if (!error && nmap < 1) 126 if (!error && nmap < 1)
127 error = XFS_ERROR(ENOSPC); 127 error = XFS_ERROR(ENOSPC);
128 if (error) 128 if (error)
@@ -155,7 +155,7 @@ xfs_growfs_rt_alloc(
155 * Lock the bitmap inode. 155 * Lock the bitmap inode.
156 */ 156 */
157 xfs_ilock(ip, XFS_ILOCK_EXCL); 157 xfs_ilock(ip, XFS_ILOCK_EXCL);
158 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 158 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
159 /* 159 /*
160 * Get a buffer for the block. 160 * Get a buffer for the block.
161 */ 161 */
@@ -168,7 +168,7 @@ error_cancel:
168 xfs_trans_cancel(tp, cancelflags); 168 xfs_trans_cancel(tp, cancelflags);
169 goto error; 169 goto error;
170 } 170 }
171 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); 171 memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
172 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); 172 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
173 /* 173 /*
174 * Commit the transaction. 174 * Commit the transaction.
@@ -856,34 +856,24 @@ xfs_rtbuf_get(
856 xfs_buf_t **bpp) /* output: buffer for the block */ 856 xfs_buf_t **bpp) /* output: buffer for the block */
857{ 857{
858 xfs_buf_t *bp; /* block buffer, result */ 858 xfs_buf_t *bp; /* block buffer, result */
859 xfs_daddr_t d; /* disk addr of block */
860 int error; /* error value */
861 xfs_fsblock_t fsb; /* fs block number for block */
862 xfs_inode_t *ip; /* bitmap or summary inode */ 859 xfs_inode_t *ip; /* bitmap or summary inode */
860 xfs_bmbt_irec_t map;
 861 int nmap = 1;
862 int error; /* error value */
863 863
864 ip = issum ? mp->m_rsumip : mp->m_rbmip; 864 ip = issum ? mp->m_rsumip : mp->m_rbmip;
865 /* 865
866 * Map from the file offset (block) and inode number to the 866 error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
867 * file system block. 867 if (error)
868 */
869 error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block);
870 if (error) {
871 return error; 868 return error;
872 } 869
873 ASSERT(fsb != NULLFSBLOCK); 870 ASSERT(map.br_startblock != NULLFSBLOCK);
874 /* 871 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
875 * Convert to disk address for buffer cache. 872 XFS_FSB_TO_DADDR(mp, map.br_startblock),
876 */
877 d = XFS_FSB_TO_DADDR(mp, fsb);
878 /*
879 * Read the buffer.
880 */
881 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
882 mp->m_bsize, 0, &bp); 873 mp->m_bsize, 0, &bp);
883 if (error) { 874 if (error)
884 return error; 875 return error;
885 } 876 ASSERT(!xfs_buf_geterror(bp));
886 ASSERT(bp && !XFS_BUF_GETERROR(bp));
887 *bpp = bp; 877 *bpp = bp;
888 return 0; 878 return 0;
889} 879}
@@ -943,7 +933,7 @@ xfs_rtcheck_range(
943 if (error) { 933 if (error) {
944 return error; 934 return error;
945 } 935 }
946 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 936 bufp = bp->b_addr;
947 /* 937 /*
948 * Compute the starting word's address, and starting bit. 938 * Compute the starting word's address, and starting bit.
949 */ 939 */
@@ -994,7 +984,7 @@ xfs_rtcheck_range(
994 if (error) { 984 if (error) {
995 return error; 985 return error;
996 } 986 }
997 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 987 b = bufp = bp->b_addr;
998 word = 0; 988 word = 0;
999 } else { 989 } else {
1000 /* 990 /*
@@ -1040,7 +1030,7 @@ xfs_rtcheck_range(
1040 if (error) { 1030 if (error) {
1041 return error; 1031 return error;
1042 } 1032 }
1043 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1033 b = bufp = bp->b_addr;
1044 word = 0; 1034 word = 0;
1045 } else { 1035 } else {
1046 /* 1036 /*
@@ -1158,7 +1148,7 @@ xfs_rtfind_back(
1158 if (error) { 1148 if (error) {
1159 return error; 1149 return error;
1160 } 1150 }
1161 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1151 bufp = bp->b_addr;
1162 /* 1152 /*
1163 * Get the first word's index & point to it. 1153 * Get the first word's index & point to it.
1164 */ 1154 */
@@ -1210,7 +1200,7 @@ xfs_rtfind_back(
1210 if (error) { 1200 if (error) {
1211 return error; 1201 return error;
1212 } 1202 }
1213 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1203 bufp = bp->b_addr;
1214 word = XFS_BLOCKWMASK(mp); 1204 word = XFS_BLOCKWMASK(mp);
1215 b = &bufp[word]; 1205 b = &bufp[word];
1216 } else { 1206 } else {
@@ -1256,7 +1246,7 @@ xfs_rtfind_back(
1256 if (error) { 1246 if (error) {
1257 return error; 1247 return error;
1258 } 1248 }
1259 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1249 bufp = bp->b_addr;
1260 word = XFS_BLOCKWMASK(mp); 1250 word = XFS_BLOCKWMASK(mp);
1261 b = &bufp[word]; 1251 b = &bufp[word];
1262 } else { 1252 } else {
@@ -1333,7 +1323,7 @@ xfs_rtfind_forw(
1333 if (error) { 1323 if (error) {
1334 return error; 1324 return error;
1335 } 1325 }
1336 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1326 bufp = bp->b_addr;
1337 /* 1327 /*
1338 * Get the first word's index & point to it. 1328 * Get the first word's index & point to it.
1339 */ 1329 */
@@ -1384,7 +1374,7 @@ xfs_rtfind_forw(
1384 if (error) { 1374 if (error) {
1385 return error; 1375 return error;
1386 } 1376 }
1387 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1377 b = bufp = bp->b_addr;
1388 word = 0; 1378 word = 0;
1389 } else { 1379 } else {
1390 /* 1380 /*
@@ -1429,7 +1419,7 @@ xfs_rtfind_forw(
1429 if (error) { 1419 if (error) {
1430 return error; 1420 return error;
1431 } 1421 }
1432 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1422 b = bufp = bp->b_addr;
1433 word = 0; 1423 word = 0;
1434 } else { 1424 } else {
1435 /* 1425 /*
@@ -1649,7 +1639,7 @@ xfs_rtmodify_range(
1649 if (error) { 1639 if (error) {
1650 return error; 1640 return error;
1651 } 1641 }
1652 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1642 bufp = bp->b_addr;
1653 /* 1643 /*
1654 * Compute the starting word's address, and starting bit. 1644 * Compute the starting word's address, and starting bit.
1655 */ 1645 */
@@ -1694,7 +1684,7 @@ xfs_rtmodify_range(
1694 if (error) { 1684 if (error) {
1695 return error; 1685 return error;
1696 } 1686 }
1697 first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1687 first = b = bufp = bp->b_addr;
1698 word = 0; 1688 word = 0;
1699 } else { 1689 } else {
1700 /* 1690 /*
@@ -1734,7 +1724,7 @@ xfs_rtmodify_range(
1734 if (error) { 1724 if (error) {
1735 return error; 1725 return error;
1736 } 1726 }
1737 first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); 1727 first = b = bufp = bp->b_addr;
1738 word = 0; 1728 word = 0;
1739 } else { 1729 } else {
1740 /* 1730 /*
@@ -1832,8 +1822,8 @@ xfs_rtmodify_summary(
1832 */ 1822 */
1833 sp = XFS_SUMPTR(mp, bp, so); 1823 sp = XFS_SUMPTR(mp, bp, so);
1834 *sp += delta; 1824 *sp += delta;
1835 xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)), 1825 xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
1836 (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1)); 1826 (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
1837 return 0; 1827 return 0;
1838} 1828}
1839 1829
@@ -1970,7 +1960,7 @@ xfs_growfs_rt(
1970 * Lock out other callers by grabbing the bitmap inode lock. 1960 * Lock out other callers by grabbing the bitmap inode lock.
1971 */ 1961 */
1972 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); 1962 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
1973 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); 1963 xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
1974 /* 1964 /*
1975 * Update the bitmap inode's size. 1965 * Update the bitmap inode's size.
1976 */ 1966 */
@@ -1982,7 +1972,7 @@ xfs_growfs_rt(
1982 * Get the summary inode into the transaction. 1972 * Get the summary inode into the transaction.
1983 */ 1973 */
1984 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); 1974 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
1985 xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); 1975 xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
1986 /* 1976 /*
1987 * Update the summary inode's size. 1977 * Update the summary inode's size.
1988 */ 1978 */
@@ -2153,7 +2143,7 @@ xfs_rtfree_extent(
2153 * Synchronize by locking the bitmap inode. 2143 * Synchronize by locking the bitmap inode.
2154 */ 2144 */
2155 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); 2145 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2156 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); 2146 xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2157 2147
2158#if defined(__KERNEL__) && defined(DEBUG) 2148#if defined(__KERNEL__) && defined(DEBUG)
2159 /* 2149 /*
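
The xfs_rtalloc.c conversion above shows the central API split of this series: the multiplexed xfs_bmapi() is replaced by xfs_bmapi_read() for pure mapping lookups and xfs_bmapi_write() for allocating calls, and the XFS_BMAPI_WRITE flag goes away. A minimal before/after sketch using only the signatures visible in this diff; offset_fsb, count_fsb and the surrounding variables are illustrative, and the calls are kernel-internal (in-tree only):

	/* Before: one entry point, direction selected by XFS_BMAPI_WRITE. */
	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
			  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
			  &firstblock, resblks, &map, &nmap, &flist);

	/* After: read-only lookups need neither a transaction nor a free list... */
	error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &map, &nmap, 0);

	/* ...while allocating calls keep the transactional arguments. */
	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
				XFS_BMAPI_METADATA, &firstblock, resblks,
				&map, &nmap, &flist);
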
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 09e1f4f35e97..f7f3a359c1c5 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -47,7 +47,7 @@ struct xfs_trans;
47#define XFS_SUMOFFSTOBLOCK(mp,s) \ 47#define XFS_SUMOFFSTOBLOCK(mp,s) \
48 (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) 48 (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
49#define XFS_SUMPTR(mp,bp,so) \ 49#define XFS_SUMPTR(mp,bp,so) \
50 ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \ 50 ((xfs_suminfo_t *)((bp)->b_addr + \
51 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) 51 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
52 52
53#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) 53#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index d6d6fdfe9422..597d044a09a1 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -92,24 +92,6 @@ xfs_do_force_shutdown(
92} 92}
93 93
94/* 94/*
95 * Prints out an ALERT message about I/O error.
96 */
97void
98xfs_ioerror_alert(
99 char *func,
100 struct xfs_mount *mp,
101 xfs_buf_t *bp,
102 xfs_daddr_t blkno)
103{
104 xfs_alert(mp,
105 "I/O error occurred: meta-data dev %s block 0x%llx"
106 " (\"%s\") error %d buf count %zd",
107 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
108 (__uint64_t)blkno, func,
109 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
110}
111
112/*
113 * This isn't an absolute requirement, but it is 95 * This isn't an absolute requirement, but it is
114 * just a good idea to call xfs_read_buf instead of 96 * just a good idea to call xfs_read_buf instead of
115 * directly doing a read_buf call. For one, we shouldn't 97 * directly doing a read_buf call. For one, we shouldn't
@@ -137,20 +119,19 @@ xfs_read_buf(
137 bp = xfs_buf_read(target, blkno, len, flags); 119 bp = xfs_buf_read(target, blkno, len, flags);
138 if (!bp) 120 if (!bp)
139 return XFS_ERROR(EIO); 121 return XFS_ERROR(EIO);
140 error = XFS_BUF_GETERROR(bp); 122 error = bp->b_error;
141 if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { 123 if (!error && !XFS_FORCED_SHUTDOWN(mp)) {
142 *bpp = bp; 124 *bpp = bp;
143 } else { 125 } else {
144 *bpp = NULL; 126 *bpp = NULL;
145 if (error) { 127 if (error) {
146 xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); 128 xfs_buf_ioerror_alert(bp, __func__);
147 } else { 129 } else {
148 error = XFS_ERROR(EIO); 130 error = XFS_ERROR(EIO);
149 } 131 }
150 if (bp) { 132 if (bp) {
151 XFS_BUF_UNDONE(bp); 133 XFS_BUF_UNDONE(bp);
152 XFS_BUF_UNDELAYWRITE(bp); 134 xfs_buf_stale(bp);
153 XFS_BUF_STALE(bp);
154 /* 135 /*
155 * brelse clears B_ERROR and b_error 136 * brelse clears B_ERROR and b_error
156 */ 137 */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 11c41ec6ed75..bbdb9ad6a4ba 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
42extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, 42extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
43 xfs_daddr_t blkno, int len, uint flags, 43 xfs_daddr_t blkno, int len, uint flags,
44 struct xfs_buf **bpp); 44 struct xfs_buf **bpp);
45extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
46 xfs_buf_t *bp, xfs_daddr_t blkno);
47extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); 45extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
48 46
49#endif /* __XFS_RW_H__ */ 47#endif /* __XFS_RW_H__ */
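
With xfs_ioerror_alert() removed from xfs_rw.c and its prototype dropped here, buffer I/O errors are now reported through xfs_buf_ioerror_alert(bp, __func__), evidently deriving the device, block number and error code from the buffer itself rather than taking them as parameters. The converted call sites in this diff all follow the same shape (sketch of the pattern, kernel-internal API):

	if (bp->b_error) {		/* bp->b_error replaces XFS_BUF_GETERROR(bp) */
		error = bp->b_error;
		xfs_buf_ioerror_alert(bp, __func__);
		xfs_buf_relse(bp);
		return error;
	}
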
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1eb2ba586814..cb6ae715814a 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -509,7 +509,7 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
509 509
510#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ 510#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
511#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) 511#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
512#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)XFS_BUF_PTR(bp)) 512#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
513 513
514#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) 514#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
515#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ 515#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/xfs_stats.c
index 76fdc5861932..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/xfs_stats.h
index 736854b1ca1a..736854b1ca1a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/xfs_super.c
index 9a72dda58bd0..3eca58f51ae9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -356,6 +356,8 @@ xfs_parseargs(
356 mp->m_flags |= XFS_MOUNT_DELAYLOG; 356 mp->m_flags |= XFS_MOUNT_DELAYLOG;
357 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 357 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
358 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 358 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
359 xfs_warn(mp,
360 "nodelaylog is deprecated and will be removed in Linux 3.3");
359 } else if (!strcmp(this_char, MNTOPT_DISCARD)) { 361 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
360 mp->m_flags |= XFS_MOUNT_DISCARD; 362 mp->m_flags |= XFS_MOUNT_DISCARD;
361 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { 363 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
@@ -794,8 +796,6 @@ xfs_fs_destroy_inode(
794 if (is_bad_inode(inode)) 796 if (is_bad_inode(inode))
795 goto out_reclaim; 797 goto out_reclaim;
796 798
797 xfs_ioend_wait(ip);
798
799 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); 799 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
800 800
801 /* 801 /*
@@ -835,7 +835,6 @@ xfs_fs_inode_init_once(
835 inode_init_once(VFS_I(ip)); 835 inode_init_once(VFS_I(ip));
836 836
837 /* xfs inode */ 837 /* xfs inode */
838 atomic_set(&ip->i_iocount, 0);
839 atomic_set(&ip->i_pincount, 0); 838 atomic_set(&ip->i_pincount, 0);
840 spin_lock_init(&ip->i_flags_lock); 839 spin_lock_init(&ip->i_flags_lock);
841 init_waitqueue_head(&ip->i_ipin_wait); 840 init_waitqueue_head(&ip->i_ipin_wait);
@@ -877,33 +876,17 @@ xfs_log_inode(
877 struct xfs_trans *tp; 876 struct xfs_trans *tp;
878 int error; 877 int error;
879 878
880 xfs_iunlock(ip, XFS_ILOCK_SHARED);
881 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 879 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
882 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); 880 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
883
884 if (error) { 881 if (error) {
885 xfs_trans_cancel(tp, 0); 882 xfs_trans_cancel(tp, 0);
886 /* we need to return with the lock hold shared */
887 xfs_ilock(ip, XFS_ILOCK_SHARED);
888 return error; 883 return error;
889 } 884 }
890 885
891 xfs_ilock(ip, XFS_ILOCK_EXCL); 886 xfs_ilock(ip, XFS_ILOCK_EXCL);
892 887 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
893 /*
894 * Note - it's possible that we might have pushed ourselves out of the
895 * way during trans_reserve which would flush the inode. But there's
896 * no guarantee that the inode buffer has actually gone out yet (it's
897 * delwri). Plus the buffer could be pinned anyway if it's part of
898 * an inode in another recent transaction. So we play it safe and
899 * fire off the transaction anyway.
900 */
901 xfs_trans_ijoin(tp, ip);
902 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 888 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
903 error = xfs_trans_commit(tp, 0); 889 return xfs_trans_commit(tp, 0);
904 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
905
906 return error;
907} 890}
908 891
909STATIC int 892STATIC int
@@ -918,7 +901,9 @@ xfs_fs_write_inode(
918 trace_xfs_write_inode(ip); 901 trace_xfs_write_inode(ip);
919 902
920 if (XFS_FORCED_SHUTDOWN(mp)) 903 if (XFS_FORCED_SHUTDOWN(mp))
921 return XFS_ERROR(EIO); 904 return -XFS_ERROR(EIO);
905 if (!ip->i_update_core)
906 return 0;
922 907
923 if (wbc->sync_mode == WB_SYNC_ALL) { 908 if (wbc->sync_mode == WB_SYNC_ALL) {
924 /* 909 /*
@@ -926,15 +911,12 @@ xfs_fs_write_inode(
926 * of forcing it all the way to stable storage using a 911 * of forcing it all the way to stable storage using a
927 * synchronous transaction we let the log force inside the 912 * synchronous transaction we let the log force inside the
928 * ->sync_fs call do that for us, which reduces the number 913 * ->sync_fs call do that for us, which reduces the number
929 * of synchronous log foces dramatically. 914 * of synchronous log forces dramatically.
930 */ 915 */
931 xfs_ioend_wait(ip); 916 error = xfs_log_inode(ip);
932 xfs_ilock(ip, XFS_ILOCK_SHARED); 917 if (error)
933 if (ip->i_update_core) { 918 goto out;
934 error = xfs_log_inode(ip); 919 return 0;
935 if (error)
936 goto out_unlock;
937 }
938 } else { 920 } else {
939 /* 921 /*
940 * We make this non-blocking if the inode is contended, return 922 * We make this non-blocking if the inode is contended, return
@@ -1033,7 +1015,7 @@ xfs_fs_put_super(
1033 */ 1015 */
1034 xfs_filestream_unmount(mp); 1016 xfs_filestream_unmount(mp);
1035 1017
1036 XFS_bflush(mp->m_ddev_targp); 1018 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1037 1019
1038 xfs_unmountfs(mp); 1020 xfs_unmountfs(mp);
1039 xfs_freesb(mp); 1021 xfs_freesb(mp);
@@ -1457,7 +1439,7 @@ xfs_fs_fill_super(
1457 */ 1439 */
1458 xfs_filestream_unmount(mp); 1440 xfs_filestream_unmount(mp);
1459 1441
1460 XFS_bflush(mp->m_ddev_targp); 1442 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1461 1443
1462 xfs_unmountfs(mp); 1444 xfs_unmountfs(mp);
1463 goto out_free_sb; 1445 goto out_free_sb;
@@ -1666,24 +1648,13 @@ xfs_init_workqueues(void)
1666 */ 1648 */
1667 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); 1649 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
1668 if (!xfs_syncd_wq) 1650 if (!xfs_syncd_wq)
1669 goto out; 1651 return -ENOMEM;
1670
1671 xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
1672 if (!xfs_ail_wq)
1673 goto out_destroy_syncd;
1674
1675 return 0; 1652 return 0;
1676
1677out_destroy_syncd:
1678 destroy_workqueue(xfs_syncd_wq);
1679out:
1680 return -ENOMEM;
1681} 1653}
1682 1654
1683STATIC void 1655STATIC void
1684xfs_destroy_workqueues(void) 1656xfs_destroy_workqueues(void)
1685{ 1657{
1686 destroy_workqueue(xfs_ail_wq);
1687 destroy_workqueue(xfs_syncd_wq); 1658 destroy_workqueue(xfs_syncd_wq);
1688} 1659}
1689 1660
@@ -1695,7 +1666,6 @@ init_xfs_fs(void)
1695 printk(KERN_INFO XFS_VERSION_STRING " with " 1666 printk(KERN_INFO XFS_VERSION_STRING " with "
1696 XFS_BUILD_OPTIONS " enabled\n"); 1667 XFS_BUILD_OPTIONS " enabled\n");
1697 1668
1698 xfs_ioend_init();
1699 xfs_dir_startup(); 1669 xfs_dir_startup();
1700 1670
1701 error = xfs_init_zones(); 1671 error = xfs_init_zones();
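
For reference, the rewritten xfs_log_inode() above now reduces to a plain reserve/lock/join/log/commit sequence; the unlock/relock dance around xfs_trans_reserve() and the trailing xfs_ilock_demote() are gone because the caller no longer holds the ILOCK shared across the call:

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);	/* commit drops the ILOCK */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	return xfs_trans_commit(tp, 0);
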
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/xfs_super.h
index 50a3266c999e..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/xfs_super.h
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/xfs_sync.c
index e4c938afb910..aa3dc1a4d53d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -227,21 +227,17 @@ xfs_sync_inode_data(
227 int error = 0; 227 int error = 0;
228 228
229 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 229 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
230 goto out_wait; 230 return 0;
231 231
232 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { 232 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
233 if (flags & SYNC_TRYLOCK) 233 if (flags & SYNC_TRYLOCK)
234 goto out_wait; 234 return 0;
235 xfs_ilock(ip, XFS_IOLOCK_SHARED); 235 xfs_ilock(ip, XFS_IOLOCK_SHARED);
236 } 236 }
237 237
238 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 238 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
239 0 : XBF_ASYNC, FI_NONE); 239 0 : XBF_ASYNC, FI_NONE);
240 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 240 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
241
242 out_wait:
243 if (flags & SYNC_WAIT)
244 xfs_ioend_wait(ip);
245 return error; 241 return error;
246} 242}
247 243
@@ -322,6 +318,7 @@ xfs_sync_fsdata(
322 struct xfs_mount *mp) 318 struct xfs_mount *mp)
323{ 319{
324 struct xfs_buf *bp; 320 struct xfs_buf *bp;
321 int error;
325 322
326 /* 323 /*
327 * If the buffer is pinned then push on the log so we won't get stuck 324 * If the buffer is pinned then push on the log so we won't get stuck
@@ -332,10 +329,11 @@ xfs_sync_fsdata(
332 * between there and here. 329 * between there and here.
333 */ 330 */
334 bp = xfs_getsb(mp, 0); 331 bp = xfs_getsb(mp, 0);
335 if (XFS_BUF_ISPINNED(bp)) 332 if (xfs_buf_ispinned(bp))
336 xfs_log_force(mp, 0); 333 xfs_log_force(mp, 0);
337 334 error = xfs_bwrite(bp);
338 return xfs_bwrite(mp, bp); 335 xfs_buf_relse(bp);
336 return error;
339} 337}
340 338
341/* 339/*
@@ -379,7 +377,7 @@ xfs_quiesce_data(
379 377
380 /* flush data-only devices */ 378 /* flush data-only devices */
381 if (mp->m_rtdev_targp) 379 if (mp->m_rtdev_targp)
382 XFS_bflush(mp->m_rtdev_targp); 380 xfs_flush_buftarg(mp->m_rtdev_targp, 1);
383 381
384 return error ? error : error2; 382 return error ? error : error2;
385} 383}
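
xfs_sync_fsdata() above also adopts the new xfs_bwrite() convention: the call loses its mount argument and, as the added xfs_buf_relse() implies, no longer consumes the buffer reference, so callers release it explicitly:

	error = xfs_bwrite(bp);		/* was: return xfs_bwrite(mp, bp); */
	xfs_buf_relse(bp);
	return error;
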
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e7ac6e..941202e7ac6e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..b9937d450f8e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/xfs_trace.c
index 88d25d4aa56e..9010ce885e6a 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -43,8 +43,8 @@
43#include "xfs_quota.h" 43#include "xfs_quota.h"
44#include "xfs_iomap.h" 44#include "xfs_iomap.h"
45#include "xfs_aops.h" 45#include "xfs_aops.h"
46#include "quota/xfs_dquot_item.h" 46#include "xfs_dquot_item.h"
47#include "quota/xfs_dquot.h" 47#include "xfs_dquot.h"
48#include "xfs_log_recover.h" 48#include "xfs_log_recover.h"
49#include "xfs_inode_item.h" 49#include "xfs_inode_item.h"
50 50
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/xfs_trace.h
index 690fc7a7bd72..f1d2802b2f07 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -30,6 +30,7 @@ struct xfs_buf_log_item;
30struct xfs_da_args; 30struct xfs_da_args;
31struct xfs_da_node_entry; 31struct xfs_da_node_entry;
32struct xfs_dquot; 32struct xfs_dquot;
33struct xfs_log_item;
33struct xlog_ticket; 34struct xlog_ticket;
34struct log; 35struct log;
35struct xlog_recover; 36struct xlog_recover;
@@ -320,7 +321,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele);
320DEFINE_BUF_EVENT(xfs_buf_iodone); 321DEFINE_BUF_EVENT(xfs_buf_iodone);
321DEFINE_BUF_EVENT(xfs_buf_iorequest); 322DEFINE_BUF_EVENT(xfs_buf_iorequest);
322DEFINE_BUF_EVENT(xfs_buf_bawrite); 323DEFINE_BUF_EVENT(xfs_buf_bawrite);
323DEFINE_BUF_EVENT(xfs_buf_bdwrite);
324DEFINE_BUF_EVENT(xfs_buf_lock); 324DEFINE_BUF_EVENT(xfs_buf_lock);
325DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
326DEFINE_BUF_EVENT(xfs_buf_trylock); 326DEFINE_BUF_EVENT(xfs_buf_trylock);
@@ -577,6 +577,7 @@ DEFINE_INODE_EVENT(xfs_vm_bmap);
577DEFINE_INODE_EVENT(xfs_file_ioctl); 577DEFINE_INODE_EVENT(xfs_file_ioctl);
578DEFINE_INODE_EVENT(xfs_file_compat_ioctl); 578DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
579DEFINE_INODE_EVENT(xfs_ioctl_setattr); 579DEFINE_INODE_EVENT(xfs_ioctl_setattr);
580DEFINE_INODE_EVENT(xfs_dir_fsync);
580DEFINE_INODE_EVENT(xfs_file_fsync); 581DEFINE_INODE_EVENT(xfs_file_fsync);
581DEFINE_INODE_EVENT(xfs_destroy_inode); 582DEFINE_INODE_EVENT(xfs_destroy_inode);
582DEFINE_INODE_EVENT(xfs_write_inode); 583DEFINE_INODE_EVENT(xfs_write_inode);
@@ -853,6 +854,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
853DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); 854DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
854DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); 855DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
855 856
857DECLARE_EVENT_CLASS(xfs_log_item_class,
858 TP_PROTO(struct xfs_log_item *lip),
859 TP_ARGS(lip),
860 TP_STRUCT__entry(
861 __field(dev_t, dev)
862 __field(void *, lip)
863 __field(uint, type)
864 __field(uint, flags)
865 __field(xfs_lsn_t, lsn)
866 ),
867 TP_fast_assign(
868 __entry->dev = lip->li_mountp->m_super->s_dev;
869 __entry->lip = lip;
870 __entry->type = lip->li_type;
871 __entry->flags = lip->li_flags;
872 __entry->lsn = lip->li_lsn;
873 ),
874 TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s",
875 MAJOR(__entry->dev), MINOR(__entry->dev),
876 __entry->lip,
877 CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn),
878 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
879 __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
880)
881
882#define DEFINE_LOG_ITEM_EVENT(name) \
883DEFINE_EVENT(xfs_log_item_class, name, \
884 TP_PROTO(struct xfs_log_item *lip), \
885 TP_ARGS(lip))
886DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
887DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf);
888DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned);
889DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
890DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
891
892
856DECLARE_EVENT_CLASS(xfs_file_class, 893DECLARE_EVENT_CLASS(xfs_file_class,
857 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), 894 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
858 TP_ARGS(ip, count, offset, flags), 895 TP_ARGS(ip, count, offset, flags),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index efc147f0e9b6..1f35b2feca97 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1790,9 +1790,7 @@ xfs_trans_commit_cil(
1790} 1790}
1791 1791
1792/* 1792/*
1793 * xfs_trans_commit 1793 * Commit the given transaction to the log.
1794 *
1795 * Commit the given transaction to the log a/synchronously.
1796 * 1794 *
1797 * XFS disk error handling mechanism is not based on a typical 1795 * XFS disk error handling mechanism is not based on a typical
1798 * transaction abort mechanism. Logically after the filesystem 1796 * transaction abort mechanism. Logically after the filesystem
@@ -1804,10 +1802,9 @@ xfs_trans_commit_cil(
1804 * Do not reference the transaction structure after this call. 1802 * Do not reference the transaction structure after this call.
1805 */ 1803 */
1806int 1804int
1807_xfs_trans_commit( 1805xfs_trans_commit(
1808 struct xfs_trans *tp, 1806 struct xfs_trans *tp,
1809 uint flags, 1807 uint flags)
1810 int *log_flushed)
1811{ 1808{
1812 struct xfs_mount *mp = tp->t_mountp; 1809 struct xfs_mount *mp = tp->t_mountp;
1813 xfs_lsn_t commit_lsn = -1; 1810 xfs_lsn_t commit_lsn = -1;
@@ -1866,7 +1863,7 @@ _xfs_trans_commit(
1866 if (sync) { 1863 if (sync) {
1867 if (!error) { 1864 if (!error) {
1868 error = _xfs_log_force_lsn(mp, commit_lsn, 1865 error = _xfs_log_force_lsn(mp, commit_lsn,
1869 XFS_LOG_SYNC, log_flushed); 1866 XFS_LOG_SYNC, NULL);
1870 } 1867 }
1871 XFS_STATS_INC(xs_trans_sync); 1868 XFS_STATS_INC(xs_trans_sync);
1872 } else { 1869 } else {
@@ -2021,6 +2018,6 @@ xfs_trans_roll(
2021 if (error) 2018 if (error)
2022 return error; 2019 return error;
2023 2020
2024 xfs_trans_ijoin(trans, dp); 2021 xfs_trans_ijoin(trans, dp, 0);
2025 return 0; 2022 return 0;
2026} 2023}
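
The xfs_trans.c hunk above and the matching header hunk below retire the _xfs_trans_commit()/log_flushed indirection: no remaining caller consumed the log-flush result, so the synchronous path simply passes NULL to _xfs_log_force_lsn() and xfs_trans_commit() becomes a real two-argument function. Condensed from the prototypes in this diff:

	/* Before: a macro over a three-argument helper. */
	int	_xfs_trans_commit(xfs_trans_t *tp, uint flags, int *log_flushed);
	#define	xfs_trans_commit(tp, flags)	_xfs_trans_commit(tp, flags, NULL)

	/* After: one entry point. */
	int	xfs_trans_commit(xfs_trans_t *tp, uint flags);
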
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 06a9759b6352..603f3eb52041 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -350,7 +350,7 @@ typedef struct xfs_item_ops {
350 void (*iop_unlock)(xfs_log_item_t *); 350 void (*iop_unlock)(xfs_log_item_t *);
351 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); 351 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
352 void (*iop_push)(xfs_log_item_t *); 352 void (*iop_push)(xfs_log_item_t *);
353 void (*iop_pushbuf)(xfs_log_item_t *); 353 bool (*iop_pushbuf)(xfs_log_item_t *);
354 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 354 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
355} xfs_item_ops_t; 355} xfs_item_ops_t;
356 356
@@ -470,8 +470,7 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
473void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 473void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
474void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
475void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 474void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
476void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); 475void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
477struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); 476struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -487,10 +486,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *,
487 struct xfs_efd_log_item *, 486 struct xfs_efd_log_item *,
488 xfs_fsblock_t, 487 xfs_fsblock_t,
489 xfs_extlen_t); 488 xfs_extlen_t);
490int _xfs_trans_commit(xfs_trans_t *, 489int xfs_trans_commit(xfs_trans_t *, uint flags);
491 uint flags,
492 int *);
493#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
494void xfs_trans_cancel(xfs_trans_t *, int); 490void xfs_trans_cancel(xfs_trans_t *, int);
495int xfs_trans_ail_init(struct xfs_mount *); 491int xfs_trans_ail_init(struct xfs_mount *);
496void xfs_trans_ail_destroy(struct xfs_mount *); 492void xfs_trans_ail_destroy(struct xfs_mount *);
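
The iop_pushbuf() signature change above gives the AIL pusher a result to act on: returning false means the backing buffer could not be pushed because it is pinned, which the xfs_trans_ail.c hunk below translates into a stuck count and a deferred log force instead of advancing the push cursor:

	case XFS_ITEM_PUSHBUF:
		XFS_STATS_INC(xs_push_ail_pushbuf);
		trace_xfs_ail_pushbuf(lip);

		if (!IOP_PUSHBUF(lip)) {
			trace_xfs_ail_pushbuf_pinned(lip);
			stuck++;
			ailp->xa_log_flush++;	/* force the log on the next pass */
		} else {
			ailp->xa_last_pushed_lsn = lsn;
		}
		push_xfsbufd = 1;
		break;
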
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 43233e92f0f6..ed9252bcdac9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -26,10 +26,9 @@
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_trace.h"
29#include "xfs_error.h" 30#include "xfs_error.h"
30 31
31struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
32
33#ifdef DEBUG 32#ifdef DEBUG
34/* 33/*
35 * Check that the list is sorted as it should be. 34 * Check that the list is sorted as it should be.
@@ -299,7 +298,7 @@ xfs_trans_ail_cursor_last(
299 * Splice the log item list into the AIL at the given LSN. We splice to the 298 * Splice the log item list into the AIL at the given LSN. We splice to the
300 * tail of the given LSN to maintain insert order for push traversals. The 299 * tail of the given LSN to maintain insert order for push traversals. The
301 * cursor is optional, allowing repeated updates to the same LSN to avoid 300 * cursor is optional, allowing repeated updates to the same LSN to avoid
302 * repeated traversals. 301 * repeated traversals. This should not be called with an empty list.
303 */ 302 */
304static void 303static void
305xfs_ail_splice( 304xfs_ail_splice(
@@ -308,50 +307,39 @@ xfs_ail_splice(
308 struct list_head *list, 307 struct list_head *list,
309 xfs_lsn_t lsn) 308 xfs_lsn_t lsn)
310{ 309{
311 struct xfs_log_item *lip = cur ? cur->item : NULL; 310 struct xfs_log_item *lip;
312 struct xfs_log_item *next_lip; 311
312 ASSERT(!list_empty(list));
313 313
314 /* 314 /*
315 * Get a new cursor if we don't have a placeholder or the existing one 315 * Use the cursor to determine the insertion point if one is
316 * has been invalidated. 316 * provided. If not, or if the one we got is not valid,
317 * find the place in the AIL where the items belong.
317 */ 318 */
318 if (!lip || (__psint_t)lip & 1) { 319 lip = cur ? cur->item : NULL;
320 if (!lip || (__psint_t) lip & 1)
319 lip = __xfs_trans_ail_cursor_last(ailp, lsn); 321 lip = __xfs_trans_ail_cursor_last(ailp, lsn);
320 322
321 if (!lip) { 323 /*
322 /* The list is empty, so just splice and return. */ 324 * If a cursor is provided, we know we're processing the AIL
323 if (cur) 325 * in lsn order, and future items to be spliced in will
324 cur->item = NULL; 326 * follow the last one being inserted now. Update the
325 list_splice(list, &ailp->xa_ail); 327 * cursor to point to that last item, now while we have a
326 return; 328 * reliable pointer to it.
327 } 329 */
328 } 330 if (cur)
331 cur->item = list_entry(list->prev, struct xfs_log_item, li_ail);
329 332
330 /* 333 /*
331 * Our cursor points to the item we want to insert _after_, so we have 334 * Finally perform the splice. Unless the AIL was empty,
332 * to update the cursor to point to the end of the list we are splicing 335 * lip points to the item in the AIL _after_ which the new
333 * in so that it points to the correct location for the next splice. 336 * items should go. If lip is null the AIL was empty, so
334 * i.e. before the splice 337 * the new items go at the head of the AIL.
335 *
336 * lsn -> lsn -> lsn + x -> lsn + x ...
337 * ^
338 * | cursor points here
339 *
340 * After the splice we have:
341 *
342 * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ...
343 * ^ ^
344 * | cursor points here | needs to move here
345 *
346 * So we set the cursor to the last item in the list to be spliced
347 * before we execute the splice, resulting in the cursor pointing to
348 * the correct item after the splice occurs.
349 */ 338 */
350 if (cur) { 339 if (lip)
351 next_lip = list_entry(list->prev, struct xfs_log_item, li_ail); 340 list_splice(list, &lip->li_ail);
352 cur->item = next_lip; 341 else
353 } 342 list_splice(list, &ailp->xa_ail);
354 list_splice(list, &lip->li_ail);
355} 343}
356 344
357/* 345/*
@@ -367,28 +355,34 @@ xfs_ail_delete(
367 xfs_trans_ail_cursor_clear(ailp, lip); 355 xfs_trans_ail_cursor_clear(ailp, lip);
368} 356}
369 357
370/* 358static long
371 * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself 359xfsaild_push(
372 * to run at a later time if there is more work to do to complete the push. 360 struct xfs_ail *ailp)
373 */
374STATIC void
375xfs_ail_worker(
376 struct work_struct *work)
377{ 361{
378 struct xfs_ail *ailp = container_of(to_delayed_work(work),
379 struct xfs_ail, xa_work);
380 xfs_mount_t *mp = ailp->xa_mount; 362 xfs_mount_t *mp = ailp->xa_mount;
381 struct xfs_ail_cursor cur; 363 struct xfs_ail_cursor cur;
382 xfs_log_item_t *lip; 364 xfs_log_item_t *lip;
383 xfs_lsn_t lsn; 365 xfs_lsn_t lsn;
384 xfs_lsn_t target; 366 xfs_lsn_t target;
385 long tout = 10; 367 long tout = 10;
386 int flush_log = 0;
387 int stuck = 0; 368 int stuck = 0;
388 int count = 0; 369 int count = 0;
389 int push_xfsbufd = 0; 370 int push_xfsbufd = 0;
390 371
372 /*
373 * If last time we ran we encountered pinned items, force the log first
374 * and wait for it before pushing again.
375 */
391 spin_lock(&ailp->xa_lock); 376 spin_lock(&ailp->xa_lock);
377 if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush &&
378 !list_empty(&ailp->xa_ail)) {
379 ailp->xa_log_flush = 0;
380 spin_unlock(&ailp->xa_lock);
381 XFS_STATS_INC(xs_push_ail_flush);
382 xfs_log_force(mp, XFS_LOG_SYNC);
383 spin_lock(&ailp->xa_lock);
384 }
385
392 target = ailp->xa_target; 386 target = ailp->xa_target;
393 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); 387 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
394 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 388 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
@@ -432,26 +426,37 @@ xfs_ail_worker(
432 switch (lock_result) { 426 switch (lock_result) {
433 case XFS_ITEM_SUCCESS: 427 case XFS_ITEM_SUCCESS:
434 XFS_STATS_INC(xs_push_ail_success); 428 XFS_STATS_INC(xs_push_ail_success);
429 trace_xfs_ail_push(lip);
430
435 IOP_PUSH(lip); 431 IOP_PUSH(lip);
436 ailp->xa_last_pushed_lsn = lsn; 432 ailp->xa_last_pushed_lsn = lsn;
437 break; 433 break;
438 434
439 case XFS_ITEM_PUSHBUF: 435 case XFS_ITEM_PUSHBUF:
440 XFS_STATS_INC(xs_push_ail_pushbuf); 436 XFS_STATS_INC(xs_push_ail_pushbuf);
441 IOP_PUSHBUF(lip); 437 trace_xfs_ail_pushbuf(lip);
442 ailp->xa_last_pushed_lsn = lsn; 438
439 if (!IOP_PUSHBUF(lip)) {
440 trace_xfs_ail_pushbuf_pinned(lip);
441 stuck++;
442 ailp->xa_log_flush++;
443 } else {
444 ailp->xa_last_pushed_lsn = lsn;
445 }
443 push_xfsbufd = 1; 446 push_xfsbufd = 1;
444 break; 447 break;
445 448
446 case XFS_ITEM_PINNED: 449 case XFS_ITEM_PINNED:
447 XFS_STATS_INC(xs_push_ail_pinned); 450 XFS_STATS_INC(xs_push_ail_pinned);
451 trace_xfs_ail_pinned(lip);
452
448 stuck++; 453 stuck++;
449 flush_log = 1; 454 ailp->xa_log_flush++;
450 break; 455 break;
451 456
452 case XFS_ITEM_LOCKED: 457 case XFS_ITEM_LOCKED:
453 XFS_STATS_INC(xs_push_ail_locked); 458 XFS_STATS_INC(xs_push_ail_locked);
454 ailp->xa_last_pushed_lsn = lsn; 459 trace_xfs_ail_locked(lip);
455 stuck++; 460 stuck++;
456 break; 461 break;
457 462
@@ -491,16 +496,6 @@ xfs_ail_worker(
491 xfs_trans_ail_cursor_done(ailp, &cur); 496 xfs_trans_ail_cursor_done(ailp, &cur);
492 spin_unlock(&ailp->xa_lock); 497 spin_unlock(&ailp->xa_lock);
493 498
494 if (flush_log) {
495 /*
496 * If something we need to push out was pinned, then
497 * push out the log so it will become unpinned and
498 * move forward in the AIL.
499 */
500 XFS_STATS_INC(xs_push_ail_flush);
501 xfs_log_force(mp, 0);
502 }
503
504 if (push_xfsbufd) { 499 if (push_xfsbufd) {
505 /* we've got delayed write buffers to flush */ 500 /* we've got delayed write buffers to flush */
506 wake_up_process(mp->m_ddev_targp->bt_task); 501 wake_up_process(mp->m_ddev_targp->bt_task);
@@ -511,20 +506,7 @@ out_done:
511 if (!count) { 506 if (!count) {
512 /* We're past our target or empty, so idle */ 507 /* We're past our target or empty, so idle */
513 ailp->xa_last_pushed_lsn = 0; 508 ailp->xa_last_pushed_lsn = 0;
514 509 ailp->xa_log_flush = 0;
515 /*
516 * We clear the XFS_AIL_PUSHING_BIT first before checking
517 * whether the target has changed. If the target has changed,
518 * this pushes the requeue race directly onto the result of the
519 * atomic test/set bit, so we are guaranteed that either the
520 * the pusher that changed the target or ourselves will requeue
521 * the work (but not both).
522 */
523 clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
524 smp_rmb();
525 if (XFS_LSN_CMP(ailp->xa_target, target) == 0 ||
526 test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
527 return;
528 510
529 tout = 50; 511 tout = 50;
530 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 512 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
@@ -543,14 +525,39 @@ out_done:
543 * were stuck. 525 * were stuck.
544 * 526 *
545 * Backoff a bit more to allow some I/O to complete before 527 * Backoff a bit more to allow some I/O to complete before
546 * continuing from where we were. 528 * restarting from the start of the AIL. This prevents us
529 * from spinning on the same items, and if they are pinned will
530 * allow the restart to issue a log force to unpin the stuck
531 * items.
547 */ 532 */
548 tout = 20; 533 tout = 20;
534 ailp->xa_last_pushed_lsn = 0;
535 }
536
537 return tout;
538}
539
540static int
541xfsaild(
542 void *data)
543{
544 struct xfs_ail *ailp = data;
545 long tout = 0; /* milliseconds */
546
547 while (!kthread_should_stop()) {
548 if (tout && tout <= 20)
549 __set_current_state(TASK_KILLABLE);
550 else
551 __set_current_state(TASK_INTERRUPTIBLE);
552 schedule_timeout(tout ?
553 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
554
555 try_to_freeze();
556
557 tout = xfsaild_push(ailp);
549 } 558 }
550 559
551 /* There is more to do, requeue us. */ 560 return 0;
552 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
553 msecs_to_jiffies(tout));
554} 561}
555 562
556/* 563/*
@@ -585,8 +592,9 @@ xfs_ail_push(
585 */ 592 */
586 smp_wmb(); 593 smp_wmb();
587 xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); 594 xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
588 if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) 595 smp_wmb();
589 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); 596
597 wake_up_process(ailp->xa_task);
590} 598}
591 599
592/* 600/*
@@ -682,6 +690,7 @@ xfs_trans_ail_update_bulk(
682 int i; 690 int i;
683 LIST_HEAD(tmp); 691 LIST_HEAD(tmp);
684 692
693 ASSERT(nr_items > 0); /* Not required, but true. */
685 mlip = xfs_ail_min(ailp); 694 mlip = xfs_ail_min(ailp);
686 695
687 for (i = 0; i < nr_items; i++) { 696 for (i = 0; i < nr_items; i++) {
@@ -701,7 +710,8 @@ xfs_trans_ail_update_bulk(
701 list_add(&lip->li_ail, &tmp); 710 list_add(&lip->li_ail, &tmp);
702 } 711 }
703 712
704 xfs_ail_splice(ailp, cur, &tmp, lsn); 713 if (!list_empty(&tmp))
714 xfs_ail_splice(ailp, cur, &tmp, lsn);
705 715
706 if (!mlip_changed) { 716 if (!mlip_changed) {
707 spin_unlock(&ailp->xa_lock); 717 spin_unlock(&ailp->xa_lock);
@@ -822,9 +832,18 @@ xfs_trans_ail_init(
822 INIT_LIST_HEAD(&ailp->xa_ail); 832 INIT_LIST_HEAD(&ailp->xa_ail);
823 INIT_LIST_HEAD(&ailp->xa_cursors); 833 INIT_LIST_HEAD(&ailp->xa_cursors);
824 spin_lock_init(&ailp->xa_lock); 834 spin_lock_init(&ailp->xa_lock);
825 INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); 835
836 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
837 ailp->xa_mount->m_fsname);
838 if (IS_ERR(ailp->xa_task))
839 goto out_free_ailp;
840
826 mp->m_ail = ailp; 841 mp->m_ail = ailp;
827 return 0; 842 return 0;
843
844out_free_ailp:
845 kmem_free(ailp);
846 return ENOMEM;
828} 847}
829 848
830void 849void
@@ -833,6 +852,6 @@ xfs_trans_ail_destroy(
833{ 852{
834 struct xfs_ail *ailp = mp->m_ail; 853 struct xfs_ail *ailp = mp->m_ail;
835 854
836 cancel_delayed_work_sync(&ailp->xa_work); 855 kthread_stop(ailp->xa_task);
837 kmem_free(ailp); 856 kmem_free(ailp);
838} 857}
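
The rewrite above reverts AIL pushing from a self-requeueing delayed work item to a dedicated per-mount kthread, removing the XFS_AIL_PUSHING_BIT requeue race: xfsaild_push() now just returns the back-off timeout in milliseconds and the thread decides how to sleep. For reference, the new thread loop in full:

	static int
	xfsaild(
		void		*data)
	{
		struct xfs_ail	*ailp = data;
		long		tout = 0;	/* milliseconds */

		while (!kthread_should_stop()) {
			/* short back-offs stay killable, long idles are interruptible */
			if (tout && tout <= 20)
				__set_current_state(TASK_KILLABLE);
			else
				__set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(tout ?
					 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);

			try_to_freeze();

			tout = xfsaild_push(ailp);
		}

		return 0;
	}
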
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 15584fc3ed7d..475a4ded4f41 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -54,7 +54,7 @@ xfs_trans_buf_item_match(
54 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 54 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
55 blip = (struct xfs_buf_log_item *)lidp->lid_item; 55 blip = (struct xfs_buf_log_item *)lidp->lid_item;
56 if (blip->bli_item.li_type == XFS_LI_BUF && 56 if (blip->bli_item.li_type == XFS_LI_BUF &&
57 XFS_BUF_TARGET(blip->bli_buf) == target && 57 blip->bli_buf->b_target == target &&
58 XFS_BUF_ADDR(blip->bli_buf) == blkno && 58 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
59 XFS_BUF_COUNT(blip->bli_buf) == len) 59 XFS_BUF_COUNT(blip->bli_buf) == len)
60 return blip->bli_buf; 60 return blip->bli_buf;
@@ -80,7 +80,6 @@ _xfs_trans_bjoin(
80{ 80{
81 struct xfs_buf_log_item *bip; 81 struct xfs_buf_log_item *bip;
82 82
83 ASSERT(XFS_BUF_ISBUSY(bp));
84 ASSERT(bp->b_transp == NULL); 83 ASSERT(bp->b_transp == NULL);
85 84
86 /* 85 /*
@@ -161,8 +160,10 @@ xfs_trans_get_buf(xfs_trans_t *tp,
161 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); 160 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
162 if (bp != NULL) { 161 if (bp != NULL) {
163 ASSERT(xfs_buf_islocked(bp)); 162 ASSERT(xfs_buf_islocked(bp));
164 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) 163 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
165 XFS_BUF_SUPER_STALE(bp); 164 xfs_buf_stale(bp);
165 XFS_BUF_DONE(bp);
166 }
166 167
167 /* 168 /*
168 * If the buffer is stale then it was binval'ed 169 * If the buffer is stale then it was binval'ed
@@ -194,7 +195,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
194 return NULL; 195 return NULL;
195 } 196 }
196 197
197 ASSERT(!XFS_BUF_GETERROR(bp)); 198 ASSERT(!bp->b_error);
198 199
199 _xfs_trans_bjoin(tp, bp, 1); 200 _xfs_trans_bjoin(tp, bp, 1);
200 trace_xfs_trans_get_buf(bp->b_fspriv); 201 trace_xfs_trans_get_buf(bp->b_fspriv);
@@ -293,10 +294,9 @@ xfs_trans_read_buf(
293 return (flags & XBF_TRYLOCK) ? 294 return (flags & XBF_TRYLOCK) ?
294 EAGAIN : XFS_ERROR(ENOMEM); 295 EAGAIN : XFS_ERROR(ENOMEM);
295 296
296 if (XFS_BUF_GETERROR(bp) != 0) { 297 if (bp->b_error) {
297 xfs_ioerror_alert("xfs_trans_read_buf", mp, 298 error = bp->b_error;
298 bp, blkno); 299 xfs_buf_ioerror_alert(bp, __func__);
299 error = XFS_BUF_GETERROR(bp);
300 xfs_buf_relse(bp); 300 xfs_buf_relse(bp);
301 return error; 301 return error;
302 } 302 }
@@ -330,7 +330,7 @@ xfs_trans_read_buf(
330 ASSERT(xfs_buf_islocked(bp)); 330 ASSERT(xfs_buf_islocked(bp));
331 ASSERT(bp->b_transp == tp); 331 ASSERT(bp->b_transp == tp);
332 ASSERT(bp->b_fspriv != NULL); 332 ASSERT(bp->b_fspriv != NULL);
333 ASSERT((XFS_BUF_ISERROR(bp)) == 0); 333 ASSERT(!bp->b_error);
334 if (!(XFS_BUF_ISDONE(bp))) { 334 if (!(XFS_BUF_ISDONE(bp))) {
335 trace_xfs_trans_read_buf_io(bp, _RET_IP_); 335 trace_xfs_trans_read_buf_io(bp, _RET_IP_);
336 ASSERT(!XFS_BUF_ISASYNC(bp)); 336 ASSERT(!XFS_BUF_ISASYNC(bp));
@@ -338,8 +338,7 @@ xfs_trans_read_buf(
338 xfsbdstrat(tp->t_mountp, bp); 338 xfsbdstrat(tp->t_mountp, bp);
339 error = xfs_buf_iowait(bp); 339 error = xfs_buf_iowait(bp);
340 if (error) { 340 if (error) {
341 xfs_ioerror_alert("xfs_trans_read_buf", mp, 341 xfs_buf_ioerror_alert(bp, __func__);
342 bp, blkno);
343 xfs_buf_relse(bp); 342 xfs_buf_relse(bp);
344 /* 343 /*
345 * We can gracefully recover from most read 344 * We can gracefully recover from most read
@@ -386,12 +385,11 @@ xfs_trans_read_buf(
386 return (flags & XBF_TRYLOCK) ? 385 return (flags & XBF_TRYLOCK) ?
387 0 : XFS_ERROR(ENOMEM); 386 0 : XFS_ERROR(ENOMEM);
388 } 387 }
389 if (XFS_BUF_GETERROR(bp) != 0) { 388 if (bp->b_error) {
390 XFS_BUF_SUPER_STALE(bp); 389 error = bp->b_error;
391 error = XFS_BUF_GETERROR(bp); 390 xfs_buf_stale(bp);
392 391 XFS_BUF_DONE(bp);
393 xfs_ioerror_alert("xfs_trans_read_buf", mp, 392 xfs_buf_ioerror_alert(bp, __func__);
394 bp, blkno);
395 if (tp->t_flags & XFS_TRANS_DIRTY) 393 if (tp->t_flags & XFS_TRANS_DIRTY)
396 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); 394 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
397 xfs_buf_relse(bp); 395 xfs_buf_relse(bp);
@@ -430,7 +428,7 @@ shutdown_abort:
430 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 428 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
431 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); 429 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
432#endif 430#endif
433 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != 431 ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) !=
434 (XBF_STALE|XBF_DELWRI)); 432 (XBF_STALE|XBF_DELWRI));
435 433
436 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 434 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
@@ -581,7 +579,6 @@ xfs_trans_bhold(xfs_trans_t *tp,
581{ 579{
582 xfs_buf_log_item_t *bip = bp->b_fspriv; 580 xfs_buf_log_item_t *bip = bp->b_fspriv;
583 581
584 ASSERT(XFS_BUF_ISBUSY(bp));
585 ASSERT(bp->b_transp == tp); 582 ASSERT(bp->b_transp == tp);
586 ASSERT(bip != NULL); 583 ASSERT(bip != NULL);
587 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 584 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
@@ -602,7 +599,6 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
602{ 599{
603 xfs_buf_log_item_t *bip = bp->b_fspriv; 600 xfs_buf_log_item_t *bip = bp->b_fspriv;
604 601
605 ASSERT(XFS_BUF_ISBUSY(bp));
606 ASSERT(bp->b_transp == tp); 602 ASSERT(bp->b_transp == tp);
607 ASSERT(bip != NULL); 603 ASSERT(bip != NULL);
608 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 604 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
@@ -631,7 +627,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
631{ 627{
632 xfs_buf_log_item_t *bip = bp->b_fspriv; 628 xfs_buf_log_item_t *bip = bp->b_fspriv;
633 629
634 ASSERT(XFS_BUF_ISBUSY(bp));
635 ASSERT(bp->b_transp == tp); 630 ASSERT(bp->b_transp == tp);
636 ASSERT(bip != NULL); 631 ASSERT(bip != NULL);
637 ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); 632 ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp)));
@@ -648,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
648 * inside the b_bdstrat callback so that this won't get written to 643 * inside the b_bdstrat callback so that this won't get written to
649 * disk. 644 * disk.
650 */ 645 */
651 XFS_BUF_DELAYWRITE(bp);
652 XFS_BUF_DONE(bp); 646 XFS_BUF_DONE(bp);
653 647
654 ASSERT(atomic_read(&bip->bli_refcount) > 0); 648 ASSERT(atomic_read(&bip->bli_refcount) > 0);
655 bp->b_iodone = xfs_buf_iodone_callbacks; 649 bp->b_iodone = xfs_buf_iodone_callbacks;
656 bip->bli_item.li_cb = xfs_buf_iodone; 650 bip->bli_item.li_cb = xfs_buf_iodone;
657 651
652 xfs_buf_delwri_queue(bp);
653
658 trace_xfs_trans_log_buf(bip); 654 trace_xfs_trans_log_buf(bip);
659 655
660 /* 656 /*
@@ -702,7 +698,6 @@ xfs_trans_binval(
702{ 698{
703 xfs_buf_log_item_t *bip = bp->b_fspriv; 699 xfs_buf_log_item_t *bip = bp->b_fspriv;
704 700
705 ASSERT(XFS_BUF_ISBUSY(bp));
706 ASSERT(bp->b_transp == tp); 701 ASSERT(bp->b_transp == tp);
707 ASSERT(bip != NULL); 702 ASSERT(bip != NULL);
708 ASSERT(atomic_read(&bip->bli_refcount) > 0); 703 ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -744,8 +739,7 @@ xfs_trans_binval(
744 * We set the stale bit in the buffer as well since we're getting 739 * We set the stale bit in the buffer as well since we're getting
745 * rid of it. 740 * rid of it.
746 */ 741 */
747 XFS_BUF_UNDELAYWRITE(bp); 742 xfs_buf_stale(bp);
748 XFS_BUF_STALE(bp);
749 bip->bli_flags |= XFS_BLI_STALE; 743 bip->bli_flags |= XFS_BLI_STALE;
750 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); 744 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
751 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; 745 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
@@ -774,7 +768,6 @@ xfs_trans_inode_buf(
774{ 768{
775 xfs_buf_log_item_t *bip = bp->b_fspriv; 769 xfs_buf_log_item_t *bip = bp->b_fspriv;
776 770
777 ASSERT(XFS_BUF_ISBUSY(bp));
778 ASSERT(bp->b_transp == tp); 771 ASSERT(bp->b_transp == tp);
779 ASSERT(bip != NULL); 772 ASSERT(bip != NULL);
780 ASSERT(atomic_read(&bip->bli_refcount) > 0); 773 ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -798,7 +791,6 @@ xfs_trans_stale_inode_buf(
798{ 791{
799 xfs_buf_log_item_t *bip = bp->b_fspriv; 792 xfs_buf_log_item_t *bip = bp->b_fspriv;
800 793
801 ASSERT(XFS_BUF_ISBUSY(bp));
802 ASSERT(bp->b_transp == tp); 794 ASSERT(bp->b_transp == tp);
803 ASSERT(bip != NULL); 795 ASSERT(bip != NULL);
804 ASSERT(atomic_read(&bip->bli_refcount) > 0); 796 ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -823,7 +815,6 @@ xfs_trans_inode_alloc_buf(
823{ 815{
824 xfs_buf_log_item_t *bip = bp->b_fspriv; 816 xfs_buf_log_item_t *bip = bp->b_fspriv;
825 817
826 ASSERT(XFS_BUF_ISBUSY(bp));
827 ASSERT(bp->b_transp == tp); 818 ASSERT(bp->b_transp == tp);
828 ASSERT(bip != NULL); 819 ASSERT(bip != NULL);
829 ASSERT(atomic_read(&bip->bli_refcount) > 0); 820 ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -851,7 +842,6 @@ xfs_trans_dquot_buf(
851{ 842{
852 xfs_buf_log_item_t *bip = bp->b_fspriv; 843 xfs_buf_log_item_t *bip = bp->b_fspriv;
853 844
854 ASSERT(XFS_BUF_ISBUSY(bp));
855 ASSERT(bp->b_transp == tp); 845 ASSERT(bp->b_transp == tp);
856 ASSERT(bip != NULL); 846 ASSERT(bip != NULL);
857 ASSERT(type == XFS_BLF_UDQUOT_BUF || 847 ASSERT(type == XFS_BLF_UDQUOT_BUF ||
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 4d00ee67792d..4d00ee67792d 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index c8dea2fd7e68..32f0288ae10f 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug(
47 * Add a locked inode to the transaction. 47 * Add a locked inode to the transaction.
48 * 48 *
49 * The inode must be locked, and it cannot be associated with any transaction. 49 * The inode must be locked, and it cannot be associated with any transaction.
50 * If lock_flags is non-zero the inode will be unlocked on transaction commit.
50 */ 51 */
51void 52void
52xfs_trans_ijoin( 53xfs_trans_ijoin(
53 struct xfs_trans *tp, 54 struct xfs_trans *tp,
54 struct xfs_inode *ip) 55 struct xfs_inode *ip,
56 uint lock_flags)
55{ 57{
56 xfs_inode_log_item_t *iip; 58 xfs_inode_log_item_t *iip;
57 59
@@ -59,7 +61,9 @@ xfs_trans_ijoin(
59 if (ip->i_itemp == NULL) 61 if (ip->i_itemp == NULL)
60 xfs_inode_item_init(ip, ip->i_mount); 62 xfs_inode_item_init(ip, ip->i_mount);
61 iip = ip->i_itemp; 63 iip = ip->i_itemp;
64
62 ASSERT(iip->ili_lock_flags == 0); 65 ASSERT(iip->ili_lock_flags == 0);
66 iip->ili_lock_flags = lock_flags;
63 67
64 /* 68 /*
65 * Get a log_item_desc to point at the new item. 69 * Get a log_item_desc to point at the new item.
@@ -70,25 +74,6 @@ xfs_trans_ijoin(
70} 74}
71 75
72/* 76/*
73 * Add a locked inode to the transaction.
74 *
75 *
76 * Grabs a reference to the inode which will be dropped when the transaction
77 * is committed. The inode will also be unlocked at that point. The inode
78 * must be locked, and it cannot be associated with any transaction.
79 */
80void
81xfs_trans_ijoin_ref(
82 struct xfs_trans *tp,
83 struct xfs_inode *ip,
84 uint lock_flags)
85{
86 xfs_trans_ijoin(tp, ip);
87 IHOLD(ip);
88 ip->i_itemp->ili_lock_flags = lock_flags;
89}
90
91/*
92 * Transactional inode timestamp update. Requires the inode to be locked and 77 * Transactional inode timestamp update. Requires the inode to be locked and
93 * joined to the transaction supplied. Relies on the transaction subsystem to 78 * joined to the transaction supplied. Relies on the transaction subsystem to
94 * track dirty state and update/writeback the inode accordingly. 79 * track dirty state and update/writeback the inode accordingly.
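
With xfs_trans_ijoin_ref() folded away above, xfs_trans_ijoin() itself carries the lock_flags argument: per the updated comment, a non-zero lock mode hands the lock to the transaction, which unlocks the inode at commit, while passing 0 leaves the inode locked for the caller. The two calling conventions used throughout this diff:

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);	/* commit unlocks the inode */

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);			/* caller keeps the ILOCK */
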
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 212946b97239..44820b9fcb43 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -64,23 +64,18 @@ struct xfs_ail_cursor {
64 */ 64 */
65struct xfs_ail { 65struct xfs_ail {
66 struct xfs_mount *xa_mount; 66 struct xfs_mount *xa_mount;
67 struct task_struct *xa_task;
67 struct list_head xa_ail; 68 struct list_head xa_ail;
68 xfs_lsn_t xa_target; 69 xfs_lsn_t xa_target;
69 struct list_head xa_cursors; 70 struct list_head xa_cursors;
70 spinlock_t xa_lock; 71 spinlock_t xa_lock;
71 struct delayed_work xa_work;
72 xfs_lsn_t xa_last_pushed_lsn; 72 xfs_lsn_t xa_last_pushed_lsn;
73 unsigned long xa_flags; 73 int xa_log_flush;
74}; 74};
75 75
76#define XFS_AIL_PUSHING_BIT 0
77
78/* 76/*
79 * From xfs_trans_ail.c 77 * From xfs_trans_ail.c
80 */ 78 */
81
82extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
83
84void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, 79void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
85 struct xfs_ail_cursor *cur, 80 struct xfs_ail_cursor *cur,
86 struct xfs_log_item **log_items, int nr_items, 81 struct xfs_log_item **log_items, int nr_items,
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 7c220b4227bc..7c220b4227bc 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9322e13f0c63..4ecf2a549060 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -72,8 +72,8 @@ xfs_readlink_bmap(
72 xfs_buf_t *bp; 72 xfs_buf_t *bp;
73 int error = 0; 73 int error = 0;
74 74
75 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, 75 error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
76 mval, &nmaps, NULL); 76 0);
77 if (error) 77 if (error)
78 goto out; 78 goto out;
79 79
@@ -83,10 +83,11 @@ xfs_readlink_bmap(
83 83
84 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 84 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
85 XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); 85 XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
86 error = XFS_BUF_GETERROR(bp); 86 if (!bp)
87 return XFS_ERROR(ENOMEM);
88 error = bp->b_error;
87 if (error) { 89 if (error) {
88 xfs_ioerror_alert("xfs_readlink", 90 xfs_buf_ioerror_alert(bp, __func__);
89 ip->i_mount, bp, XFS_BUF_ADDR(bp));
90 xfs_buf_relse(bp); 91 xfs_buf_relse(bp);
91 goto out; 92 goto out;
92 } 93 }
@@ -94,7 +95,7 @@ xfs_readlink_bmap(
94 byte_cnt = pathlen; 95 byte_cnt = pathlen;
95 pathlen -= byte_cnt; 96 pathlen -= byte_cnt;
96 97
97 memcpy(link, XFS_BUF_PTR(bp), byte_cnt); 98 memcpy(link, bp->b_addr, byte_cnt);
98 xfs_buf_relse(bp); 99 xfs_buf_relse(bp);
99 } 100 }
100 101
@@ -176,8 +177,7 @@ xfs_free_eofblocks(
176 177
177 nimaps = 1; 178 nimaps = 1;
178 xfs_ilock(ip, XFS_ILOCK_SHARED); 179 xfs_ilock(ip, XFS_ILOCK_SHARED);
179 error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, 180 error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
180 NULL, 0, &imap, &nimaps, NULL);
181 xfs_iunlock(ip, XFS_ILOCK_SHARED); 181 xfs_iunlock(ip, XFS_ILOCK_SHARED);
182 182
183 if (!error && (nimaps != 0) && 183 if (!error && (nimaps != 0) &&
@@ -218,7 +218,7 @@ xfs_free_eofblocks(
218 } 218 }
219 219
220 xfs_ilock(ip, XFS_ILOCK_EXCL); 220 xfs_ilock(ip, XFS_ILOCK_EXCL);
221 xfs_trans_ijoin(tp, ip); 221 xfs_trans_ijoin(tp, ip, 0);
222 222
223 error = xfs_itruncate_data(&tp, ip, ip->i_size); 223 error = xfs_itruncate_data(&tp, ip, ip->i_size);
224 if (error) { 224 if (error) {
@@ -287,7 +287,7 @@ xfs_inactive_symlink_rmt(
287 xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 287 xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
288 size = (int)ip->i_d.di_size; 288 size = (int)ip->i_d.di_size;
289 ip->i_d.di_size = 0; 289 ip->i_d.di_size = 0;
290 xfs_trans_ijoin(tp, ip); 290 xfs_trans_ijoin(tp, ip, 0);
291 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 291 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
292 /* 292 /*
293 * Find the block(s) so we can inval and unmap them. 293 * Find the block(s) so we can inval and unmap them.
@@ -295,9 +295,9 @@ xfs_inactive_symlink_rmt(
295 done = 0; 295 done = 0;
296 xfs_bmap_init(&free_list, &first_block); 296 xfs_bmap_init(&free_list, &first_block);
297 nmaps = ARRAY_SIZE(mval); 297 nmaps = ARRAY_SIZE(mval);
298 if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), 298 error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
299 XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, 299 mval, &nmaps, 0);
300 &free_list))) 300 if (error)
301 goto error0; 301 goto error0;
302 /* 302 /*
303 * Invalidate the block(s). 303 * Invalidate the block(s).
@@ -306,6 +306,10 @@ xfs_inactive_symlink_rmt(
306 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 306 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
307 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), 307 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
308 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); 308 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
309 if (!bp) {
310 error = ENOMEM;
311 goto error1;
312 }
309 xfs_trans_binval(tp, bp); 313 xfs_trans_binval(tp, bp);
310 } 314 }
311 /* 315 /*
@@ -331,7 +335,7 @@ xfs_inactive_symlink_rmt(
331 * Mark it dirty so it will be logged and moved forward in the log as 335 * Mark it dirty so it will be logged and moved forward in the log as
332 * part of every commit. 336 * part of every commit.
333 */ 337 */
334 xfs_trans_ijoin(tp, ip); 338 xfs_trans_ijoin(tp, ip, 0);
335 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 339 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
336 /* 340 /*
337 * Get a new, empty transaction to return to our caller. 341 * Get a new, empty transaction to return to our caller.
@@ -464,7 +468,7 @@ xfs_inactive_attrs(
 		goto error_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
 	ASSERT(ip->i_d.di_anextents == 0);
@@ -645,8 +649,6 @@ xfs_inactive(
 	if (truncate) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-		xfs_ioend_wait(ip);
-
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_ITRUNCATE_LOG_RES(mp),
 					  0, XFS_TRANS_PERM_LOG_RES,
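xfs_ioend_wait() is deleted outright here as part of retiring the XFS-private I/O completion counter; where a drain is still required, the patch switches to the generic VFS helper, as the xfs_free_file_space() hunk further down shows. A minimal sketch of the replacement idiom:

	/* wait for in-flight direct I/O against this inode to drain */
	inode_dio_wait(VFS_I(ip));	/* VFS_I(): xfs_inode -> struct inode */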
@@ -660,7 +662,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
 		error = xfs_itruncate_data(&tp, ip, 0);
 		if (error) {
@@ -684,7 +686,7 @@ xfs_inactive(
 			return VN_INACTIVE_CACHE;
 		}
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 	} else {
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_IFREE_LOG_RES(mp),
@@ -697,7 +699,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 	}
 
 	/*
@@ -937,7 +939,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1258,8 +1260,8 @@ xfs_remove(
 
 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we're removing a directory perform some additional validation.
@@ -1404,8 +1406,8 @@ xfs_link(
 
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * If the source has too many links, we can't make any more to it.
@@ -1599,7 +1601,7 @@ xfs_symlink(
 	 * transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	/*
@@ -1630,10 +1632,9 @@ xfs_symlink(
 		first_fsb = 0;
 		nmaps = SYMLINK_MAPS;
 
-		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
-				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
-				  &first_block, resblks, mval, &nmaps,
-				  &free_list);
+		error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
+					XFS_BMAPI_METADATA, &first_block, resblks,
+					mval, &nmaps, &free_list);
 		if (error)
 			goto error2;
 
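The allocating counterpart is xfs_bmapi_write(): it keeps the transaction, firstblock and free list, and the old XFS_BMAPI_WRITE flag disappears because writing is implied by the function itself. Its shape, again reconstructed from the converted callers (a sketch; parameter comments are assumptions):

	int				/* error */
	xfs_bmapi_write(
		struct xfs_trans	*tp,		/* transaction pointer */
		struct xfs_inode	*ip,		/* incore inode */
		xfs_fileoff_t		bno,		/* starting file offset */
		xfs_filblks_t		len,		/* length to allocate */
		int			flags,		/* XFS_BMAPI_... flags */
		xfs_fsblock_t		*firstblock,	/* first allocated block */
		xfs_extlen_t		total,		/* reserved blocks (resblks here) */
		struct xfs_bmbt_irec	*mval,		/* output: map values */
		int			*nmap,		/* in: mval size; out: count */
		struct xfs_bmap_free	*flist);	/* extents to free at commit */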
@@ -1648,13 +1649,16 @@ xfs_symlink(
 			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 					       BTOBB(byte_cnt), 0);
-			ASSERT(bp && !XFS_BUF_GETERROR(bp));
+			if (!bp) {
+				error = ENOMEM;
+				goto error2;
+			}
 			if (pathlen < byte_cnt) {
 				byte_cnt = pathlen;
 			}
 			pathlen -= byte_cnt;
 
-			memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
+			memcpy(bp->b_addr, cur_chunk, byte_cnt);
 			cur_chunk += byte_cnt;
 
 			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
@@ -1730,7 +1734,7 @@ xfs_set_dmattrs(
 		return error;
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	ip->i_d.di_dmevmask = evmask;
 	ip->i_d.di_dmstate = state;
@@ -1776,7 +1780,6 @@ xfs_alloc_file_space(
 	xfs_fileoff_t		startoffset_fsb;
 	xfs_fsblock_t		firstfsb;
 	int			nimaps;
-	int			bmapi_flag;
 	int			quota_flag;
 	int			rt;
 	xfs_trans_t		*tp;
@@ -1804,7 +1807,6 @@ xfs_alloc_file_space(
 	count = len;
 	imapp = &imaps[0];
 	nimaps = 1;
-	bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
 	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
@@ -1875,16 +1877,12 @@ xfs_alloc_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
-		/*
-		 * Issue the xfs_bmapi() call to allocate the blocks
-		 */
 		xfs_bmap_init(&free_list, &firstfsb);
-		error = xfs_bmapi(tp, ip, startoffset_fsb,
-				  allocatesize_fsb, bmapi_flag,
-				  &firstfsb, 0, imapp, &nimaps,
-				  &free_list);
+		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+					allocatesize_fsb, alloc_type, &firstfsb,
+					0, imapp, &nimaps, &free_list);
 		if (error) {
 			goto error0;
 		}
@@ -1974,8 +1972,7 @@ xfs_zero_remaining_bytes(
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
-				  NULL, 0, &imap, &nimap, NULL);
+		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -1995,11 +1992,11 @@ xfs_zero_remaining_bytes(
 		xfsbdstrat(mp, bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
-			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
-					  mp, bp, XFS_BUF_ADDR(bp));
+			xfs_buf_ioerror_alert(bp,
+					"xfs_zero_remaining_bytes(read)");
 			break;
 		}
-		memset(XFS_BUF_PTR(bp) +
+		memset(bp->b_addr +
 			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
 			0, lastoffset - offset + 1);
 		XFS_BUF_UNDONE(bp);
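xfs_buf_ioerror_alert() derives the mount, device and block number from the buffer itself, so callers no longer pass mp and XFS_BUF_ADDR(bp) by hand; only a tag string identifying the call site remains. The change in miniature:

	/* old: xfs_ioerror_alert("tag", mp, bp, XFS_BUF_ADDR(bp)); */
	xfs_buf_ioerror_alert(bp, "xfs_zero_remaining_bytes(read)");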
@@ -2008,8 +2005,8 @@ xfs_zero_remaining_bytes(
 		xfsbdstrat(mp, bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
-			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
-					  mp, bp, XFS_BUF_ADDR(bp));
+			xfs_buf_ioerror_alert(bp,
+					"xfs_zero_remaining_bytes(write)");
 			break;
 		}
 	}
@@ -2074,7 +2071,7 @@ xfs_free_file_space(
 	if (need_iolock) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 		/* wait for the completion of any pending DIOs */
-		xfs_ioend_wait(ip);
+		inode_dio_wait(VFS_I(ip));
 	}
 
 	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -2094,8 +2091,8 @@ xfs_free_file_space(
 	 */
 	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, startoffset_fsb,
-				  1, 0, NULL, 0, &imap, &nimap, NULL);
+		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
+					&imap, &nimap, 0);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2109,8 +2106,8 @@ xfs_free_file_space(
 			startoffset_fsb += mp->m_sb.sb_rextsize - mod;
 		}
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
-				  1, 0, NULL, 0, &imap, &nimap, NULL);
+		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
					&imap, &nimap, 0);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2178,7 +2175,7 @@ xfs_free_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
 		/*
 		 * issue the bunmapi() call to free the blocks
@@ -2351,8 +2348,7 @@ xfs_change_file_space(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	if ((attr_flags & XFS_ATTR_DMI) == 0) {
 		ip->i_d.di_mode &= ~S_ISUID;
@@ -2377,10 +2373,5 @@ xfs_change_file_space(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	if (attr_flags & XFS_ATTR_SYNC)
 		xfs_trans_set_sync(tp);
-
-	error = xfs_trans_commit(tp, 0);
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	return error;
+	return xfs_trans_commit(tp, 0);
 }
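With the inode joined via XFS_ILOCK_EXCL a few hunks up, the commit itself now releases the lock, which is what lets the tail of xfs_change_file_space() collapse to a direct return. Condensed before/after:

	/* before: commit, then unlock by hand */
	error = xfs_trans_commit(tp, 0);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

	/* after: commit drops XFS_ILOCK_EXCL on the way out */
	return xfs_trans_commit(tp, 0);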
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 87d3e03878c8..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c