Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_addr.c | 5
-rw-r--r--  fs/9p/vfs_dir.c | 72
-rw-r--r--  fs/adfs/dir.c | 48
-rw-r--r--  fs/affs/dir.c | 69
-rw-r--r--  fs/affs/namei.c | 26
-rw-r--r--  fs/afs/dir.c | 99
-rw-r--r--  fs/afs/file.c | 10
-rw-r--r--  fs/afs/flock.c | 7
-rw-r--r--  fs/aio.c | 45
-rw-r--r--  fs/autofs4/root.c | 4
-rw-r--r--  fs/bad_inode.c | 4
-rw-r--r--  fs/befs/linuxvfs.c | 40
-rw-r--r--  fs/bfs/dir.c | 35
-rw-r--r--  fs/block_dev.c | 23
-rw-r--r--  fs/btrfs/backref.c | 3
-rw-r--r--  fs/btrfs/check-integrity.c | 2
-rw-r--r--  fs/btrfs/ctree.c | 4
-rw-r--r--  fs/btrfs/ctree.h | 8
-rw-r--r--  fs/btrfs/delayed-inode.c | 9
-rw-r--r--  fs/btrfs/delayed-inode.h | 3
-rw-r--r--  fs/btrfs/delayed-ref.h | 1
-rw-r--r--  fs/btrfs/dev-replace.c | 5
-rw-r--r--  fs/btrfs/disk-io.c | 65
-rw-r--r--  fs/btrfs/extent-tree.c | 94
-rw-r--r--  fs/btrfs/extent_io.c | 140
-rw-r--r--  fs/btrfs/extent_io.h | 2
-rw-r--r--  fs/btrfs/file.c | 15
-rw-r--r--  fs/btrfs/free-space-cache.c | 43
-rw-r--r--  fs/btrfs/free-space-cache.h | 2
-rw-r--r--  fs/btrfs/inode-map.c | 8
-rw-r--r--  fs/btrfs/inode.c | 136
-rw-r--r--  fs/btrfs/ioctl.c | 18
-rw-r--r--  fs/btrfs/raid56.c | 2
-rw-r--r--  fs/btrfs/relocation.c | 16
-rw-r--r--  fs/btrfs/scrub.c | 10
-rw-r--r--  fs/btrfs/super.c | 1
-rw-r--r--  fs/btrfs/volumes.c | 54
-rw-r--r--  fs/btrfs/volumes.h | 20
-rw-r--r--  fs/buffer.c | 21
-rw-r--r--  fs/cachefiles/interface.c | 13
-rw-r--r--  fs/cachefiles/namei.c | 10
-rw-r--r--  fs/cachefiles/xattr.c | 6
-rw-r--r--  fs/ceph/addr.c | 15
-rw-r--r--  fs/ceph/dir.c | 99
-rw-r--r--  fs/ceph/file.c | 11
-rw-r--r--  fs/ceph/locks.c | 73
-rw-r--r--  fs/ceph/mds_client.c | 65
-rw-r--r--  fs/ceph/super.h | 9
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 141
-rw-r--r--  fs/cifs/cifsfs.c | 7
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/connect.c | 27
-rw-r--r--  fs/cifs/dir.c | 9
-rw-r--r--  fs/cifs/dns_resolve.c | 4
-rw-r--r--  fs/cifs/file.c | 20
-rw-r--r--  fs/cifs/inode.c | 3
-rw-r--r--  fs/cifs/readdir.c | 178
-rw-r--r--  fs/coda/dir.c | 74
-rw-r--r--  fs/compat.c | 43
-rw-r--r--  fs/compat_ioctl.c | 3
-rw-r--r--  fs/configfs/dir.c | 122
-rw-r--r--  fs/cramfs/inode.c | 21
-rw-r--r--  fs/dcache.c | 77
-rw-r--r--  fs/debugfs/file.c | 43
-rw-r--r--  fs/dlm/config.c | 5
-rw-r--r--  fs/dlm/lock.c | 8
-rw-r--r--  fs/dlm/lockspace.c | 9
-rw-r--r--  fs/dlm/lowcomms.c | 177
-rw-r--r--  fs/ecryptfs/crypto.c | 5
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/file.c | 50
-rw-r--r--  fs/ecryptfs/inode.c | 2
-rw-r--r--  fs/efivarfs/file.c | 14
-rw-r--r--  fs/efivarfs/super.c | 9
-rw-r--r--  fs/efs/dir.c | 75
-rw-r--r--  fs/exec.c | 26
-rw-r--r--  fs/exofs/dir.c | 38
-rw-r--r--  fs/exofs/inode.c | 6
-rw-r--r--  fs/exportfs/expfs.c | 14
-rw-r--r--  fs/ext2/dir.c | 27
-rw-r--r--  fs/ext2/namei.c | 24
-rw-r--r--  fs/ext3/dir.c | 157
-rw-r--r--  fs/ext3/inode.c | 9
-rw-r--r--  fs/ext3/namei.c | 54
-rw-r--r--  fs/ext4/balloc.c | 14
-rw-r--r--  fs/ext4/dir.c | 158
-rw-r--r--  fs/ext4/ext4.h | 181
-rw-r--r--  fs/ext4/ext4_jbd2.c | 58
-rw-r--r--  fs/ext4/ext4_jbd2.h | 29
-rw-r--r--  fs/ext4/extents.c | 202
-rw-r--r--  fs/ext4/extents_status.c | 92
-rw-r--r--  fs/ext4/extents_status.h | 8
-rw-r--r--  fs/ext4/file.c | 42
-rw-r--r--  fs/ext4/fsync.c | 52
-rw-r--r--  fs/ext4/ialloc.c | 3
-rw-r--r--  fs/ext4/indirect.c | 40
-rw-r--r--  fs/ext4/inline.c | 168
-rw-r--r--  fs/ext4/inode.c | 1682
-rw-r--r--  fs/ext4/mballoc.c | 27
-rw-r--r--  fs/ext4/move_extent.c | 3
-rw-r--r--  fs/ext4/namei.c | 54
-rw-r--r--  fs/ext4/page-io.c | 226
-rw-r--r--  fs/ext4/resize.c | 24
-rw-r--r--  fs/ext4/super.c | 155
-rw-r--r--  fs/f2fs/Kconfig | 12
-rw-r--r--  fs/f2fs/acl.c | 2
-rw-r--r--  fs/f2fs/checkpoint.c | 99
-rw-r--r--  fs/f2fs/data.c | 71
-rw-r--r--  fs/f2fs/debug.c | 4
-rw-r--r--  fs/f2fs/dir.c | 145
-rw-r--r--  fs/f2fs/f2fs.h | 66
-rw-r--r--  fs/f2fs/file.c | 58
-rw-r--r--  fs/f2fs/gc.c | 42
-rw-r--r--  fs/f2fs/inode.c | 13
-rw-r--r--  fs/f2fs/namei.c | 17
-rw-r--r--  fs/f2fs/node.c | 37
-rw-r--r--  fs/f2fs/node.h | 68
-rw-r--r--  fs/f2fs/recovery.c | 150
-rw-r--r--  fs/f2fs/segment.c | 101
-rw-r--r--  fs/f2fs/super.c | 253
-rw-r--r--  fs/f2fs/xattr.c | 68
-rw-r--r--  fs/f2fs/xattr.h | 24
-rw-r--r--  fs/fat/dir.c | 104
-rw-r--r--  fs/fat/inode.c | 15
-rw-r--r--  fs/fat/namei_msdos.c | 6
-rw-r--r--  fs/fat/namei_vfat.c | 12
-rw-r--r--  fs/file_table.c | 21
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 55
-rw-r--r--  fs/fs-writeback.c | 9
-rw-r--r--  fs/fscache/cache.c | 34
-rw-r--r--  fs/fscache/cookie.c | 93
-rw-r--r--  fs/fscache/fsdef.c | 1
-rw-r--r--  fs/fscache/internal.h | 11
-rw-r--r--  fs/fscache/main.c | 11
-rw-r--r--  fs/fscache/netfs.c | 1
-rw-r--r--  fs/fscache/object-list.c | 103
-rw-r--r--  fs/fscache/object.c | 1106
-rw-r--r--  fs/fscache/operation.c | 37
-rw-r--r--  fs/fscache/page.c | 65
-rw-r--r--  fs/fuse/dir.c | 49
-rw-r--r--  fs/fuse/file.c | 65
-rw-r--r--  fs/fuse/inode.c | 7
-rw-r--r--  fs/gfs2/Kconfig | 5
-rw-r--r--  fs/gfs2/aops.c | 17
-rw-r--r--  fs/gfs2/bmap.c | 21
-rw-r--r--  fs/gfs2/dentry.c | 3
-rw-r--r--  fs/gfs2/dir.c | 123
-rw-r--r--  fs/gfs2/dir.h | 7
-rw-r--r--  fs/gfs2/export.c | 10
-rw-r--r--  fs/gfs2/file.c | 113
-rw-r--r--  fs/gfs2/glops.c | 8
-rw-r--r--  fs/gfs2/inode.c | 151
-rw-r--r--  fs/gfs2/inode.h | 1
-rw-r--r--  fs/gfs2/log.c | 78
-rw-r--r--  fs/gfs2/log.h | 2
-rw-r--r--  fs/gfs2/lops.c | 28
-rw-r--r--  fs/gfs2/lops.h | 1
-rw-r--r--  fs/gfs2/meta_io.c | 4
-rw-r--r--  fs/gfs2/ops_fstype.c | 8
-rw-r--r--  fs/gfs2/quota.c | 11
-rw-r--r--  fs/gfs2/rgrp.c | 27
-rw-r--r--  fs/gfs2/super.c | 6
-rw-r--r--  fs/gfs2/trans.c | 9
-rw-r--r--  fs/hfs/bnode.c | 6
-rw-r--r--  fs/hfs/dir.c | 49
-rw-r--r--  fs/hfs/hfs_fs.h | 7
-rw-r--r--  fs/hfs/string.c | 6
-rw-r--r--  fs/hfsplus/dir.c | 50
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 7
-rw-r--r--  fs/hfsplus/unicode.c | 7
-rw-r--r--  fs/hostfs/hostfs_kern.c | 13
-rw-r--r--  fs/hpfs/dentry.c | 7
-rw-r--r--  fs/hpfs/dir.c | 66
-rw-r--r--  fs/hpfs/file.c | 4
-rw-r--r--  fs/hppfs/hppfs.c | 33
-rw-r--r--  fs/inode.c | 4
-rw-r--r--  fs/internal.h | 12
-rw-r--r--  fs/isofs/dir.c | 42
-rw-r--r--  fs/isofs/inode.c | 48
-rw-r--r--  fs/isofs/namei.c | 3
-rw-r--r--  fs/jbd/transaction.c | 19
-rw-r--r--  fs/jbd2/Kconfig | 6
-rw-r--r--  fs/jbd2/checkpoint.c | 22
-rw-r--r--  fs/jbd2/commit.c | 184
-rw-r--r--  fs/jbd2/journal.c | 166
-rw-r--r--  fs/jbd2/recovery.c | 11
-rw-r--r--  fs/jbd2/revoke.c | 49
-rw-r--r--  fs/jbd2/transaction.c | 526
-rw-r--r--  fs/jffs2/dir.c | 52
-rw-r--r--  fs/jfs/jfs_dtree.c | 63
-rw-r--r--  fs/jfs/jfs_dtree.h | 2
-rw-r--r--  fs/jfs/jfs_logmgr.c | 8
-rw-r--r--  fs/jfs/jfs_metapage.c | 5
-rw-r--r--  fs/jfs/namei.c | 9
-rw-r--r--  fs/jfs/super.c | 38
-rw-r--r--  fs/libfs.c | 80
-rw-r--r--  fs/lockd/svclock.c | 14
-rw-r--r--  fs/lockd/svcsubs.c | 12
-rw-r--r--  fs/locks.c | 281
-rw-r--r--  fs/logfs/dir.c | 49
-rw-r--r--  fs/logfs/file.c | 3
-rw-r--r--  fs/logfs/segment.c | 3
-rw-r--r--  fs/minix/dir.c | 42
-rw-r--r--  fs/minix/namei.c | 13
-rw-r--r--  fs/namei.c | 117
-rw-r--r--  fs/ncpfs/dir.c | 132
-rw-r--r--  fs/ncpfs/inode.c | 4
-rw-r--r--  fs/nfs/callback_proc.c | 2
-rw-r--r--  fs/nfs/callback_xdr.c | 2
-rw-r--r--  fs/nfs/delegation.c | 10
-rw-r--r--  fs/nfs/dir.c | 51
-rw-r--r--  fs/nfs/file.c | 8
-rw-r--r--  fs/nfs/nfs4client.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 4
-rw-r--r--  fs/nfs/nfs4session.c | 4
-rw-r--r--  fs/nfs/nfs4session.h | 13
-rw-r--r--  fs/nfs/nfs4state.c | 23
-rw-r--r--  fs/nfs/super.c | 2
-rw-r--r--  fs/nfsd/nfs4recover.c | 20
-rw-r--r--  fs/nfsd/nfs4state.c | 8
-rw-r--r--  fs/nfsd/vfs.c | 9
-rw-r--r--  fs/nilfs2/dir.c | 48
-rw-r--r--  fs/nilfs2/inode.c | 27
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 3
-rw-r--r--  fs/ntfs/aops.c | 2
-rw-r--r--  fs/ntfs/dir.c | 84
-rw-r--r--  fs/ocfs2/aops.c | 5
-rw-r--r--  fs/ocfs2/dir.c | 151
-rw-r--r--  fs/ocfs2/dir.h | 5
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 1
-rw-r--r--  fs/ocfs2/extent_map.c | 2
-rw-r--r--  fs/ocfs2/file.c | 18
-rw-r--r--  fs/ocfs2/journal.c | 14
-rw-r--r--  fs/ocfs2/namei.c | 4
-rw-r--r--  fs/omfs/dir.c | 94
-rw-r--r--  fs/open.c | 63
-rw-r--r--  fs/openpromfs/inode.c | 95
-rw-r--r--  fs/pnode.c | 3
-rw-r--r--  fs/proc/base.c | 463
-rw-r--r--  fs/proc/fd.c | 114
-rw-r--r--  fs/proc/generic.c | 100
-rw-r--r--  fs/proc/internal.h | 10
-rw-r--r--  fs/proc/kmsg.c | 10
-rw-r--r--  fs/proc/namespaces.c | 87
-rw-r--r--  fs/proc/proc_net.c | 9
-rw-r--r--  fs/proc/proc_sysctl.c | 78
-rw-r--r--  fs/proc/root.c | 19
-rw-r--r--  fs/pstore/inode.c | 2
-rw-r--r--  fs/pstore/platform.c | 11
-rw-r--r--  fs/pstore/ram.c | 2
-rw-r--r--  fs/pstore/ram_core.c | 54
-rw-r--r--  fs/qnx4/dir.c | 66
-rw-r--r--  fs/qnx6/dir.c | 31
-rw-r--r--  fs/read_write.c | 89
-rw-r--r--  fs/readdir.c | 56
-rw-r--r--  fs/reiserfs/dir.c | 38
-rw-r--r--  fs/reiserfs/inode.c | 21
-rw-r--r--  fs/reiserfs/reiserfs.h | 2
-rw-r--r--  fs/reiserfs/xattr.c | 47
-rw-r--r--  fs/reiserfs/xattr_acl.c | 3
-rw-r--r--  fs/romfs/super.c | 21
-rw-r--r--  fs/splice.c | 70
-rw-r--r--  fs/squashfs/dir.c | 40
-rw-r--r--  fs/sysfs/dir.c | 68
-rw-r--r--  fs/sysfs/file.c | 10
-rw-r--r--  fs/sysfs/inode.c | 2
-rw-r--r--  fs/sysv/dir.c | 37
-rw-r--r--  fs/sysv/namei.c | 3
-rw-r--r--  fs/ubifs/dir.c | 69
-rw-r--r--  fs/ubifs/file.c | 5
-rw-r--r--  fs/udf/dir.c | 63
-rw-r--r--  fs/udf/namei.c | 24
-rw-r--r--  fs/ufs/dir.c | 28
-rw-r--r--  fs/xfs/xfs_acl.c | 31
-rw-r--r--  fs/xfs/xfs_acl.h | 31
-rw-r--r--  fs/xfs/xfs_aops.c | 33
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 98
-rw-r--r--  fs/xfs/xfs_attr_leaf.h | 1
-rw-r--r--  fs/xfs/xfs_attr_remote.c | 408
-rw-r--r--  fs/xfs/xfs_attr_remote.h | 10
-rw-r--r--  fs/xfs/xfs_btree.c | 10
-rw-r--r--  fs/xfs/xfs_buf.c | 3
-rw-r--r--  fs/xfs/xfs_buf_item.c | 7
-rw-r--r--  fs/xfs/xfs_da_btree.c | 7
-rw-r--r--  fs/xfs/xfs_dfrag.c | 8
-rw-r--r--  fs/xfs/xfs_dir2.c | 13
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 17
-rw-r--r--  fs/xfs/xfs_dir2_format.h | 4
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 20
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 13
-rw-r--r--  fs/xfs/xfs_dir2_priv.h | 11
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 31
-rw-r--r--  fs/xfs/xfs_dquot.c | 37
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 5
-rw-r--r--  fs/xfs/xfs_file.c | 18
-rw-r--r--  fs/xfs/xfs_fs.h | 1
-rw-r--r--  fs/xfs/xfs_fsops.c | 4
-rw-r--r--  fs/xfs/xfs_inode.c | 16
-rw-r--r--  fs/xfs/xfs_iops.c | 47
-rw-r--r--  fs/xfs/xfs_log_cil.c | 2
-rw-r--r--  fs/xfs/xfs_log_recover.c | 114
-rw-r--r--  fs/xfs/xfs_mount.c | 18
-rw-r--r--  fs/xfs/xfs_qm.c | 40
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 40
-rw-r--r--  fs/xfs/xfs_quota.h | 2
-rw-r--r--  fs/xfs/xfs_super.c | 11
-rw-r--r--  fs/xfs/xfs_symlink.c | 20
-rw-r--r--  fs/xfs/xfs_trace.h | 15
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 4
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 3
310 files changed, 8399 insertions(+), 7398 deletions(-)
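Most of the churn below is mechanical: every filesystem's ->readdir() file operation becomes ->iterate(), taking a struct dir_context instead of an opaque dirent buffer plus a filldir_t callback, and the directory position moves from file->f_pos into ctx->pos. As a rough orientation for the hunks that follow, here is a minimal sketch of a converted directory method. example_iterate and the emitted entry are hypothetical; dir_emit(), dir_emit_dots() and struct dir_context are the real helpers this series switches to.

static int example_iterate(struct file *file, struct dir_context *ctx)
{
	/* emit "." and "..", advancing ctx->pos past them */
	if (!dir_emit_dots(file, ctx))
		return 0;
	/* dir_emit() returns false once the caller's buffer is full;
	 * the method just returns and is called again at the same pos */
	if (!dir_emit(ctx, "example", 7, 2 /* ino */, DT_REG))
		return 0;
	ctx->pos++;	/* advance only after a successful emit */
	return 0;
}

const struct file_operations example_dir_operations = {
	.read		= generic_read_dir,
	.llseek		= generic_file_llseek,
	.iterate	= example_iterate,
};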
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055562c580b4..9ff073f4090a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
  * @offset: offset in the page
  */
 
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	/*
	 * If called with zero offset, we should release
	 * the private state assocated with the page
	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		v9fs_fscache_invalidate_page(page);
 }
 
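The 9p hunk above also shows the second API change running through this series: ->invalidatepage() now takes an explicit length, so partial-page invalidation (truncation within a page) can be told apart from whole-page invalidation. A minimal sketch of the new shape, assuming the era's PAGE_CACHE_SIZE unit; the function and helper names here are hypothetical:

static void example_invalidatepage(struct page *page, unsigned int offset,
				   unsigned int length)
{
	/* a partial invalidation must keep per-page private state;
	 * only the whole-page case may drop it */
	if (offset == 0 && length == PAGE_CACHE_SIZE)
		example_drop_private_state(page);	/* hypothetical helper */
}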
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index be1e34adc3c6..4d0c2e0be7e5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -101,16 +101,15 @@ static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 }
 
 /**
- * v9fs_dir_readdir - read a directory
- * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * v9fs_dir_readdir - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */
 
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	int over;
+	bool over;
 	struct p9_wstat st;
 	int err = 0;
 	struct p9_fid *fid;
@@ -118,19 +117,19 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int reclen = 0;
 	struct p9_rdir *rdir;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;
 
 	buflen = fid->clnt->msize - P9_IOHDRSZ;
 
-	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
 	if (!rdir)
 		return -ENOMEM;
 
 	while (1) {
 		if (rdir->tail == rdir->head) {
-			err = v9fs_file_readn(filp, rdir->buf, NULL,
-					      buflen, filp->f_pos);
+			err = v9fs_file_readn(file, rdir->buf, NULL,
+					      buflen, ctx->pos);
 			if (err <= 0)
 				return err;
 
@@ -148,51 +147,45 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			}
 			reclen = st.size+2;
 
-			over = filldir(dirent, st.name, strlen(st.name),
-				       filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
-
+			over = !dir_emit(ctx, st.name, strlen(st.name),
+					 v9fs_qid2ino(&st.qid), dt_type(&st));
 			p9stat_free(&st);
-
 			if (over)
 				return 0;
 
 			rdir->head += reclen;
-			filp->f_pos += reclen;
+			ctx->pos += reclen;
 		}
 	}
 }
 
 /**
- * v9fs_dir_readdir_dotl - read a directory
- * @filp: opened file structure
- * @dirent: buffer to fill dirent structures
- * @filldir: function to populate dirent structures
+ * v9fs_dir_readdir_dotl - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */
-static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
-				 filldir_t filldir)
+static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
 {
-	int over;
 	int err = 0;
 	struct p9_fid *fid;
 	int buflen;
 	struct p9_rdir *rdir;
 	struct p9_dirent curdirent;
-	u64 oldoffset = 0;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;
 
 	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
 
-	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
 	if (!rdir)
 		return -ENOMEM;
 
 	while (1) {
 		if (rdir->tail == rdir->head) {
 			err = p9_client_readdir(fid, rdir->buf, buflen,
-						filp->f_pos);
+						ctx->pos);
 			if (err <= 0)
 				return err;
 
@@ -210,22 +203,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 			return -EIO;
 		}
 
-		/* d_off in dirent structure tracks the offset into
-		 * the next dirent in the dir. However, filldir()
-		 * expects offset into the current dirent. Hence
-		 * while calling filldir send the offset from the
-		 * previous dirent structure.
-		 */
-		over = filldir(dirent, curdirent.d_name,
-			       strlen(curdirent.d_name),
-			       oldoffset, v9fs_qid2ino(&curdirent.qid),
-			       curdirent.d_type);
-		oldoffset = curdirent.d_off;
-
-		if (over)
+		if (!dir_emit(ctx, curdirent.d_name,
+			      strlen(curdirent.d_name),
+			      v9fs_qid2ino(&curdirent.qid),
+			      curdirent.d_type))
 			return 0;
 
-		filp->f_pos = curdirent.d_off;
+		ctx->pos = curdirent.d_off;
 		rdir->head += err;
 	}
 }
@@ -254,7 +238,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 const struct file_operations v9fs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir,
+	.iterate = v9fs_dir_readdir,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
@@ -262,7 +246,7 @@ const struct file_operations v9fs_dir_operations = {
 const struct file_operations v9fs_dir_operations_dotl = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir_dotl,
+	.iterate = v9fs_dir_readdir_dotl,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 	.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 9cf874ce8336..0d138c0de293 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -17,47 +17,43 @@
 static DEFINE_RWLOCK(adfs_dir_lock);
 
 static int
-adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+adfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
 	struct object_info obj;
 	struct adfs_dir dir;
 	int ret = 0;
 
-	if (filp->f_pos >> 32)
-		goto out;
+	if (ctx->pos >> 32)
+		return 0;
 
 	ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
 	if (ret)
-		goto out;
+		return ret;
 
-	switch ((unsigned long)filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
 			goto free_out;
-		filp->f_pos += 1;
-
-	case 1:
-		if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0)
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
 			goto free_out;
-		filp->f_pos += 1;
-
-	default:
-		break;
+		ctx->pos = 2;
 	}
 
 	read_lock(&adfs_dir_lock);
 
-	ret = ops->setpos(&dir, filp->f_pos - 2);
+	ret = ops->setpos(&dir, ctx->pos - 2);
 	if (ret)
 		goto unlock_out;
 	while (ops->getnext(&dir, &obj) == 0) {
-		if (filldir(dirent, obj.name, obj.name_len,
-			    filp->f_pos, obj.file_id, DT_UNKNOWN) < 0)
-			goto unlock_out;
-		filp->f_pos += 1;
+		if (!dir_emit(ctx, obj.name, obj.name_len,
+			      obj.file_id, DT_UNKNOWN))
+			break;
+		ctx->pos++;
 	}
 
 unlock_out:
@@ -65,8 +61,6 @@ unlock_out:
 
 free_out:
 	ops->free(&dir);
-
-out:
 	return ret;
 }
 
@@ -192,13 +186,12 @@ out:
 const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
-	.readdir	= adfs_readdir,
+	.iterate	= adfs_readdir,
 	.fsync		= generic_file_fsync,
 };
 
 static int
-adfs_hash(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr)
+adfs_hash(const struct dentry *parent, struct qstr *qstr)
 {
 	const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
 	const unsigned char *name;
@@ -234,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode,
  * requirements of the underlying filesystem.
  */
 static int
-adfs_compare(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+adfs_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i;
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index fd11a6d608ee..f1eba8c3644e 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -15,12 +15,12 @@
 
 #include "affs.h"
 
-static int affs_readdir(struct file *, void *, filldir_t);
+static int affs_readdir(struct file *, struct dir_context *);
 
 const struct file_operations affs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
-	.readdir	= affs_readdir,
+	.iterate	= affs_readdir,
 	.fsync		= affs_file_fsync,
 };
 
@@ -40,52 +40,35 @@ const struct inode_operations affs_dir_inode_operations = {
 };
 
 static int
-affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+affs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	struct buffer_head *dir_bh;
-	struct buffer_head *fh_bh;
+	struct buffer_head *dir_bh = NULL;
+	struct buffer_head *fh_bh = NULL;
 	unsigned char *name;
 	int namelen;
 	u32 i;
 	int hash_pos;
 	int chain_pos;
-	u32 f_pos;
 	u32 ino;
-	int stored;
-	int res;
 
-	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos);
+	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
 
-	stored = 0;
-	res = -EIO;
-	dir_bh = NULL;
-	fh_bh = NULL;
-	f_pos = filp->f_pos;
-
-	if (f_pos == 0) {
-		filp->private_data = (void *)0;
-		if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos < 2) {
+		file->private_data = (void *)0;
+		if (!dir_emit_dots(file, ctx))
 			return 0;
-		filp->f_pos = f_pos = 1;
-		stored++;
-	}
-	if (f_pos == 1) {
-		if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0)
-			return stored;
-		filp->f_pos = f_pos = 2;
-		stored++;
 	}
 
 	affs_lock_dir(inode);
-	chain_pos = (f_pos - 2) & 0xffff;
-	hash_pos = (f_pos - 2) >> 16;
+	chain_pos = (ctx->pos - 2) & 0xffff;
+	hash_pos = (ctx->pos - 2) >> 16;
 	if (chain_pos == 0xffff) {
 		affs_warning(sb, "readdir", "More than 65535 entries in chain");
 		chain_pos = 0;
 		hash_pos++;
-		filp->f_pos = ((hash_pos << 16) | chain_pos) + 2;
+		ctx->pos = ((hash_pos << 16) | chain_pos) + 2;
 	}
 	dir_bh = affs_bread(sb, inode->i_ino);
 	if (!dir_bh)
@@ -94,8 +77,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* If the directory hasn't changed since the last call to readdir(),
	 * we can jump directly to where we left off.
	 */
-	ino = (u32)(long)filp->private_data;
-	if (ino && filp->f_version == inode->i_version) {
+	ino = (u32)(long)file->private_data;
+	if (ino && file->f_version == inode->i_version) {
 		pr_debug("AFFS: readdir() left off=%d\n", ino);
 		goto inside;
 	}
@@ -105,7 +88,7 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		fh_bh = affs_bread(sb, ino);
 		if (!fh_bh) {
 			affs_error(sb, "readdir","Cannot read block %d", i);
-			goto readdir_out;
+			return -EIO;
 		}
 		ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 		affs_brelse(fh_bh);
@@ -119,38 +102,34 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
 		if (!ino)
 			continue;
-		f_pos = (hash_pos << 16) + 2;
+		ctx->pos = (hash_pos << 16) + 2;
 inside:
 		do {
 			fh_bh = affs_bread(sb, ino);
 			if (!fh_bh) {
 				affs_error(sb, "readdir","Cannot read block %d", ino);
-				goto readdir_done;
+				break;
 			}
 
 			namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
 			name = AFFS_TAIL(sb, fh_bh)->name + 1;
 			pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
-				 namelen, name, ino, hash_pos, f_pos);
-			if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0)
+				 namelen, name, ino, hash_pos, (u32)ctx->pos);
+			if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
 				goto readdir_done;
-			stored++;
-			f_pos++;
+			ctx->pos++;
 			ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 			affs_brelse(fh_bh);
 			fh_bh = NULL;
 		} while (ino);
 	}
readdir_done:
-	filp->f_pos = f_pos;
-	filp->f_version = inode->i_version;
-	filp->private_data = (void *)(long)ino;
-	res = stored;
+	file->f_version = inode->i_version;
+	file->private_data = (void *)(long)ino;
 
readdir_out:
 	affs_brelse(dir_bh);
 	affs_brelse(fh_bh);
 	affs_unlock_dir(inode);
-	pr_debug("AFFS: readdir()=%d\n", stored);
-	return res;
+	return 0;
 }
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index ff65884a7839..c36cbb4537a2 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,12 @@
 typedef int (*toupper_t)(int);
 
 static int affs_toupper(int ch);
-static int affs_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int affs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int affs_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 static int affs_intl_toupper(int ch);
-static int affs_intl_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int affs_intl_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int affs_intl_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 const struct dentry_operations affs_dentry_operations = {
@@ -86,14 +80,12 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 }
 
 static int
-affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_toupper);
 }
 static int
-affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_intl_toupper);
 }
@@ -131,15 +123,13 @@ static inline int __affs_compare_dentry(unsigned int len,
 }
 
 static int
-affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_toupper);
 }
 static int
-affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 7a465ed04444..34494fbead0a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -22,7 +22,7 @@
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 				 unsigned int flags);
 static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, struct dir_context *ctx);
 static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
 static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_release(struct dentry *dentry);
@@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
 	.release	= afs_release,
-	.readdir	= afs_readdir,
+	.iterate	= afs_readdir,
 	.lock		= afs_lock,
 	.llseek		= generic_file_llseek,
 };
@@ -119,9 +119,9 @@ struct afs_dir_page {
 };
 
 struct afs_lookup_cookie {
+	struct dir_context ctx;
 	struct afs_fid fid;
-	const char *name;
-	size_t nlen;
+	struct qstr name;
 	int found;
 };
 
@@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file)
 /*
  * deal with one block in an AFS directory
  */
-static int afs_dir_iterate_block(unsigned *fpos,
+static int afs_dir_iterate_block(struct dir_context *ctx,
 				 union afs_dir_block *block,
-				 unsigned blkoff,
-				 void *cookie,
-				 filldir_t filldir)
+				 unsigned blkoff)
 {
 	union afs_dirent *dire;
 	unsigned offset, next, curr;
 	size_t nlen;
-	int tmp, ret;
+	int tmp;
 
-	_enter("%u,%x,%p,,",*fpos,blkoff,block);
+	_enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
 
-	curr = (*fpos - blkoff) / sizeof(union afs_dirent);
+	curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);
 
 	/* walk through the block, an entry at a time */
 	for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
@@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			_debug("ENT[%Zu.%u]: unused",
 			       blkoff / sizeof(union afs_dir_block), offset);
 			if (offset >= curr)
-				*fpos = blkoff +
+				ctx->pos = blkoff +
 					next * sizeof(union afs_dirent);
 			continue;
 		}
@@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			continue;
 
 		/* found the next entry */
-		ret = filldir(cookie,
-			      dire->u.name,
-			      nlen,
-			      blkoff + offset * sizeof(union afs_dirent),
+		if (!dir_emit(ctx, dire->u.name, nlen,
 			      ntohl(dire->u.vnode),
-			      filldir == afs_lookup_filldir ?
-			      ntohl(dire->u.unique) : DT_UNKNOWN);
-		if (ret < 0) {
+			      ctx->actor == afs_lookup_filldir ?
+			      ntohl(dire->u.unique) : DT_UNKNOWN)) {
 			_leave(" = 0 [full]");
 			return 0;
 		}
 
-		*fpos = blkoff + next * sizeof(union afs_dirent);
+		ctx->pos = blkoff + next * sizeof(union afs_dirent);
 	}
 
 	_leave(" = 1 [more]");
@@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos,
 /*
  * iterate through the data blob that lists the contents of an AFS directory
  */
-static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
-			   filldir_t filldir, struct key *key)
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
			   struct key *key)
 {
 	union afs_dir_block *dblock;
 	struct afs_dir_page *dbuf;
@@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 	unsigned blkoff, limit;
 	int ret;
 
-	_enter("{%lu},%u,,", dir->i_ino, *fpos);
+	_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
 
 	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
 		_leave(" = -ESTALE");
@@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 	}
 
 	/* round the file position up to the next entry boundary */
-	*fpos += sizeof(union afs_dirent) - 1;
-	*fpos &= ~(sizeof(union afs_dirent) - 1);
+	ctx->pos += sizeof(union afs_dirent) - 1;
+	ctx->pos &= ~(sizeof(union afs_dirent) - 1);
 
 	/* walk through the blocks in sequence */
 	ret = 0;
-	while (*fpos < dir->i_size) {
-		blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
+	while (ctx->pos < dir->i_size) {
+		blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);
 
 		/* fetch the appropriate page from the directory */
 		page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
@@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 		do {
 			dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
 					       sizeof(union afs_dir_block)];
-			ret = afs_dir_iterate_block(fpos, dblock, blkoff,
-						    cookie, filldir);
+			ret = afs_dir_iterate_block(ctx, dblock, blkoff);
 			if (ret != 1) {
 				afs_dir_put_page(page);
 				goto out;
@@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 
 		blkoff += sizeof(union afs_dir_block);
 
-	} while (*fpos < dir->i_size && blkoff < limit);
+	} while (ctx->pos < dir->i_size && blkoff < limit);
 
 	afs_dir_put_page(page);
 	ret = 0;
@@ -387,23 +380,10 @@ out:
 /*
  * read an AFS directory
  */
-static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned fpos;
-	int ret;
-
-	_enter("{%Ld,{%lu}}",
-	       file->f_pos, file_inode(file)->i_ino);
-
-	ASSERT(file->private_data != NULL);
-
-	fpos = file->f_pos;
-	ret = afs_dir_iterate(file_inode(file), &fpos,
-			      cookie, filldir, file->private_data);
-	file->f_pos = fpos;
-
-	_leave(" = %d", ret);
-	return ret;
+	return afs_dir_iterate(file_inode(file),
			       ctx, file->private_data);
 }
 
 /*
@@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 {
 	struct afs_lookup_cookie *cookie = _cookie;
 
-	_enter("{%s,%Zu},%s,%u,,%llu,%u",
-	       cookie->name, cookie->nlen, name, nlen,
+	_enter("{%s,%u},%s,%u,,%llu,%u",
+	       cookie->name.name, cookie->name.len, name, nlen,
	       (unsigned long long) ino, dtype);
 
 	/* insanity checks first */
 	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
 	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
-	if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
+	if (cookie->name.len != nlen ||
+	    memcmp(cookie->name.name, name, nlen) != 0) {
 		_leave(" = 0 [no]");
 		return 0;
 	}
@@ -444,24 +425,18 @@
 static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
 			 struct afs_fid *fid, struct key *key)
 {
-	struct afs_lookup_cookie cookie;
-	struct afs_super_info *as;
-	unsigned fpos;
+	struct afs_super_info *as = dir->i_sb->s_fs_info;
+	struct afs_lookup_cookie cookie = {
+		.ctx.actor = afs_lookup_filldir,
+		.name = dentry->d_name,
+		.fid.vid = as->volume->vid
+	};
 	int ret;
 
 	_enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
 
-	as = dir->i_sb->s_fs_info;
-
 	/* search the directory */
-	cookie.name = dentry->d_name.name;
-	cookie.nlen = dentry->d_name.len;
-	cookie.fid.vid = as->volume->vid;
-	cookie.found = 0;
-
-	fpos = 0;
-	ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
-			      key);
+	ret = afs_dir_iterate(dir, &cookie.ctx, key);
 	if (ret < 0) {
 		_leave(" = %d [iter]", ret);
 		return ret;
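The afs lookup rework above illustrates the idiom in-kernel directory scans adopt with the new API: embed struct dir_context as the first member of a private cookie, point .ctx.actor at a private actor, and let the actor cast the context pointer back to the cookie. A sketch of the pattern under that assumption (names are illustrative, not afs's; the actor keeps the era's filldir-style signature, returning non-zero to stop the walk):

struct example_lookup_cookie {
	struct dir_context ctx;		/* must be first for the cast below */
	struct qstr name;
	bool found;
};

static int example_lookup_actor(void *_cookie, const char *name, int nlen,
				loff_t pos, u64 ino, unsigned dtype)
{
	struct example_lookup_cookie *cookie = _cookie;

	if (cookie->name.len != nlen ||
	    memcmp(cookie->name.name, name, nlen) != 0)
		return 0;	/* not ours, keep scanning */
	cookie->found = true;
	return -1;		/* non-zero stops the iteration */
}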
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d565..66d50fe2ee45 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
  * - release a page and clean up its private data if offset is 0 (indicating
  *   the entire page)
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length)
 {
 	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter("{%lu},%u,%u", page->index, offset, length);
 
 	BUG_ON(!PageLocked(page));
 
 	/* we clean up only if the entire page is being invalidated */
-	if (offset == 0) {
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
 #ifdef CONFIG_AFS_FSCACHE
 	if (PageFsCache(page)) {
 		struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 2497bf306c70..a8cf2cff836c 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
  */
 static int afs_do_setlk(struct file *file, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct inode *inode = file_inode(file);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_lock_type_t type;
 	struct key *key = file->private_data;
 	int ret;
@@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	/* make sure we've got a callback on this file and that our view of the
	 * data version is up to date */
@@ -420,7 +421,7 @@ given_lock:
 	afs_vnode_fetch_status(vnode, NULL, key);
 
 error:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	_leave(" = %d", ret);
 	return ret;
 
diff --git a/fs/aio.c b/fs/aio.c
index c5b1a8c10411..a8ecc8313fb0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -39,6 +39,8 @@
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
 
+#include "internal.h"
+
 #define AIO_RING_MAGIC			0xa10a10a1
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
@@ -141,9 +143,6 @@ static void aio_free_ring(struct kioctx *ctx)
 	for (i = 0; i < ctx->nr_pages; i++)
 		put_page(ctx->ring_pages[i]);
 
-	if (ctx->mmap_size)
-		vm_munmap(ctx->mmap_base, ctx->mmap_size);
-
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
 		kfree(ctx->ring_pages);
 }
@@ -307,7 +306,9 @@ static void free_ioctx(struct kioctx *ctx)
 	kunmap_atomic(ring);
 
 	while (atomic_read(&ctx->reqs_active) > 0) {
-		wait_event(ctx->wait, head != ctx->tail);
+		wait_event(ctx->wait,
+			   head != ctx->tail ||
+			   atomic_read(&ctx->reqs_active) <= 0);
 
 		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
 
@@ -320,11 +321,6 @@ static void free_ioctx(struct kioctx *ctx)
 
 	aio_free_ring(ctx);
 
-	spin_lock(&aio_nr_lock);
-	BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
-	aio_nr -= ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
-
 	pr_debug("freeing %p\n", ctx);
 
 	/*
@@ -433,17 +429,24 @@ static void kill_ioctx(struct kioctx *ctx)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
 		hlist_del_rcu(&ctx->list);
-		/* Between hlist_del_rcu() and dropping the initial ref */
-		synchronize_rcu();
 
 		/*
-		 * We can't punt to workqueue here because put_ioctx() ->
-		 * free_ioctx() will unmap the ringbuffer, and that has to be
-		 * done in the original process's context. kill_ioctx_rcu/work()
-		 * exist for exit_aio(), as in that path free_ioctx() won't do
-		 * the unmap.
+		 * It'd be more correct to do this in free_ioctx(), after all
+		 * the outstanding kiocbs have finished - but by then io_destroy
+		 * has already returned, so io_setup() could potentially return
+		 * -EAGAIN with no ioctxs actually in use (as far as userspace
+		 * could tell).
 		 */
-		kill_ioctx_work(&ctx->rcu_work);
+		spin_lock(&aio_nr_lock);
+		BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
+		aio_nr -= ctx->max_reqs;
+		spin_unlock(&aio_nr_lock);
+
+		if (ctx->mmap_size)
+			vm_munmap(ctx->mmap_base, ctx->mmap_size);
+
+		/* Between hlist_del_rcu() and dropping the initial ref */
+		call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
 	}
 }
 
@@ -493,10 +496,7 @@ void exit_aio(struct mm_struct *mm)
 		 */
 		ctx->mmap_size = 0;
 
-		if (!atomic_xchg(&ctx->dead, 1)) {
-			hlist_del_rcu(&ctx->list);
-			call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
-		}
+		kill_ioctx(ctx);
 	}
 }
 
@@ -1299,8 +1299,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
  *	< min_nr if the timeout specified by timeout has elapsed
  *	before sufficient events are available, where timeout == NULL
  *	specifies an infinite timeout. Note that the timeout pointed to by
- *	timeout is relative and will be updated if not NULL and the
- *	operation blocks. Will fail with -ENOSYS if not implemented.
+ *	timeout is relative. Will fail with -ENOSYS if not implemented.
  */
 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		long, min_nr,
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 085da86e07c2..ca8e55548d98 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 	.unlocked_ioctl	= autofs4_root_ioctl,
 #ifdef CONFIG_COMPAT
@@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = {
 	.open		= autofs4_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 };
 
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 922ad460bff9..7c93953030fb 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return -EIO;
 }
 
-static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int bad_file_readdir(struct file *file, struct dir_context *ctx)
 {
 	return -EIO;
 }
@@ -152,7 +152,7 @@ static const struct file_operations bad_file_ops =
 	.write		= bad_file_write,
 	.aio_read	= bad_file_aio_read,
 	.aio_write	= bad_file_aio_write,
-	.readdir	= bad_file_readdir,
+	.iterate	= bad_file_readdir,
 	.poll		= bad_file_poll,
 	.unlocked_ioctl	= bad_file_unlocked_ioctl,
 	.compat_ioctl	= bad_file_compat_ioctl,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 8615ee89ab55..e9c75e20db32 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -31,7 +31,7 @@ MODULE_LICENSE("GPL");
 /* The units the vfs expects inode->i_blocks to be in */
 #define VFS_BLOCK_SIZE 512
 
-static int befs_readdir(struct file *, void *, filldir_t);
+static int befs_readdir(struct file *, struct dir_context *);
 static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 static int befs_readpage(struct file *file, struct page *page);
 static sector_t befs_bmap(struct address_space *mapping, sector_t block);
@@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
 
 static const struct file_operations befs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= befs_readdir,
+	.iterate	= befs_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -211,9 +211,9 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 }
 
 static int
-befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+befs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
 	befs_off_t value;
@@ -221,15 +221,14 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	size_t keysize;
 	unsigned char d_type;
 	char keybuf[BEFS_NAME_LEN + 1];
-	char *nlsname;
-	int nlsnamelen;
-	const char *dirname = filp->f_path.dentry->d_name.name;
+	const char *dirname = file->f_path.dentry->d_name.name;
 
 	befs_debug(sb, "---> befs_readdir() "
-		   "name %s, inode %ld, filp->f_pos %Ld",
-		   dirname, inode->i_ino, filp->f_pos);
+		   "name %s, inode %ld, ctx->pos %Ld",
+		   dirname, inode->i_ino, ctx->pos);
 
-	result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1,
+more:
+	result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
 				 keybuf, &keysize, &value);
 
 	if (result == BEFS_ERR) {
@@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	/* Convert to NLS */
 	if (BEFS_SB(sb)->nls) {
+		char *nlsname;
+		int nlsnamelen;
 		result =
 		    befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
 		if (result < 0) {
 			befs_debug(sb, "<--- befs_readdir() ERROR");
 			return result;
 		}
-		result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, nlsname, nlsnamelen,
+			      (ino_t) value, d_type)) {
+			kfree(nlsname);
+			return 0;
+		}
 		kfree(nlsname);
-
 	} else {
-		result = filldir(dirent, keybuf, keysize, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, keybuf, keysize,
+			      (ino_t) value, d_type))
+			return 0;
 	}
+	ctx->pos++;
+	goto more;
 
-	filp->f_pos++;
-
-	befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos);
+	befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos);
 
 	return 0;
 }
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 3f422f6bb5ca..a399e6d9dc74 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -26,58 +26,51 @@ static struct buffer_head *bfs_find_entry(struct inode *dir,
 			const unsigned char *name, int namelen,
 			struct bfs_dirent **res_dir);
 
-static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, struct dir_context *ctx)
 {
 	struct inode *dir = file_inode(f);
 	struct buffer_head *bh;
 	struct bfs_dirent *de;
-	struct bfs_sb_info *info = BFS_SB(dir->i_sb);
 	unsigned int offset;
 	int block;
 
-	mutex_lock(&info->bfs_lock);
-
-	if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
+	if (ctx->pos & (BFS_DIRENT_SIZE - 1)) {
 		printf("Bad f_pos=%08lx for %s:%08lx\n",
-					(unsigned long)f->f_pos,
+					(unsigned long)ctx->pos,
 					dir->i_sb->s_id, dir->i_ino);
-		mutex_unlock(&info->bfs_lock);
-		return -EBADF;
+		return -EINVAL;
 	}
 
-	while (f->f_pos < dir->i_size) {
-		offset = f->f_pos & (BFS_BSIZE - 1);
-		block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
+	while (ctx->pos < dir->i_size) {
+		offset = ctx->pos & (BFS_BSIZE - 1);
+		block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS);
 		bh = sb_bread(dir->i_sb, block);
 		if (!bh) {
-			f->f_pos += BFS_BSIZE - offset;
+			ctx->pos += BFS_BSIZE - offset;
 			continue;
 		}
 		do {
 			de = (struct bfs_dirent *)(bh->b_data + offset);
 			if (de->ino) {
 				int size = strnlen(de->name, BFS_NAMELEN);
-				if (filldir(dirent, de->name, size, f->f_pos,
+				if (!dir_emit(ctx, de->name, size,
 						le16_to_cpu(de->ino),
-						DT_UNKNOWN) < 0) {
+						DT_UNKNOWN)) {
 					brelse(bh);
-					mutex_unlock(&info->bfs_lock);
 					return 0;
 				}
 			}
 			offset += BFS_DIRENT_SIZE;
-			f->f_pos += BFS_DIRENT_SIZE;
-		} while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
+			ctx->pos += BFS_DIRENT_SIZE;
+		} while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size));
 		brelse(bh);
 	}
-
-	mutex_unlock(&info->bfs_lock);
-	return 0;
+	return 0;
 }
 
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= bfs_readdir,
+	.iterate	= bfs_readdir,
 	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };
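
The shape of the readdir-to-iterate conversion above recurs in every filesystem touched by this series: the (file, dirent, filldir) triple becomes a single struct dir_context, the filldir callback is replaced by dir_emit(), which returns false once the getdents buffer is full, and position bookkeeping moves from f_pos to ctx->pos. A minimal userspace model of that contract — the struct and helpers here are mocks with simplified types, not the kernel's definitions:

#include <stdbool.h>
#include <stdio.h>

/* Userspace mock of the dir_context contract: actor() returns 0 to keep
 * going, nonzero to stop, and dir_emit() folds that into a bool. */
struct dir_context {
	int (*actor)(struct dir_context *, const char *, int,
		     long long, unsigned long, unsigned);
	long long pos;
};

static bool dir_emit(struct dir_context *ctx, const char *name, int namelen,
		     unsigned long ino, unsigned type)
{
	return ctx->actor(ctx, name, namelen, ctx->pos, ino, type) == 0;
}

static const char *names[] = { "a", "b", "c" };

/* Shape of a converted ->iterate(): emit the entry at ctx->pos, advance
 * ctx->pos, and return 0 as soon as dir_emit() reports a full buffer. */
static int demo_iterate(struct dir_context *ctx)
{
	while (ctx->pos < 3) {
		if (!dir_emit(ctx, names[ctx->pos], 1, 100 + ctx->pos, 0))
			return 0;
		ctx->pos++;
	}
	return 0;
}

static int print_actor(struct dir_context *ctx, const char *name, int len,
		       long long pos, unsigned long ino, unsigned type)
{
	printf("pos=%lld ino=%lu %.*s\n", pos, ino, len, name);
	return 0;
}

int main(void)
{
	struct dir_context ctx = { .actor = print_actor, .pos = 0 };
	return demo_iterate(&ctx);
}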
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2091db8cdd78..431b6a04ebfd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -325,31 +325,10 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
 static loff_t block_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *bd_inode = file->f_mapping->host;
-	loff_t size;
 	loff_t retval;
 
 	mutex_lock(&bd_inode->i_mutex);
-	size = i_size_read(bd_inode);
-
-	retval = -EINVAL;
-	switch (whence) {
-		case SEEK_END:
-			offset += size;
-			break;
-		case SEEK_CUR:
-			offset += file->f_pos;
-		case SEEK_SET:
-			break;
-		default:
-			goto out;
-	}
-	if (offset >= 0 && offset <= size) {
-		if (offset != file->f_pos) {
-			file->f_pos = offset;
-		}
-		retval = offset;
-	}
-out:
+	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
 	mutex_unlock(&bd_inode->i_mutex);
 	return retval;
 }
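
block_llseek() now delegates to fixed_size_llseek(), which folds exactly the whence resolution and [0, size] range check deleted above into a shared VFS helper. A rough userspace model of that arithmetic, assuming the semantics of the removed code (names and the error value are stand-ins):

#include <stdio.h>

/* Rough model of fixed_size_llseek(): resolve the whence base, then accept
 * the result only if it stays within [0, eof]; *pos stands in for f_pos. */
static long long demo_llseek(long long *pos, long long offset,
			     int whence, long long eof)
{
	switch (whence) {
	case SEEK_END:
		offset += eof;
		break;
	case SEEK_CUR:
		offset += *pos;
		break;
	case SEEK_SET:
		break;
	default:
		return -22;		/* the kernel would return -EINVAL */
	}
	if (offset < 0 || offset > eof)
		return -22;
	*pos = offset;
	return offset;
}

int main(void)
{
	long long pos = 0;
	printf("%lld\n", demo_llseek(&pos, -512, SEEK_END, 4096)); /* 3584 */
	printf("%lld\n", demo_llseek(&pos, 8192, SEEK_SET, 4096)); /* -22  */
	return 0;
}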
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b4fb41558111..290e347b6db3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -918,7 +918,8 @@ again:
 					     ref->parent, bsz, 0);
 			if (!eb || !extent_buffer_uptodate(eb)) {
 				free_extent_buffer(eb);
-				return -EIO;
+				ret = -EIO;
+				goto out;
 			}
 			ret = find_extent_in_eb(eb, bytenr,
 						*extent_item_pos, &eie);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 18af6f48781a..1431a6965017 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1700,7 +1700,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 	unsigned int j;
 	DECLARE_COMPLETION_ONSTACK(complete);
 
-	bio = bio_alloc(GFP_NOFS, num_pages - i);
+	bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
 	if (!bio) {
 		printk(KERN_INFO
 		       "btrfsic: bio_alloc() for %u pages failed!\n",
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index de6de8e60b46..02fae7f7e42c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -951,10 +951,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 		BUG_ON(ret); /* -ENOMEM */
 	}
 	if (new_flags != 0) {
+		int level = btrfs_header_level(buf);
+
 		ret = btrfs_set_disk_extent_flags(trans, root,
 						  buf->start,
 						  buf->len,
-						  new_flags, 0);
+						  new_flags, level, 0);
 		if (ret)
 			return ret;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 63c328a9ce95..d6dd49b51ba8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -88,12 +88,12 @@ struct btrfs_ordered_sum;
 /* holds checksums of all the data extents */
 #define BTRFS_CSUM_TREE_OBJECTID 7ULL
 
-/* for storing balance parameters in the root tree */
-#define BTRFS_BALANCE_OBJECTID -4ULL
-
 /* holds quota configuration and tracking */
 #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
 
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -3075,7 +3075,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data);
+				int level, int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f26f38ccd194..eb34438ddedb 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1681,8 +1681,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
  * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
  *
  */
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
-				    filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 				    struct list_head *ins_list)
 {
 	struct btrfs_dir_item *di;
@@ -1704,13 +1703,13 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
 	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
 		list_del(&curr->readdir_list);
 
-		if (curr->key.offset < filp->f_pos) {
+		if (curr->key.offset < ctx->pos) {
 			if (atomic_dec_and_test(&curr->refs))
 				kfree(curr);
 			continue;
 		}
 
-		filp->f_pos = curr->key.offset;
+		ctx->pos = curr->key.offset;
 
 		di = (struct btrfs_dir_item *)curr->data;
 		name = (char *)(di + 1);
@@ -1719,7 +1718,7 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
 		d_type = btrfs_filetype_table[di->type];
 		btrfs_disk_key_to_cpu(&location, &di->location);
 
-		over = filldir(dirent, name, name_len, curr->key.offset,
-			       location.objectid, d_type);
+		over = !dir_emit(ctx, name, name_len,
+			       location.objectid, d_type);
 
 		if (atomic_dec_and_test(&curr->refs))
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1d5c5f7abe3e..a4b38f934d14 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -139,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
 			     struct list_head *del_list);
 int btrfs_should_delete_dir_index(struct list_head *del_list,
 				  u64 index);
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
-				    filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 				    struct list_head *ins_list);
 
 /* for init */
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f75fcaf79aeb..70b962cc177d 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -60,6 +60,7 @@ struct btrfs_delayed_ref_node {
 struct btrfs_delayed_extent_op {
 	struct btrfs_disk_key key;
 	u64 flags_to_set;
+	int level;
 	unsigned int update_key:1;
 	unsigned int update_flags:1;
 	unsigned int is_data:1;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7ba7b3900cb8..65241f32d3f8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -313,6 +313,11 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	struct btrfs_device *tgt_device = NULL;
 	struct btrfs_device *src_device = NULL;
 
+	if (btrfs_fs_incompat(fs_info, RAID56)) {
+		pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n");
+		return -EINVAL;
+	}
+
 	switch (args->start.cont_reading_from_srcdev_mode) {
 	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
 	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
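
btrfs_fs_incompat() is a bitmask test against the superblock's incompat feature flags, so the new guard refuses to start a device replace on a RAID5/6 filesystem before any state is touched. A freestanding sketch of that gating pattern — the flag value and struct here are illustrative, not the on-disk constants:

#include <stdint.h>
#include <stdio.h>

/* Illustrative feature bit; the real BTRFS_FEATURE_INCOMPAT_* values
 * live in the on-disk format headers. */
#define FEAT_RAID56	(1ULL << 7)

struct demo_super { uint64_t incompat_flags; };

static int fs_incompat(const struct demo_super *sb, uint64_t flag)
{
	return (sb->incompat_flags & flag) != 0;
}

int main(void)
{
	struct demo_super sb = { .incompat_flags = FEAT_RAID56 };

	/* Mirrors the dev_replace_start gate: bail out early when the
	 * filesystem uses a feature the operation cannot handle yet. */
	if (fs_incompat(&sb, FEAT_RAID56)) {
		fprintf(stderr, "dev_replace cannot yet handle RAID5/RAID6\n");
		return 1;
	}
	return 0;
}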
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4e9ebe1f1827..b0292b3ead54 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -152,7 +152,7 @@ static struct btrfs_lockdep_keyset {
 	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
 	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
 	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
-	{ .id = BTRFS_ORPHAN_OBJECTID,		.name_stem = "orphan"	},
+	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	.name_stem = "quota"	},
 	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
 	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
 	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
@@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	return try_release_extent_buffer(page);
 }
 
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1513,7 +1514,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	}
 
 	root->commit_root = btrfs_root_node(root);
-	BUG_ON(!root->node); /* -ENOMEM */
 out:
 	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
 		root->ref_cows = 1;
@@ -1988,30 +1988,33 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 {
 	free_extent_buffer(info->tree_root->node);
 	free_extent_buffer(info->tree_root->commit_root);
-	free_extent_buffer(info->dev_root->node);
-	free_extent_buffer(info->dev_root->commit_root);
-	free_extent_buffer(info->extent_root->node);
-	free_extent_buffer(info->extent_root->commit_root);
-	free_extent_buffer(info->csum_root->node);
-	free_extent_buffer(info->csum_root->commit_root);
-	if (info->quota_root) {
-		free_extent_buffer(info->quota_root->node);
-		free_extent_buffer(info->quota_root->commit_root);
-	}
-
 	info->tree_root->node = NULL;
 	info->tree_root->commit_root = NULL;
-	info->dev_root->node = NULL;
-	info->dev_root->commit_root = NULL;
-	info->extent_root->node = NULL;
-	info->extent_root->commit_root = NULL;
-	info->csum_root->node = NULL;
-	info->csum_root->commit_root = NULL;
+
+	if (info->dev_root) {
+		free_extent_buffer(info->dev_root->node);
+		free_extent_buffer(info->dev_root->commit_root);
+		info->dev_root->node = NULL;
+		info->dev_root->commit_root = NULL;
+	}
+	if (info->extent_root) {
+		free_extent_buffer(info->extent_root->node);
+		free_extent_buffer(info->extent_root->commit_root);
+		info->extent_root->node = NULL;
+		info->extent_root->commit_root = NULL;
+	}
+	if (info->csum_root) {
+		free_extent_buffer(info->csum_root->node);
+		free_extent_buffer(info->csum_root->commit_root);
+		info->csum_root->node = NULL;
+		info->csum_root->commit_root = NULL;
+	}
 	if (info->quota_root) {
+		free_extent_buffer(info->quota_root->node);
+		free_extent_buffer(info->quota_root->commit_root);
 		info->quota_root->node = NULL;
 		info->quota_root->commit_root = NULL;
 	}
-
 	if (chunk_root) {
 		free_extent_buffer(info->chunk_root->node);
 		free_extent_buffer(info->chunk_root->commit_root);
@@ -2857,8 +2860,8 @@ fail_qgroup:
 	btrfs_free_qgroup_config(fs_info);
 fail_trans_kthread:
 	kthread_stop(fs_info->transaction_kthread);
-	del_fs_roots(fs_info);
 	btrfs_cleanup_transaction(fs_info->tree_root);
+	del_fs_roots(fs_info);
 fail_cleaner:
 	kthread_stop(fs_info->cleaner_kthread);
 
@@ -3128,7 +3131,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	 * caller
 	 */
 	device->flush_bio = NULL;
-	bio = bio_alloc(GFP_NOFS, 0);
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
 	if (!bio)
 		return -ENOMEM;
 
@@ -3510,15 +3513,15 @@ int close_ctree(struct btrfs_root *root)
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
-	free_root_pointers(fs_info, 1);
-
 	btrfs_free_block_groups(fs_info);
 
+	btrfs_stop_all_workers(fs_info);
+
 	del_fs_roots(fs_info);
 
-	iput(fs_info->btree_inode);
+	free_root_pointers(fs_info, 1);
 
-	btrfs_stop_all_workers(fs_info);
+	iput(fs_info->btree_inode);
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3659,8 +3662,11 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
 					 ordered_operations);
 
 		list_del_init(&btrfs_inode->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_extent_lock);
 
 		btrfs_invalidate_inodes(btrfs_inode->root);
+
+		spin_lock(&root->fs_info->ordered_extent_lock);
 	}
 
 	spin_unlock(&root->fs_info->ordered_extent_lock);
@@ -3782,8 +3788,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 		list_del_init(&btrfs_inode->delalloc_inodes);
 		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
 			  &btrfs_inode->runtime_flags);
+		spin_unlock(&root->fs_info->delalloc_lock);
 
 		btrfs_invalidate_inodes(btrfs_inode->root);
+
+		spin_lock(&root->fs_info->delalloc_lock);
 	}
 
 	spin_unlock(&root->fs_info->delalloc_lock);
@@ -3808,7 +3817,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 	while (start <= end) {
 		eb = btrfs_find_tree_block(root, start,
 					   root->leafsize);
-		start += eb->len;
+		start += root->leafsize;
 		if (!eb)
 			continue;
 		wait_on_extent_buffer_writeback(eb);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2305b5c5cf00..df472ab1b5ac 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2070,8 +2070,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 	u32 item_size;
 	int ret;
 	int err = 0;
-	int metadata = (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
-			node->type == BTRFS_SHARED_BLOCK_REF_KEY);
+	int metadata = !extent_op->is_data;
 
 	if (trans->aborted)
 		return 0;
@@ -2086,11 +2085,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 	key.objectid = node->bytenr;
 
 	if (metadata) {
-		struct btrfs_delayed_tree_ref *tree_ref;
-
-		tree_ref = btrfs_delayed_node_to_tree_ref(node);
 		key.type = BTRFS_METADATA_ITEM_KEY;
-		key.offset = tree_ref->level;
+		key.offset = extent_op->level;
 	} else {
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = node->num_bytes;
@@ -2719,7 +2715,7 @@ out:
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data)
+				int level, int is_data)
 {
 	struct btrfs_delayed_extent_op *extent_op;
 	int ret;
@@ -2732,6 +2728,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 	extent_op->update_flags = 1;
 	extent_op->update_key = 0;
 	extent_op->is_data = is_data ? 1 : 0;
+	extent_op->level = level;
 
 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
 					  num_bytes, extent_op);
@@ -3109,6 +3106,11 @@ again:
 	WARN_ON(ret);
 
 	if (i_size_read(inode) > 0) {
+		ret = btrfs_check_trunc_cache_free_space(root,
+					&root->fs_info->global_block_rsv);
+		if (ret)
+			goto out_put;
+
 		ret = btrfs_truncate_free_space_cache(root, trans, path,
 						      inode);
 		if (ret)
@@ -4562,6 +4564,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+	if (fs_info->quota_root)
+		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 
 	update_global_block_rsv(fs_info);
@@ -6651,51 +6655,51 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	int ret;
+	bool global_updated = false;
 
 	block_rsv = get_block_rsv(trans, root);
 
-	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize,
-					     BTRFS_RESERVE_NO_FLUSH);
-		/*
-		 * If we couldn't reserve metadata bytes try and use some from
-		 * the global reserve.
-		 */
-		if (ret && block_rsv != global_rsv) {
-			ret = block_rsv_use_bytes(global_rsv, blocksize);
-			if (!ret)
-				return global_rsv;
-			return ERR_PTR(ret);
-		} else if (ret) {
-			return ERR_PTR(ret);
-		}
+	if (unlikely(block_rsv->size == 0))
+		goto try_reserve;
+again:
+	ret = block_rsv_use_bytes(block_rsv, blocksize);
+	if (!ret)
 		return block_rsv;
+
+	if (block_rsv->failfast)
+		return ERR_PTR(ret);
+
+	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+		global_updated = true;
+		update_global_block_rsv(root->fs_info);
+		goto again;
 	}
 
-	ret = block_rsv_use_bytes(block_rsv, blocksize);
+	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+		static DEFINE_RATELIMIT_STATE(_rs,
+				DEFAULT_RATELIMIT_INTERVAL * 10,
+				/*DEFAULT_RATELIMIT_BURST*/ 1);
+		if (__ratelimit(&_rs))
+			WARN(1, KERN_DEBUG
+				"btrfs: block rsv returned %d\n", ret);
+	}
+try_reserve:
+	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+				     BTRFS_RESERVE_NO_FLUSH);
 	if (!ret)
 		return block_rsv;
-	if (ret && !block_rsv->failfast) {
-		if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
-			static DEFINE_RATELIMIT_STATE(_rs,
-					DEFAULT_RATELIMIT_INTERVAL * 10,
-					/*DEFAULT_RATELIMIT_BURST*/ 1);
-			if (__ratelimit(&_rs))
-				WARN(1, KERN_DEBUG
-					"btrfs: block rsv returned %d\n", ret);
-		}
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize,
-					     BTRFS_RESERVE_NO_FLUSH);
-		if (!ret) {
-			return block_rsv;
-		} else if (ret && block_rsv != global_rsv) {
-			ret = block_rsv_use_bytes(global_rsv, blocksize);
-			if (!ret)
-				return global_rsv;
-		}
+	/*
+	 * If we couldn't reserve metadata bytes try and use some from
+	 * the global reserve if its space type is the same as the global
+	 * reservation.
+	 */
+	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+	    block_rsv->space_info == global_rsv->space_info) {
+		ret = block_rsv_use_bytes(global_rsv, blocksize);
+		if (!ret)
+			return global_rsv;
 	}
-
-	return ERR_PTR(-ENOSPC);
+	return ERR_PTR(ret);
 }
 
 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
@@ -6763,6 +6767,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	extent_op->update_key = 1;
 	extent_op->update_flags = 1;
 	extent_op->is_data = 0;
+	extent_op->level = level;
 
 	ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
 					 ins.objectid,
@@ -6934,7 +6939,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
 		BUG_ON(ret); /* -ENOMEM */
 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
-						  eb->len, flag, 0);
+						  eb->len, flag,
+						  btrfs_header_level(eb), 0);
 		BUG_ON(ret); /* -ENOMEM */
 		wc->flags[level] |= flag;
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 32d67a822e93..6bca9472f313 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -23,6 +23,7 @@
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
+static struct bio_set *btrfs_bioset;
 
 #ifdef CONFIG_BTRFS_DEBUG
 static LIST_HEAD(buffers);
@@ -125,10 +126,20 @@ int __init extent_io_init(void)
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_buffer_cache)
 		goto free_state_cache;
+
+	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
+				     offsetof(struct btrfs_io_bio, bio));
+	if (!btrfs_bioset)
+		goto free_buffer_cache;
 	return 0;
 
+free_buffer_cache:
+	kmem_cache_destroy(extent_buffer_cache);
+	extent_buffer_cache = NULL;
+
 free_state_cache:
 	kmem_cache_destroy(extent_state_cache);
+	extent_state_cache = NULL;
 	return -ENOMEM;
 }
 
@@ -145,6 +156,8 @@ void extent_io_exit(void)
 	kmem_cache_destroy(extent_state_cache);
 	if (extent_buffer_cache)
 		kmem_cache_destroy(extent_buffer_cache);
+	if (btrfs_bioset)
+		bioset_free(btrfs_bioset);
 }
 
 void extent_io_tree_init(struct extent_io_tree *tree,
@@ -1948,28 +1961,6 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 }
 
 /*
- * helper function to unlock a page if all the extents in the tree
- * for that page are unlocked
- */
-static void check_page_locked(struct extent_io_tree *tree, struct page *page)
-{
-	u64 start = page_offset(page);
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
-		unlock_page(page);
-}
-
-/*
- * helper function to end page writeback if all the extents
- * in the tree for that page are done with writeback
- */
-static void check_page_writeback(struct extent_io_tree *tree,
-				 struct page *page)
-{
-	end_page_writeback(page);
-}
-
-/*
  * When IO fails, either with EIO or csum verification fails, we
  * try other mirrors that might have a good copy of the data. This
  * io_failure_record is used to record state as we go through all the
@@ -2046,7 +2037,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
 		return 0;
 
-	bio = bio_alloc(GFP_NOFS, 1);
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!bio)
 		return -EIO;
 	bio->bi_private = &compl;
@@ -2336,7 +2327,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 		return -EIO;
 	}
 
-	bio = bio_alloc(GFP_NOFS, 1);
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!bio) {
 		free_io_failure(inode, failrec, 0);
 		return -EIO;
@@ -2398,19 +2389,24 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
-	int whole_page;
 
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-		start = page_offset(page) + bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
+		/* We always issue full-page reads, but if some block
+		 * in a page fails to read, blk_update_request() will
+		 * advance bv_offset and adjust bv_len to compensate.
+		 * Print a warning for nonzero offsets, and an error
+		 * if they don't add up to a full page.  */
+		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+			printk("%s page write in btrfs with offset %u and length %u\n",
+			       bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+			       ? KERN_ERR "partial" : KERN_INFO "incomplete",
+			       bvec->bv_offset, bvec->bv_len);
 
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
+		start = page_offset(page);
+		end = start + bvec->bv_offset + bvec->bv_len - 1;
 
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
@@ -2418,10 +2414,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 		if (end_extent_writepage(page, err, start, end))
 			continue;
 
-		if (whole_page)
-			end_page_writeback(page);
-		else
-			check_page_writeback(tree, page);
+		end_page_writeback(page);
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
@@ -2446,7 +2439,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
-	int whole_page;
 	int mirror;
 	int ret;
 
@@ -2457,19 +2449,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct page *page = bvec->bv_page;
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
+		struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 
 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
-			 "mirror=%ld\n", (u64)bio->bi_sector, err,
-			 (long int)bio->bi_bdev);
+			 "mirror=%lu\n", (u64)bio->bi_sector, err,
+			 io_bio->mirror_num);
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-		start = page_offset(page) + bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
+		/* We always issue full-page reads, but if some block
+		 * in a page fails to read, blk_update_request() will
+		 * advance bv_offset and adjust bv_len to compensate.
+		 * Print a warning for nonzero offsets, and an error
+		 * if they don't add up to a full page.  */
+		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+			printk("%s page read in btrfs with offset %u and length %u\n",
+			       bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+			       ? KERN_ERR "partial" : KERN_INFO "incomplete",
+			       bvec->bv_offset, bvec->bv_len);
 
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
+		start = page_offset(page);
+		end = start + bvec->bv_offset + bvec->bv_len - 1;
 
 		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
@@ -2485,7 +2484,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		}
 		spin_unlock(&tree->lock);
 
-		mirror = (int)(unsigned long)bio->bi_bdev;
+		mirror = io_bio->mirror_num;
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
 			ret = tree->ops->readpage_end_io_hook(page, start, end,
 							      state, mirror);
@@ -2528,39 +2527,35 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		}
 		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
 
-		if (whole_page) {
-			if (uptodate) {
-				SetPageUptodate(page);
-			} else {
-				ClearPageUptodate(page);
-				SetPageError(page);
-			}
-			unlock_page(page);
+		if (uptodate) {
+			SetPageUptodate(page);
 		} else {
-			if (uptodate) {
-				check_page_uptodate(tree, page);
-			} else {
-				ClearPageUptodate(page);
-				SetPageError(page);
-			}
-			check_page_locked(tree, page);
+			ClearPageUptodate(page);
+			SetPageError(page);
 		}
+		unlock_page(page);
 	} while (bvec <= bvec_end);
 
 	bio_put(bio);
 }
 
+/*
+ * this allocates from the btrfs_bioset.  We're returning a bio right now
+ * but you can call btrfs_io_bio for the appropriate container_of magic
+ */
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags)
 {
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_flags, nr_vecs);
+	bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
 
 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-		while (!bio && (nr_vecs /= 2))
-			bio = bio_alloc(gfp_flags, nr_vecs);
+		while (!bio && (nr_vecs /= 2)) {
+			bio = bio_alloc_bioset(gfp_flags,
+					       nr_vecs, btrfs_bioset);
+		}
 	}
 
 	if (bio) {
@@ -2571,6 +2566,19 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+	return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+}
+
+
+/* this also allocates from the btrfs_bioset */
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+	return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+}
+
+
 static int __must_check submit_one_bio(int rw, struct bio *bio,
 				       int mirror_num, unsigned long bio_flags)
 {
@@ -2949,7 +2957,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		return 0;
 	}
@@ -3988,7 +3996,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		last_for_get_extent = isize;
 	}
 
-	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
 			 &cached_state);
 
 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
@@ -4075,7 +4083,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
 			     &cached_state, GFP_NOFS);
 	return ret;
 }
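
The bioset plumbing above exists for one reason: every bio btrfs now allocates is embedded at a fixed offset inside a larger struct btrfs_io_bio (hence the offsetof() passed to bioset_create()), so an end_io handler that only receives the struct bio pointer can recover the wrapper — and fields like mirror_num — via container_of, with no side allocation. A self-contained sketch of that layout trick; the field names mirror the patch but the structs are simplified:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bio { int bi_flags; };

/* Wrapper with the bio embedded at a known offset, the same shape that
 * lets bioset_create(..., offsetof(struct btrfs_io_bio, bio)) hand back
 * bios whose container carries extra per-I/O state. */
struct io_bio {
	unsigned int mirror_num;
	struct bio bio;
};

static struct io_bio *io_bio_of(struct bio *bio)
{
	return container_of(bio, struct io_bio, bio);
}

int main(void)
{
	struct io_bio b = { .mirror_num = 2 };
	/* An end_io handler only sees &b.bio, yet can reach mirror_num. */
	printf("mirror=%u\n", io_bio_of(&b.bio)->mirror_num);
	return 0;
}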
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a2c03a175009..41fb81e7ec53 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -336,6 +336,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags);
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
 
 struct btrfs_fs_info;
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4205ba752d40..89da56a58b63 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2425,20 +2425,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 		}
 	}
 
-	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
-		offset = -EINVAL;
-		goto out;
-	}
-	if (offset > inode->i_sb->s_maxbytes) {
-		offset = -EINVAL;
-		goto out;
-	}
-
-	/* Special lock needed here? */
-	if (offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return offset;
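
vfs_setpos() is the generic replacement for the open-coded tail above: it validates the computed offset, commits it to f_pos, and resets f_version so stale cached state tied to the old position is discarded. A rough userspace model of that final step (struct and error value are stand-ins):

#include <stdio.h>

struct demo_file { long long f_pos; unsigned long f_version; };

/* Rough model of vfs_setpos(): reject out-of-range offsets, otherwise
 * commit the new position and zero the version cookie. */
static long long demo_setpos(struct demo_file *file, long long offset,
			     long long maxsize)
{
	if (offset < 0 || offset > maxsize)
		return -22;		/* the kernel would return -EINVAL */
	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	return offset;
}

int main(void)
{
	struct demo_file f = { .f_pos = 10, .f_version = 7 };
	printf("%lld (version %lu)\n",
	       demo_setpos(&f, 100, 1 << 20), f.f_version);
	return 0;
}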
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ecca6c7375a6..e53009657f0e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -197,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root,
 					      block_group->key.objectid);
 }
 
-int btrfs_truncate_free_space_cache(struct btrfs_root *root,
-				    struct btrfs_trans_handle *trans,
-				    struct btrfs_path *path,
-				    struct inode *inode)
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+				       struct btrfs_block_rsv *rsv)
 {
-	struct btrfs_block_rsv *rsv;
 	u64 needed_bytes;
-	loff_t oldsize;
-	int ret = 0;
-
-	rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	int ret;
 
 	/* 1 for slack space, 1 for updating the inode */
 	needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
 		btrfs_calc_trans_metadata_size(root, 1);
 
-	spin_lock(&trans->block_rsv->lock);
-	if (trans->block_rsv->reserved < needed_bytes) {
-		spin_unlock(&trans->block_rsv->lock);
-		trans->block_rsv = rsv;
-		return -ENOSPC;
-	}
-	spin_unlock(&trans->block_rsv->lock);
+	spin_lock(&rsv->lock);
+	if (rsv->reserved < needed_bytes)
+		ret = -ENOSPC;
+	else
+		ret = 0;
+	spin_unlock(&rsv->lock);
+	return 0;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct inode *inode)
+{
+	loff_t oldsize;
+	int ret = 0;
 
 	oldsize = i_size_read(inode);
 	btrfs_i_size_write(inode, 0);
@@ -232,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
-
 	if (ret) {
-		trans->block_rsv = rsv;
 		btrfs_abort_transaction(trans, root, ret);
 		return ret;
 	}
@@ -242,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	ret = btrfs_update_inode(trans, root, inode);
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
-	trans->block_rsv = rsv;
 
 	return ret;
 }
@@ -920,10 +919,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	/* Make sure we can fit our crcs into the first page */
 	if (io_ctl.check_crcs &&
-	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
-		WARN_ON(1);
+	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
 		goto out_nospc;
-	}
 
 	io_ctl_set_generation(&io_ctl, trans->transid);
 
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 4dc17d8809c7..8b7f19f44961 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root,
 			    struct btrfs_block_group_cache *block_group,
 			    struct btrfs_path *path);
 
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+				       struct btrfs_block_rsv *rsv);
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d26f67a59e36..2c66ddbbe670 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	num_bytes = trans->bytes_reserved;
 	/*
 	 * 1 item for inode item insertion if need
-	 * 3 items for inode item update (in the worst case)
+	 * 4 items for inode item update (in the worst case)
+	 * 1 items for slack space if we need do truncation
 	 * 1 item for free space object
 	 * 3 items for pre-allocation
 	 */
-	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
+	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
 	ret = btrfs_block_rsv_add(root, trans->block_rsv,
 				  trans->bytes_reserved,
 				  BTRFS_RESERVE_NO_FLUSH);
@@ -468,7 +469,8 @@ again:
 	if (i_size_read(inode) > 0) {
 		ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
 		if (ret) {
-			btrfs_abort_transaction(trans, root, ret);
+			if (ret != -ENOSPC)
+				btrfs_abort_transaction(trans, root, ret);
 			goto out_put;
 		}
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9b31b3b091fc..4f9d16b70d3d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -715,8 +715,10 @@ retry:
 					 async_extent->ram_size - 1, 0);
 
 		em = alloc_extent_map();
-		if (!em)
+		if (!em) {
+			ret = -ENOMEM;
 			goto out_free_reserve;
+		}
 		em->start = async_extent->start;
 		em->len = async_extent->ram_size;
 		em->orig_start = em->start;
@@ -923,8 +925,10 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 	}
 
 	em = alloc_extent_map();
-	if (!em)
+	if (!em) {
+		ret = -ENOMEM;
 		goto out_reserve;
+	}
 	em->start = start;
 	em->orig_start = em->start;
 	ram_size = ins.offset;
@@ -4724,6 +4728,7 @@ void btrfs_evict_inode(struct inode *inode)
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 no_delete:
+	btrfs_remove_delayed_node(inode);
 	clear_inode(inode);
 	return;
 }
@@ -4839,14 +4844,13 @@ static void inode_tree_add(struct inode *inode)
 	struct rb_node **p;
 	struct rb_node *parent;
 	u64 ino = btrfs_ino(inode);
-again:
-	p = &root->inode_tree.rb_node;
-	parent = NULL;
 
 	if (inode_unhashed(inode))
 		return;
-
+again:
+	parent = NULL;
 	spin_lock(&root->inode_lock);
+	p = &root->inode_tree.rb_node;
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
@@ -5133,10 +5137,9 @@ unsigned char btrfs_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int btrfs_real_readdir(struct file *filp, void *dirent,
-			      filldir_t filldir)
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
@@ -5157,29 +5160,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	char tmp_name[32];
 	char *name_ptr;
 	int name_len;
-	int is_curr = 0;	/* filp->f_pos points to the current index? */
+	int is_curr = 0;	/* ctx->pos points to the current index? */
 
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
 		key_type = BTRFS_DIR_ITEM_KEY;
 
-	/* special case for "." */
-	if (filp->f_pos == 0) {
-		over = filldir(dirent, ".", 1,
-			       filp->f_pos, btrfs_ino(inode), DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 1;
-	}
-	/* special case for .., just use the back ref */
-	if (filp->f_pos == 1) {
-		u64 pino = parent_ino(filp->f_path.dentry);
-		over = filldir(dirent, "..", 2,
-			       filp->f_pos, pino, DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 2;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -5193,7 +5182,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	}
 
 	btrfs_set_key_type(&key, key_type);
-	key.offset = filp->f_pos;
+	key.offset = ctx->pos;
 	key.objectid = btrfs_ino(inode);
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5219,14 +5208,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			break;
 		if (btrfs_key_type(&found_key) != key_type)
 			break;
-		if (found_key.offset < filp->f_pos)
+		if (found_key.offset < ctx->pos)
 			goto next;
 		if (key_type == BTRFS_DIR_INDEX_KEY &&
 		    btrfs_should_delete_dir_index(&del_list,
 						  found_key.offset))
 			goto next;
 
-		filp->f_pos = found_key.offset;
+		ctx->pos = found_key.offset;
 		is_curr = 1;
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
@@ -5270,9 +5259,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			over = 0;
 			goto skip;
 		}
-		over = filldir(dirent, name_ptr, name_len,
-			       found_key.offset, location.objectid,
-			       d_type);
+		over = !dir_emit(ctx, name_ptr, name_len,
+				 location.objectid, d_type);
 
 skip:
 		if (name_ptr != tmp_name)
@@ -5291,9 +5279,8 @@ next:
 
 	if (key_type == BTRFS_DIR_INDEX_KEY) {
 		if (is_curr)
-			filp->f_pos++;
-		ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
-						      &ins_list);
+			ctx->pos++;
+		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
 		if (ret)
 			goto nopos;
 	}
@@ -5304,9 +5291,9 @@ next:
 		 * 32-bit glibc will use getdents64, but then strtol -
 		 * so the last number we can serve is this.
 		 */
-		filp->f_pos = 0x7fffffff;
+		ctx->pos = 0x7fffffff;
 	else
-		filp->f_pos++;
+		ctx->pos++;
 nopos:
 	ret = 0;
 err:
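
dir_emit_dots() absorbs both "special case" blocks deleted earlier in this function: it emits "." at position 0 and ".." at position 1, advancing ctx->pos past each, and returns false if iteration must stop. A userspace model of its control flow — the real helper takes the struct file and derives both inode numbers from the dentry, which this mock fakes with parameters:

#include <stdbool.h>
#include <stdio.h>

struct dir_context { long long pos; };

static bool emit(struct dir_context *ctx, const char *name,
		 unsigned long ino)
{
	printf("pos=%lld ino=%lu %s\n", ctx->pos, ino, name);
	return true;	/* pretend the getdents buffer has room */
}

/* Model of dir_emit_dots(): each dot entry is emitted exactly once, and
 * a false return from the emitter propagates up so the caller stops. */
static bool dir_emit_dots(struct dir_context *ctx, unsigned long ino,
			  unsigned long parent_ino)
{
	if (ctx->pos == 0) {
		if (!emit(ctx, ".", ino))
			return false;
		ctx->pos = 1;
	}
	if (ctx->pos == 1) {
		if (!emit(ctx, "..", parent_ino))
			return false;
		ctx->pos = 2;
	}
	return true;
}

int main(void)
{
	struct dir_context ctx = { .pos = 0 };
	return dir_emit_dots(&ctx, 256, 2) ? 0 : 1;
}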
@@ -6928,7 +6915,11 @@ struct btrfs_dio_private {
 	/* IO errors */
 	int errors;
 
+	/* orig_bio is our btrfs_io_bio */
 	struct bio *orig_bio;
+
+	/* dio_bio came from fs/direct-io.c */
+	struct bio *dio_bio;
 };
 
 static void btrfs_endio_direct_read(struct bio *bio, int err)
@@ -6938,6 +6929,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 	struct bio_vec *bvec = bio->bi_io_vec;
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct bio *dio_bio;
 	u64 start;
 
 	start = dip->logical_offset;
@@ -6977,14 +6969,15 @@ failed:
 
 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
 		      dip->logical_offset + dip->bytes - 1);
-	bio->bi_private = dip->private;
+	dio_bio = dip->dio_bio;
 
 	kfree(dip);
 
 	/* If we had a csum failure make sure to clear the uptodate flag */
 	if (err)
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	dio_end_io(bio, err);
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+	dio_end_io(dio_bio, err);
+	bio_put(bio);
 }
 
@@ -6995,6 +6988,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
 	struct btrfs_ordered_extent *ordered = NULL;
 	u64 ordered_offset = dip->logical_offset;
 	u64 ordered_bytes = dip->bytes;
+	struct bio *dio_bio;
 	int ret;
 
 	if (err)
@@ -7022,14 +7016,15 @@ out_test:
 		goto again;
 	}
 out_done:
-	bio->bi_private = dip->private;
+	dio_bio = dip->dio_bio;
 
 	kfree(dip);
 
 	/* If we had an error make sure to clear the uptodate flag */
 	if (err)
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	dio_end_io(bio, err);
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+	dio_end_io(dio_bio, err);
+	bio_put(bio);
 }
 
 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
@@ -7065,10 +7060,10 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 	if (!atomic_dec_and_test(&dip->pending_bios))
 		goto out;
 
-	if (dip->errors)
+	if (dip->errors) {
 		bio_io_error(dip->orig_bio);
-	else {
-		set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+	} else {
+		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
 		bio_endio(dip->orig_bio, 0);
 	}
 out:
@@ -7243,25 +7238,34 @@ out_err:
 	return 0;
 }
 
-static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
-				loff_t file_offset)
+static void btrfs_submit_direct(int rw, struct bio *dio_bio,
+				struct inode *inode, loff_t file_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_dio_private *dip;
-	struct bio_vec *bvec = bio->bi_io_vec;
+	struct bio_vec *bvec = dio_bio->bi_io_vec;
+	struct bio *io_bio;
 	int skip_sum;
 	int write = rw & REQ_WRITE;
 	int ret = 0;
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
+	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
+
+	if (!io_bio) {
+		ret = -ENOMEM;
+		goto free_ordered;
+	}
+
 	dip = kmalloc(sizeof(*dip), GFP_NOFS);
 	if (!dip) {
 		ret = -ENOMEM;
-		goto free_ordered;
+		goto free_io_bio;
 	}
 
-	dip->private = bio->bi_private;
+	dip->private = dio_bio->bi_private;
+	io_bio->bi_private = dio_bio->bi_private;
 	dip->inode = inode;
 	dip->logical_offset = file_offset;
 
@@ -7269,22 +7273,27 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	do {
 		dip->bytes += bvec->bv_len;
 		bvec++;
-	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+	} while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
 
-	dip->disk_bytenr = (u64)bio->bi_sector << 9;
-	bio->bi_private = dip;
+	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+	io_bio->bi_private = dip;
 	dip->errors = 0;
-	dip->orig_bio = bio;
+	dip->orig_bio = io_bio;
+	dip->dio_bio = dio_bio;
 	atomic_set(&dip->pending_bios, 0);
 
 	if (write)
-		bio->bi_end_io = btrfs_endio_direct_write;
+		io_bio->bi_end_io = btrfs_endio_direct_write;
 	else
-		bio->bi_end_io = btrfs_endio_direct_read;
+		io_bio->bi_end_io = btrfs_endio_direct_read;
 
 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
 	if (!ret)
 		return;
+
+free_io_bio:
+	bio_put(io_bio);
+
 free_ordered:
 	/*
 	 * If this is a write, we need to clean up the reserved space and kill
@@ -7300,7 +7309,7 @@ free_ordered:
7300 btrfs_put_ordered_extent(ordered); 7309 btrfs_put_ordered_extent(ordered);
7301 btrfs_put_ordered_extent(ordered); 7310 btrfs_put_ordered_extent(ordered);
7302 } 7311 }
7303 bio_endio(bio, ret); 7312 bio_endio(dio_bio, ret);
7304} 7313}
7305 7314
7306static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, 7315static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
@@ -7484,7 +7493,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7484 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 7493 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
7485} 7494}
7486 7495
7487static void btrfs_invalidatepage(struct page *page, unsigned long offset) 7496static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7497 unsigned int length)
7488{ 7498{
7489 struct inode *inode = page->mapping->host; 7499 struct inode *inode = page->mapping->host;
7490 struct extent_io_tree *tree; 7500 struct extent_io_tree *tree;
@@ -7979,7 +7989,6 @@ void btrfs_destroy_inode(struct inode *inode)
7979 inode_tree_del(inode); 7989 inode_tree_del(inode);
7980 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 7990 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
7981free: 7991free:
7982 btrfs_remove_delayed_node(inode);
7983 call_rcu(&inode->i_rcu, btrfs_i_callback); 7992 call_rcu(&inode->i_rcu, btrfs_i_callback);
7984} 7993}
7985 7994
@@ -7987,6 +7996,9 @@ int btrfs_drop_inode(struct inode *inode)
7987{ 7996{
7988 struct btrfs_root *root = BTRFS_I(inode)->root; 7997 struct btrfs_root *root = BTRFS_I(inode)->root;
7989 7998
7999 if (root == NULL)
8000 return 1;
8001
7990 /* the snap/subvol tree is on deleting */ 8002 /* the snap/subvol tree is on deleting */
7991 if (btrfs_root_refs(&root->root_item) == 0 && 8003 if (btrfs_root_refs(&root->root_item) == 0 &&
7992 root != root->fs_info->tree_root) 8004 root != root->fs_info->tree_root)
@@ -8703,7 +8715,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
8703static const struct file_operations btrfs_dir_file_operations = { 8715static const struct file_operations btrfs_dir_file_operations = {
8704 .llseek = generic_file_llseek, 8716 .llseek = generic_file_llseek,
8705 .read = generic_read_dir, 8717 .read = generic_read_dir,
8706 .readdir = btrfs_real_readdir, 8718 .iterate = btrfs_real_readdir,
8707 .unlocked_ioctl = btrfs_ioctl, 8719 .unlocked_ioctl = btrfs_ioctl,
8708#ifdef CONFIG_COMPAT 8720#ifdef CONFIG_COMPAT
8709 .compat_ioctl = btrfs_ioctl, 8721 .compat_ioctl = btrfs_ioctl,
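The inode.c hunks above split direct-IO completion across two bios: the bio handed in by the dio layer (dio_bio) is stashed in the dio_private while a clone (io_bio) is what btrfs actually drives, and completion finishes dio_bio before dropping the clone. A sketch of that ownership pattern under the bio API of this series (my_end_io and the submission step are stand-ins, not code from the patch):

	static void submit_with_private_clone(struct bio *dio_bio)
	{
		/* clone so btrfs may retarget bi_private/bi_end_io freely */
		struct bio *io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);

		if (!io_bio) {
			bio_endio(dio_bio, -ENOMEM);	/* fail the caller's bio */
			return;
		}
		io_bio->bi_end_io = my_end_io;	/* calls dio_end_io(dio_bio, err),
						 * then bio_put() on the clone */
		/* ... map and submit io_bio ... */
	}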
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0de4a2fcfb24..cd7e96c73cb7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1801,7 +1801,11 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 		item_off = btrfs_item_ptr_offset(leaf, i);
 		item_len = btrfs_item_size_nr(leaf, i);
 
-		if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+		btrfs_item_key_to_cpu(leaf, key, i);
+		if (!key_in_sk(key, sk))
+			continue;
+
+		if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
 			item_len = 0;
 
 		if (sizeof(sh) + item_len + *sk_offset >
@@ -1810,10 +1814,6 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 			goto overflow;
 		}
 
-		btrfs_item_key_to_cpu(leaf, key, i);
-		if (!key_in_sk(key, sk))
-			continue;
-
 		sh.objectid = key->objectid;
 		sh.offset = key->offset;
 		sh.type = key->type;
@@ -3881,7 +3881,7 @@ drop_write:
 
 static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_ioctl_quota_rescan_args *qsa;
 	int ret;
 
@@ -3914,7 +3914,7 @@ drop_write:
 
 static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_ioctl_quota_rescan_args *qsa;
 	int ret = 0;
 
@@ -4020,7 +4020,7 @@ out:
 
 static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	const char *label = root->fs_info->super_copy->label;
 	size_t len = strnlen(label, BTRFS_LABEL_SIZE);
 	int ret;
@@ -4039,7 +4039,7 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
 
 static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_super_block *super_block = root->fs_info->super_copy;
 	struct btrfs_trans_handle *trans;
 	char label[BTRFS_LABEL_SIZE];
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0740621daf6c..0525e1389f5b 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1050,7 +1050,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 	}
 
 	/* put a new bio on the list */
-	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
 	if (!bio)
 		return -ENOMEM;
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 704a1b8d2a2b..4febca4fc2de 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1773,7 +1773,7 @@ again:
 		if (!eb || !extent_buffer_uptodate(eb)) {
 			ret = (!eb) ? -ENOMEM : -EIO;
 			free_extent_buffer(eb);
-			return ret;
+			break;
 		}
 		btrfs_tree_lock(eb);
 		if (cow) {
@@ -3350,6 +3350,11 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
 	}
 
 truncate:
+	ret = btrfs_check_trunc_cache_free_space(root,
+						 &fs_info->global_block_rsv);
+	if (ret)
+		goto out;
+
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
@@ -4077,7 +4082,7 @@ out:
 	return inode;
 }
 
-static struct reloc_control *alloc_reloc_control(void)
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
 {
 	struct reloc_control *rc;
 
@@ -4088,7 +4093,8 @@ static struct reloc_control *alloc_reloc_control(void)
 	INIT_LIST_HEAD(&rc->reloc_roots);
 	backref_cache_init(&rc->backref_cache);
 	mapping_tree_init(&rc->reloc_root_tree);
-	extent_io_tree_init(&rc->processed_blocks, NULL);
+	extent_io_tree_init(&rc->processed_blocks,
+			    fs_info->btree_inode->i_mapping);
 	return rc;
 }
 
@@ -4105,7 +4111,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	int rw = 0;
 	int err = 0;
 
-	rc = alloc_reloc_control();
+	rc = alloc_reloc_control(fs_info);
 	if (!rc)
 		return -ENOMEM;
 
@@ -4306,7 +4312,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	if (list_empty(&reloc_roots))
 		goto out;
 
-	rc = alloc_reloc_control();
+	rc = alloc_reloc_control(root->fs_info);
 	if (!rc) {
 		err = -ENOMEM;
 		goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f489e24659a4..79bd479317cb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1296,7 +1296,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		}
 
 		WARN_ON(!page->page);
-		bio = bio_alloc(GFP_NOFS, 1);
+		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 		if (!bio) {
 			page->io_error = 1;
 			sblock->no_io_error_seen = 0;
@@ -1431,7 +1431,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 			return -EIO;
 		}
 
-		bio = bio_alloc(GFP_NOFS, 1);
+		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 		if (!bio)
 			return -EIO;
 		bio->bi_bdev = page_bad->dev->bdev;
@@ -1522,7 +1522,7 @@ again:
 		sbio->dev = wr_ctx->tgtdev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
 			if (!bio) {
 				mutex_unlock(&wr_ctx->wr_lock);
 				return -ENOMEM;
@@ -1930,7 +1930,7 @@ again:
 		sbio->dev = spage->dev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
+			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
 			if (!bio)
 				return -ENOMEM;
 			sbio->bio = bio;
@@ -3307,7 +3307,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
 		return -EIO;
 	}
-	bio = bio_alloc(GFP_NOFS, 1);
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!bio) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a4807ced23cc..f0857e092a3c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1263,6 +1263,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 
 		btrfs_dev_replace_suspend_for_unmount(fs_info);
 		btrfs_scrub_cancel(fs_info);
+		btrfs_pause_balance(fs_info);
 
 		ret = btrfs_commit_super(root);
 		if (ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0e925ced971b..8bffb9174afb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3120,14 +3120,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
 	if (num_devices == 1)
 		allowed |= BTRFS_BLOCK_GROUP_DUP;
-	else if (num_devices < 4)
+	else if (num_devices > 1)
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
-	else
-		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-			    BTRFS_BLOCK_GROUP_RAID10 |
-			    BTRFS_BLOCK_GROUP_RAID5 |
+	if (num_devices > 2)
+		allowed |= BTRFS_BLOCK_GROUP_RAID5;
+	if (num_devices > 3)
+		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
 			    BTRFS_BLOCK_GROUP_RAID6);
-
 	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
 	     (bctl->data.target & ~allowed))) {
@@ -5019,42 +5018,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
-static void *merge_stripe_index_into_bio_private(void *bi_private,
-						 unsigned int stripe_index)
-{
-	/*
-	 * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
-	 * at most 1.
-	 * The alternative solution (instead of stealing bits from the
-	 * pointer) would be to allocate an intermediate structure
-	 * that contains the old private pointer plus the stripe_index.
-	 */
-	BUG_ON((((uintptr_t)bi_private) & 3) != 0);
-	BUG_ON(stripe_index > 3);
-	return (void *)(((uintptr_t)bi_private) | stripe_index);
-}
-
-static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
-{
-	return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
-}
-
-static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
-{
-	return (unsigned int)((uintptr_t)bi_private) & 3;
-}
-
 static void btrfs_end_bio(struct bio *bio, int err)
 {
-	struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
+	struct btrfs_bio *bbio = bio->bi_private;
 	int is_orig_bio = 0;
 
 	if (err) {
 		atomic_inc(&bbio->error);
 		if (err == -EIO || err == -EREMOTEIO) {
 			unsigned int stripe_index =
-				extract_stripe_index_from_bio_private(
-					bio->bi_private);
+				btrfs_io_bio(bio)->stripe_index;
 			struct btrfs_device *dev;
 
 			BUG_ON(stripe_index >= bbio->num_stripes);
@@ -5084,8 +5057,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		}
 		bio->bi_private = bbio->private;
 		bio->bi_end_io = bbio->end_io;
-		bio->bi_bdev = (struct block_device *)
-					(unsigned long)bbio->mirror_num;
+		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the btrfs bio
 		 */
@@ -5211,8 +5183,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
 
 	bio->bi_private = bbio;
-	bio->bi_private = merge_stripe_index_into_bio_private(
-			bio->bi_private, (unsigned int)dev_nr);
+	btrfs_io_bio(bio)->stripe_index = dev_nr;
 	bio->bi_end_io = btrfs_end_bio;
 	bio->bi_sector = physical >> 9;
 #ifdef DEBUG
@@ -5273,8 +5244,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 	if (atomic_dec_and_test(&bbio->stripes_pending)) {
 		bio->bi_private = bbio->private;
 		bio->bi_end_io = bbio->end_io;
-		bio->bi_bdev = (struct block_device *)
-			(unsigned long)bbio->mirror_num;
+		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		bio->bi_sector = logical >> 9;
 		kfree(bbio);
 		bio_endio(bio, -EIO);
@@ -5352,7 +5322,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	}
 
 	if (dev_nr < total_devs - 1) {
-		bio = bio_clone(first_bio, GFP_NOFS);
+		bio = btrfs_bio_clone(first_bio, GFP_NOFS);
 		BUG_ON(!bio); /* -ENOMEM */
 	} else {
 		bio = first_bio;
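Note how the reworked btrfs_balance() gate composes cumulatively: a 3-device filesystem picks up RAID0/1 and RAID5, a 4-device one additionally RAID10 and RAID6. Distilled into a standalone helper for illustration only (btrfs keeps this inline):

	static u64 balance_allowed_profiles(u64 num_devices)
	{
		u64 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;

		if (num_devices == 1)
			allowed |= BTRFS_BLOCK_GROUP_DUP;
		else if (num_devices > 1)
			allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
		if (num_devices > 2)
			allowed |= BTRFS_BLOCK_GROUP_RAID5;	/* needs >= 3 devices */
		if (num_devices > 3)
			allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
				    BTRFS_BLOCK_GROUP_RAID6);	/* need >= 4 devices */
		return allowed;
	}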
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 845ccbb0d2e3..f6247e2a47f7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -152,6 +152,26 @@ struct btrfs_fs_devices {
 	int rotating;
 };
 
+/*
+ * we need the mirror number and stripe index to be passed around
+ * the call chain while we are processing end_io (especially errors).
+ * Really, what we need is a btrfs_bio structure that has this info
+ * and is properly sized with its stripe array, but we're not there
+ * quite yet.  We have our own btrfs bioset, and all of the bios
+ * we allocate are actually btrfs_io_bios.  We'll cram as much of
+ * struct btrfs_bio as we can into this over time.
+ */
+struct btrfs_io_bio {
+	unsigned long mirror_num;
+	unsigned long stripe_index;
+	struct bio bio;
+};
+
+static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+{
+	return container_of(bio, struct btrfs_io_bio, bio);
+}
+
 struct btrfs_bio_stripe {
 	struct btrfs_device *dev;
 	u64 physical;
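Every bio btrfs allocates now comes from its own bioset and is really the bio member of a struct btrfs_io_bio, so any end_io handler can walk back to the wrapper through the btrfs_io_bio() accessor above instead of stealing low bits from bi_private. A usage sketch (note_stripe_error is a hypothetical helper):

	static void example_end_io(struct bio *bio, int err)
	{
		/* container_of() recovers the wrapper from the embedded member */
		struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);

		if (err)
			note_stripe_error(io_bio->stripe_index);	/* hypothetical */
	}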
diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..f93392e2df12 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1454,7 +1454,8 @@ static void discard_buffer(struct buffer_head * bh)
  * block_invalidatepage - invalidate part or all of a buffer-backed page
  *
  * @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
  *
  * block_invalidatepage() is called when all or part of the page has become
  * invalidated by a truncate operation.
@@ -1465,15 +1466,22 @@ static void discard_buffer(struct buffer_head * bh)
  * point.  Because the caller is about to free (and possibly reuse) those
  * blocks on-disk.
  */
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	unsigned int curr_off = 0;
+	unsigned int stop = length + offset;
 
 	BUG_ON(!PageLocked(page));
 	if (!page_has_buffers(page))
 		goto out;
 
+	/*
+	 * Check for overflow
+	 */
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
@@ -1481,6 +1489,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
 		next = bh->b_this_page;
 
 		/*
+		 * Are we still fully in range ?
+		 */
+		if (next_off > stop)
+			goto out;
+
+		/*
 		 * is this block fully invalidated?
 		 */
 		if (offset <= curr_off)
@@ -1501,6 +1515,7 @@ out:
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
+
 /*
  * We attach and possibly dirty the buffers atomically wrt
  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
@@ -2841,7 +2856,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 			 * they may have been added in ext3_writepage().  Make them
 			 * freeable here, so the page does not leak.
 			 */
-			do_invalidatepage(page, 0);
+			do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 			unlock_page(page);
 			return 0; /* don't care */
 		}
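With the added length argument, ->invalidatepage now describes a byte range rather than a single truncation point, and block_invalidatepage() only discards buffers lying entirely inside [offset, offset + length). Two calls under that convention (the partial-range one is an assumed usage, not taken from this patch):

	do_invalidatepage(page, 0, PAGE_CACHE_SIZE);		/* whole page, as above */
	do_invalidatepage(page, off, PAGE_CACHE_SIZE - off);	/* tail starting at off */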
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 746ce532e130..d4c1206af9fc 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -13,8 +13,6 @@
 #include <linux/mount.h>
 #include "internal.h"
 
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
 struct cachefiles_lookup_data {
 	struct cachefiles_xattr *auxdata;	/* auxiliary data */
 	char *key;				/* key path */
@@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
 	object = container_of(_object, struct cachefiles_object, fscache);
 	cache = container_of(object->fscache.cache, struct cachefiles_cache,
 			     cache);
+
+	if (!fscache_use_cookie(_object)) {
+		_leave(" [relinq]");
+		return;
+	}
+
 	cookie = object->fscache.cookie;
 
 	if (!cookie->def->get_aux) {
+		fscache_unuse_cookie(_object);
 		_leave(" [no aux]");
 		return;
 	}
 
 	auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
 	if (!auxdata) {
+		fscache_unuse_cookie(_object);
 		_leave(" [nomem]");
 		return;
 	}
 
 	auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+	fscache_unuse_cookie(_object);
 	ASSERTCMP(auxlen, <, 511);
 
 	auxdata->len = auxlen + 1;
@@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
 #endif
 
 	/* delete retired objects */
-	if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+	if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
 	    _object != cache->cache.fsdef
 	    ) {
 		_debug("- retire object OBJ%x", object->fscache.debug_id);
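The cachefiles_update_object() change pins the cookie for the duration of the get_aux callback: fscache_use_cookie() refuses if the cookie is being relinquished, and each exit path must drop the pin. The shape of the pattern, reduced to a sketch:

	if (!fscache_use_cookie(_object))
		return;			/* cookie is going away; do nothing */

	/* ... safe to dereference object->fscache.cookie->def here ... */

	fscache_unuse_cookie(_object);	/* drop the pin on every path out */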
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 8c01c5fcdf75..25badd1aec5c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
 	printk(KERN_ERR "%sobject: OBJ%x\n",
 	       prefix, object->fscache.debug_id);
 	printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
-	       prefix, fscache_object_states[object->fscache.state],
+	       prefix, object->fscache.state->name,
 	       object->fscache.flags, work_busy(&object->fscache.work),
 	       object->fscache.events, object->fscache.event_mask);
 	printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
 found_dentry:
 	kdebug("preemptive burial: OBJ%x [%s] %p",
 	       object->fscache.debug_id,
-	       fscache_object_states[object->fscache.state],
+	       object->fscache.state->name,
 	       dentry);
 
-	if (object->fscache.state < FSCACHE_OBJECT_DYING) {
+	if (fscache_object_is_live(&object->fscache)) {
 		printk(KERN_ERR "\n");
 		printk(KERN_ERR "CacheFiles: Error:"
 		       " Can't preemptively bury live object\n");
@@ -192,7 +192,7 @@ try_again:
 	/* an old object from a previous incarnation is hogging the slot - we
 	 * need to wait for it to be destroyed */
 wait_for_old_object:
-	if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+	if (fscache_object_is_live(&object->fscache)) {
 		printk(KERN_ERR "\n");
 		printk(KERN_ERR "CacheFiles: Error:"
 		       " Unexpected object collision\n");
@@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
 	//       dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
 
 	/* look up the victim */
-	mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
 
 	start = jiffies;
 	victim = lookup_one_len(filename, dir, strlen(filename));
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 73b46288b54b..2476e5162609 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
 	struct dentry *dentry = object->dentry;
 	int ret;
 
-	ASSERT(object->fscache.cookie);
 	ASSERT(dentry);
 
 	_enter("%p,#%d", object, auxdata->len);
 
 	/* attempt to install the cache metadata directly */
-	_debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+	_debug("SET #%u", auxdata->len);
 
 	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
 			   &auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
 	struct dentry *dentry = object->dentry;
 	int ret;
 
-	ASSERT(object->fscache.cookie);
 	ASSERT(dentry);
 
 	_enter("%p,#%d", object, auxdata->len);
 
 	/* attempt to install the cache metadata directly */
-	_debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+	_debug("SET #%u", auxdata->len);
 
 	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
 			   &auxdata->type, auxdata->len,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac101040..38b5c1bc6776 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
  * dirty page counters appropriately.  Only called if there is private
  * data on the page.
  */
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
 	if (!PageDirty(page))
 		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
 	ci = ceph_inode(inode);
-	if (offset == 0) {
-		dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
-		     inode, page, page->index, offset);
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
+		dout("%p invalidatepage %p idx %lu full dirty page\n",
+		     inode, page, page->index);
 		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 		ceph_put_snap_context(snapc);
 		page->private = 0;
 		ClearPagePrivate(page);
 	} else {
-		dout("%p invalidatepage %p idx %lu partial dirty page\n",
-		     inode, page, page->index);
+		dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+		     inode, page, page->index, offset, length);
 	}
 }
 
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f02d82b7933e..a40ceda47a32 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -111,11 +111,10 @@ static unsigned fpos_off(loff_t p)
  * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
  * the MDS if/when the directory is modified).
  */
-static int __dcache_readdir(struct file *filp,
-			    void *dirent, filldir_t filldir)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct ceph_file_info *fi = filp->private_data;
-	struct dentry *parent = filp->f_dentry;
+	struct ceph_file_info *fi = file->private_data;
+	struct dentry *parent = file->f_dentry;
 	struct inode *dir = parent->d_inode;
 	struct list_head *p;
 	struct dentry *dentry, *last;
@@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp,
 	last = fi->dentry;
 	fi->dentry = NULL;
 
-	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+	dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
 	     last);
 
 	spin_lock(&parent->d_lock);
 
 	/* start at beginning? */
-	if (filp->f_pos == 2 || last == NULL ||
-	    filp->f_pos < ceph_dentry(last)->offset) {
+	if (ctx->pos == 2 || last == NULL ||
+	    ctx->pos < ceph_dentry(last)->offset) {
 		if (list_empty(&parent->d_subdirs))
 			goto out_unlock;
 		p = parent->d_subdirs.prev;
@@ -157,11 +156,11 @@ more:
 		if (!d_unhashed(dentry) && dentry->d_inode &&
 		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
 		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
-		    filp->f_pos <= di->offset)
+		    ctx->pos <= di->offset)
 			break;
 		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
 		     dentry->d_name.len, dentry->d_name.name, di->offset,
-		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+		     ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
 		     !dentry->d_inode ? " null" : "");
 		spin_unlock(&dentry->d_lock);
 		p = p->prev;
@@ -173,29 +172,27 @@ more:
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
 
-	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-	filp->f_pos = di->offset;
-	err = filldir(dirent, dentry->d_name.name,
-		      dentry->d_name.len, di->offset,
+	ctx->pos = di->offset;
+	if (!dir_emit(ctx, dentry->d_name.name,
+		      dentry->d_name.len,
 		      ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
-		      dentry->d_inode->i_mode >> 12);
-
-	if (last) {
-		if (err < 0) {
+		      dentry->d_inode->i_mode >> 12)) {
+		if (last) {
 			/* remember our position */
 			fi->dentry = last;
 			fi->next_offset = di->offset;
-		} else {
-			dput(last);
 		}
+		dput(dentry);
+		return 0;
 	}
-	last = dentry;
 
-	if (err < 0)
-		goto out;
+	if (last)
+		dput(last);
+	last = dentry;
 
-	filp->f_pos++;
+	ctx->pos++;
 
 	/* make sure a dentry wasn't dropped while we didn't have parent lock */
 	if (!ceph_dir_is_complete(dir)) {
@@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
 	return 0;
 }
 
-static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct ceph_file_info *fi = filp->private_data;
-	struct inode *inode = file_inode(filp);
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-	unsigned frag = fpos_frag(filp->f_pos);
-	int off = fpos_off(filp->f_pos);
+	unsigned frag = fpos_frag(ctx->pos);
+	int off = fpos_off(ctx->pos);
 	int err;
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
 	const int max_entries = fsc->mount_options->max_readdir;
 	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
-	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+	dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
 	if (fi->flags & CEPH_F_ATEND)
 		return 0;
 
 	/* always start with . and .. */
-	if (filp->f_pos == 0) {
+	if (ctx->pos == 0) {
 		/* note dir version at start of readdir so we can tell
 		 * if any dentries get dropped */
 		fi->dir_release_count = atomic_read(&ci->i_release_count);
 
 		dout("readdir off 0 -> '.'\n");
-		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+		if (!dir_emit(ctx, ".", 1,
 			    ceph_translate_ino(inode->i_sb, inode->i_ino),
-			    inode->i_mode >> 12) < 0)
+			    inode->i_mode >> 12))
 			return 0;
-		filp->f_pos = 1;
+		ctx->pos = 1;
 		off = 1;
 	}
-	if (filp->f_pos == 1) {
-		ino_t ino = parent_ino(filp->f_dentry);
+	if (ctx->pos == 1) {
+		ino_t ino = parent_ino(file->f_dentry);
 		dout("readdir off 1 -> '..'\n");
-		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+		if (!dir_emit(ctx, "..", 2,
 			    ceph_translate_ino(inode->i_sb, ino),
-			    inode->i_mode >> 12) < 0)
+			    inode->i_mode >> 12))
 			return 0;
-		filp->f_pos = 2;
+		ctx->pos = 2;
 		off = 2;
 	}
 
 	/* can we use the dcache? */
 	spin_lock(&ci->i_ceph_lock);
-	if ((filp->f_pos == 2 || fi->dentry) &&
+	if ((ctx->pos == 2 || fi->dentry) &&
 	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
 	    __ceph_dir_is_complete(ci) &&
 	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
 		spin_unlock(&ci->i_ceph_lock);
-		err = __dcache_readdir(filp, dirent, filldir);
+		err = __dcache_readdir(file, ctx);
 		if (err != -EAGAIN)
 			return err;
 	} else {
@@ -327,7 +324,7 @@ more:
 			return PTR_ERR(req);
 		req->r_inode = inode;
 		ihold(inode);
-		req->r_dentry = dget(filp->f_dentry);
+		req->r_dentry = dget(file->f_dentry);
 		/* hints to request -> mds selection code */
 		req->r_direct_mode = USE_AUTH_MDS;
 		req->r_direct_hash = ceph_frag_value(frag);
@@ -379,15 +376,16 @@ more:
 	rinfo = &fi->last_readdir->r_reply_info;
 	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
 	     rinfo->dir_nr, off, fi->offset);
+
+	ctx->pos = ceph_make_fpos(frag, off);
 	while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
-		u64 pos = ceph_make_fpos(frag, off);
 		struct ceph_mds_reply_inode *in =
 			rinfo->dir_in[off - fi->offset].in;
 		struct ceph_vino vino;
 		ino_t ino;
 
 		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
-		     off, off - fi->offset, rinfo->dir_nr, pos,
+		     off, off - fi->offset, rinfo->dir_nr, ctx->pos,
 		     rinfo->dir_dname_len[off - fi->offset],
 		     rinfo->dir_dname[off - fi->offset], in);
 		BUG_ON(!in);
@@ -395,16 +393,15 @@ more:
 		vino.ino = le64_to_cpu(in->ino);
 		vino.snap = le64_to_cpu(in->snapid);
 		ino = ceph_vino_to_ino(vino);
-		if (filldir(dirent,
+		if (!dir_emit(ctx,
 			    rinfo->dir_dname[off - fi->offset],
 			    rinfo->dir_dname_len[off - fi->offset],
-			    pos,
-			    ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
+			    ceph_translate_ino(inode->i_sb, ino), ftype)) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
 		off++;
-		filp->f_pos = pos + 1;
+		ctx->pos++;
 	}
 
 	if (fi->last_name) {
@@ -417,7 +414,7 @@ more:
 	if (!ceph_frag_is_rightmost(frag)) {
 		frag = ceph_frag_next(frag);
 		off = 0;
-		filp->f_pos = ceph_make_fpos(frag, off);
+		ctx->pos = ceph_make_fpos(frag, off);
 		dout("readdir next frag is %x\n", frag);
 		goto more;
 	}
@@ -432,11 +429,11 @@ more:
 	if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
 		dout(" marking %p complete\n", inode);
 		__ceph_dir_set_complete(ci, fi->dir_release_count);
-		ci->i_max_offset = filp->f_pos;
+		ci->i_max_offset = ctx->pos;
 	}
 	spin_unlock(&ci->i_ceph_lock);
 
-	dout("readdir %p filp %p done.\n", inode, filp);
+	dout("readdir %p file %p done.\n", inode, file);
 	return 0;
 }
 
@@ -1268,7 +1265,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
 
 const struct file_operations ceph_dir_fops = {
 	.read = ceph_read_dir,
-	.readdir = ceph_readdir,
+	.iterate = ceph_readdir,
 	.llseek = ceph_dir_llseek,
 	.open = ceph_open,
 	.release = ceph_release,
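The conversion above moves ceph from the readdir/filldir callback pair to the iterate/dir_context API: the position lives in ctx->pos instead of file->f_pos, and dir_emit() returns false (rather than filldir's negative value) once the user buffer is full. A minimal ->iterate against that API, for orientation (EXAMPLE_INO is a stand-in inode number):

	static int example_iterate(struct file *file, struct dir_context *ctx)
	{
		if (ctx->pos == 0) {
			if (!dir_emit(ctx, "hello", 5, EXAMPLE_INO, DT_REG))
				return 0;	/* buffer full; resume at ctx->pos */
			ctx->pos = 1;
		}
		return 0;			/* end of directory */
	}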
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 656e16907430..16c989d3e23c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -866,16 +866,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 		break;
 	}
 
-	if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
-		offset = -EINVAL;
-		goto out;
-	}
-
-	/* Special lock needed here? */
-	if (offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
 	mutex_unlock(&inode->i_mutex);
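vfs_setpos(), added to fs/read_write.c in this same series, folds the range check and the f_pos/f_version update into one helper and returns -EINVAL for an out-of-range offset, which is what the deleted open-coded block used to do:

	/* validates the offset against maxsize, then updates f_pos/f_version */
	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);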
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 202dd3d68be0..690f73f42425 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -191,27 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 }
 
 /**
- * Encode the flock and fcntl locks for the given inode into the pagelist.
- * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
- * sequential flock locks.
- * Must be called with lock_flocks() already held.
- * If we encounter more of a specific lock type than expected,
- * we return the value 1.
+ * Encode the flock and fcntl locks for the given inode into the ceph_filelock
+ * array. Must be called with inode->i_lock already held.
+ * If we encounter more of a specific lock type than expected, return -ENOSPC.
  */
-int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
-		      int num_fcntl_locks, int num_flock_locks)
+int ceph_encode_locks_to_buffer(struct inode *inode,
+				struct ceph_filelock *flocks,
+				int num_fcntl_locks, int num_flock_locks)
 {
 	struct file_lock *lock;
-	struct ceph_filelock cephlock;
 	int err = 0;
 	int seen_fcntl = 0;
 	int seen_flock = 0;
+	int l = 0;
 
 	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
 	     num_fcntl_locks);
-	err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
-	if (err)
-		goto fail;
+
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_POSIX) {
 			++seen_fcntl;
@@ -219,19 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 				err = -ENOSPC;
 				goto fail;
 			}
-			err = lock_to_ceph_filelock(lock, &cephlock);
+			err = lock_to_ceph_filelock(lock, &flocks[l]);
 			if (err)
 				goto fail;
-			err = ceph_pagelist_append(pagelist, &cephlock,
-					   sizeof(struct ceph_filelock));
+			++l;
 		}
-		if (err)
-			goto fail;
 	}
-
-	err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
-	if (err)
-		goto fail;
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_FLOCK) {
 			++seen_flock;
@@ -239,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 				err = -ENOSPC;
 				goto fail;
 			}
-			err = lock_to_ceph_filelock(lock, &cephlock);
+			err = lock_to_ceph_filelock(lock, &flocks[l]);
 			if (err)
 				goto fail;
-			err = ceph_pagelist_append(pagelist, &cephlock,
-					   sizeof(struct ceph_filelock));
+			++l;
 		}
-		if (err)
-			goto fail;
 	}
 fail:
 	return err;
 }
 
+/**
+ * Copy the encoded flock and fcntl locks into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Returns zero on success.
+ */
+int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+			   struct ceph_pagelist *pagelist,
+			   int num_fcntl_locks, int num_flock_locks)
+{
+	int err = 0;
+	__le32 nlocks;
+
+	nlocks = cpu_to_le32(num_fcntl_locks);
+	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+	if (err)
+		goto out_fail;
+
+	err = ceph_pagelist_append(pagelist, flocks,
+				   num_fcntl_locks * sizeof(*flocks));
+	if (err)
+		goto out_fail;
+
+	nlocks = cpu_to_le32(num_flock_locks);
+	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+	if (err)
+		goto out_fail;
+
+	err = ceph_pagelist_append(pagelist,
+				   &flocks[num_fcntl_locks],
+				   num_flock_locks * sizeof(*flocks));
+out_fail:
+	return err;
+}
+
 /*
  * Given a pointer to a lock, convert it to a ceph filelock
  */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4f22671a5bd4..74fd2898b2ab 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2478,39 +2478,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
 	if (recon_state->flock) {
 		int num_fcntl_locks, num_flock_locks;
-		struct ceph_pagelist_cursor trunc_point;
+		struct ceph_filelock *flocks;
 
-		ceph_pagelist_set_cursor(pagelist, &trunc_point);
-		do {
-			lock_flocks();
-			ceph_count_locks(inode, &num_fcntl_locks,
-					 &num_flock_locks);
-			rec.v2.flock_len = (2*sizeof(u32) +
-					    (num_fcntl_locks+num_flock_locks) *
-					    sizeof(struct ceph_filelock));
-			unlock_flocks();
-
-			/* pre-alloc pagelist */
-			ceph_pagelist_truncate(pagelist, &trunc_point);
-			err = ceph_pagelist_append(pagelist, &rec, reclen);
-			if (!err)
-				err = ceph_pagelist_reserve(pagelist,
-							    rec.v2.flock_len);
-
-			/* encode locks */
-			if (!err) {
-				lock_flocks();
-				err = ceph_encode_locks(inode,
-							pagelist,
-							num_fcntl_locks,
-							num_flock_locks);
-				unlock_flocks();
-			}
-		} while (err == -ENOSPC);
+encode_again:
+		spin_lock(&inode->i_lock);
+		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+		spin_unlock(&inode->i_lock);
+		flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
+				 sizeof(struct ceph_filelock), GFP_NOFS);
+		if (!flocks) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+		spin_lock(&inode->i_lock);
+		err = ceph_encode_locks_to_buffer(inode, flocks,
+						  num_fcntl_locks,
+						  num_flock_locks);
+		spin_unlock(&inode->i_lock);
+		if (err) {
+			kfree(flocks);
+			if (err == -ENOSPC)
+				goto encode_again;
+			goto out_free;
+		}
+		/*
+		 * number of encoded locks is stable, so copy to pagelist
+		 */
+		rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
+				    (num_fcntl_locks+num_flock_locks) *
+				    sizeof(struct ceph_filelock));
+		err = ceph_pagelist_append(pagelist, &rec, reclen);
+		if (!err)
+			err = ceph_locks_to_pagelist(flocks, pagelist,
+						     num_fcntl_locks,
+						     num_flock_locks);
+		kfree(flocks);
 	} else {
 		err = ceph_pagelist_append(pagelist, &rec, reclen);
 	}
-
 out_free:
 	kfree(path);
 out_dput:
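The reconnect path now snapshots lock state optimistically: count under inode->i_lock, allocate the flocks array unlocked, re-take the lock and encode; if locks were added in the unlocked window, ceph_encode_locks_to_buffer() reports -ENOSPC and the sequence repeats. The core loop, reduced to a sketch of that pattern:

	for (;;) {
		spin_lock(&inode->i_lock);
		ceph_count_locks(inode, &nfcntl, &nflock);
		spin_unlock(&inode->i_lock);

		flocks = kmalloc((nfcntl + nflock) * sizeof(*flocks), GFP_NOFS);
		if (!flocks)
			return -ENOMEM;

		spin_lock(&inode->i_lock);
		err = ceph_encode_locks_to_buffer(inode, flocks, nfcntl, nflock);
		spin_unlock(&inode->i_lock);
		if (err != -ENOSPC)
			break;		/* stable snapshot (or hard error) */
		kfree(flocks);		/* count changed under us: retry */
	}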
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 8696be2ff679..7ccfdb4aea2e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -822,8 +822,13 @@ extern const struct export_operations ceph_export_ops;
 extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
 extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
 extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
-extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
-			     int p_locks, int f_locks);
+extern int ceph_encode_locks_to_buffer(struct inode *inode,
+				       struct ceph_filelock *flocks,
+				       int num_fcntl_locks,
+				       int num_flock_locks);
+extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+				  struct ceph_pagelist *pagelist,
+				  int num_fcntl_locks, int num_flock_locks);
 extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
 
 /* debugfs.c */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 8e33ec65847b..58df174deb10 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/vfs.h> 19#include <linux/vfs.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/inet.h>
21#include "cifsglob.h" 22#include "cifsglob.h"
22#include "cifsproto.h" 23#include "cifsproto.h"
23#include "cifsfs.h" 24#include "cifsfs.h"
@@ -48,58 +49,74 @@ void cifs_dfs_release_automount_timer(void)
48} 49}
49 50
50/** 51/**
51 * cifs_get_share_name - extracts share name from UNC 52 * cifs_build_devname - build a devicename from a UNC and optional prepath
52 * @node_name: pointer to UNC string 53 * @nodename: pointer to UNC string
54 * @prepath: pointer to prefixpath (or NULL if there isn't one)
53 * 55 *
54 * Extracts sharename form full UNC. 56 * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer
55 * i.e. strips from UNC trailing path that is not part of share 57 * big enough to hold the final thing. Copy the UNC from the nodename, and
56 * name and fixup missing '\' in the beginning of DFS node refferal 58 * concatenate the prepath onto the end of it if there is one.
57 * if necessary. 59 *
58 * Returns pointer to share name on success or ERR_PTR on error. 60 * Returns pointer to the built string, or a ERR_PTR. Caller is responsible
59 * Caller is responsible for freeing returned string. 61 * for freeing the returned string.
60 */ 62 */
61static char *cifs_get_share_name(const char *node_name) 63static char *
64cifs_build_devname(char *nodename, const char *prepath)
62{ 65{
63 int len; 66 size_t pplen;
64 char *UNC; 67 size_t unclen;
65 char *pSep; 68 char *dev;
66 69 char *pos;
67 len = strlen(node_name); 70
68 UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, 71 /* skip over any preceding delimiters */
69 GFP_KERNEL); 72 nodename += strspn(nodename, "\\");
70 if (!UNC) 73 if (!*nodename)
71 return ERR_PTR(-ENOMEM); 74 return ERR_PTR(-EINVAL);
72 75
73 /* get share name and server name */ 76 /* get length of UNC and set pos to last char */
74 if (node_name[1] != '\\') { 77 unclen = strlen(nodename);
75 UNC[0] = '\\'; 78 pos = nodename + unclen - 1;
76 strncpy(UNC+1, node_name, len);
77 len++;
78 UNC[len] = 0;
79 } else {
80 strncpy(UNC, node_name, len);
81 UNC[len] = 0;
82 }
83 79
84 /* find server name end */ 80 /* trim off any trailing delimiters */
85 pSep = memchr(UNC+2, '\\', len-2); 81 while (*pos == '\\') {
86 if (!pSep) { 82 --pos;
87 cifs_dbg(VFS, "%s: no server name end in node name: %s\n", 83 --unclen;
88 __func__, node_name);
89 kfree(UNC);
90 return ERR_PTR(-EINVAL);
91 } 84 }
92 85
93 /* find sharename end */ 86 /* allocate a buffer:
94 pSep++; 87 * +2 for preceding "//"
95 pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC)); 88 * +1 for delimiter between UNC and prepath
96 if (pSep) { 89 * +1 for trailing NULL
97 /* trim path up to sharename end 90 */
98 * now we have share name in UNC */ 91 pplen = prepath ? strlen(prepath) : 0;
99 *pSep = 0; 92 dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL);
93 if (!dev)
94 return ERR_PTR(-ENOMEM);
95
96 pos = dev;
97 /* add the initial "//" */
98 *pos = '/';
99 ++pos;
100 *pos = '/';
101 ++pos;
102
103 /* copy in the UNC portion from referral */
104 memcpy(pos, nodename, unclen);
105 pos += unclen;
106
107 /* copy the prefixpath remainder (if there is one) */
108 if (pplen) {
109 *pos = '/';
110 ++pos;
111 memcpy(pos, prepath, pplen);
112 pos += pplen;
100 } 113 }
101 114
102 return UNC; 115 /* NULL terminator */
116 *pos = '\0';
117
118 convert_delimiter(dev, '/');
119 return dev;
103} 120}
104 121
105 122
@@ -123,6 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 {
 	int rc;
 	char *mountdata = NULL;
+	const char *prepath = NULL;
 	int md_len;
 	char *tkn_e;
 	char *srvIP = NULL;
@@ -132,7 +150,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 	if (sb_mountdata == NULL)
 		return ERR_PTR(-EINVAL);
 
-	*devname = cifs_get_share_name(ref->node_name);
+	if (strlen(fullpath) - ref->path_consumed)
+		prepath = fullpath + ref->path_consumed;
+
+	*devname = cifs_build_devname(ref->node_name, prepath);
 	if (IS_ERR(*devname)) {
 		rc = PTR_ERR(*devname);
 		*devname = NULL;
@@ -146,12 +167,14 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 		goto compose_mount_options_err;
 	}
 
-	/* md_len = strlen(...) + 12 for 'sep+prefixpath='
-	 * assuming that we have 'unc=' and 'ip=' in
-	 * the original sb_mountdata
+	/*
+	 * In most cases, we'll be building a shorter string than the original,
+	 * but we do have to assume that the address in the ip= option may be
+	 * much longer than the original. Add the max length of an address
+	 * string to the length of the original string to allow for worst case.
 	 */
-	md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
-	mountdata = kzalloc(md_len+1, GFP_KERNEL);
+	md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
+	mountdata = kzalloc(md_len + 1, GFP_KERNEL);
 	if (mountdata == NULL) {
 		rc = -ENOMEM;
 		goto compose_mount_options_err;
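The worst-case arithmetic above leans on the fact that a textual IPv6 address never exceeds INET6_ADDRSTRLEN bytes. A tiny userspace demonstration of the sizing rule, with a hypothetical options string:

#include <netinet/in.h>		/* INET6_ADDRSTRLEN (46, incl. NUL) */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *sb_mountdata = "sec=ntlmssp,rsize=1048576";
	size_t md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;

	/* +1 for the NUL, mirroring the kzalloc(md_len + 1, ...) above */
	printf("allocate %zu bytes for the submount options\n", md_len + 1);
	return 0;
}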
@@ -195,26 +218,6 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 	strncat(mountdata, &sep, 1);
 	strcat(mountdata, "ip=");
 	strcat(mountdata, srvIP);
-	strncat(mountdata, &sep, 1);
-	strcat(mountdata, "unc=");
-	strcat(mountdata, *devname);
-
-	/* find & copy prefixpath */
-	tkn_e = strchr(ref->node_name + 2, '\\');
-	if (tkn_e == NULL) {
-		/* invalid unc, missing share name*/
-		rc = -EINVAL;
-		goto compose_mount_options_err;
-	}
-
-	tkn_e = strchr(tkn_e + 1, '\\');
-	if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
-		strncat(mountdata, &sep, 1);
-		strcat(mountdata, "prefixpath=");
-		if (tkn_e)
-			strcat(mountdata, tkn_e + 1);
-		strcat(mountdata, fullpath + ref->path_consumed);
-	}
 
 	/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
 	/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 72e4efee1389..a445e71746fa 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -372,9 +372,6 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	cifs_show_security(s, tcon->ses->server);
 	cifs_show_cache_flavor(s, cifs_sb);
 
-	seq_printf(s, ",unc=");
-	seq_escape(s, tcon->treeName, " \t\n\\");
-
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
 		seq_printf(s, ",multiuser");
 	else if (tcon->ses->user_name)
@@ -768,7 +765,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-	/* note that this is called by vfs setlease with lock_flocks held
+	/* note that this is called by vfs setlease with i_lock held
 	   to protect *lease from going away */
 	struct inode *inode = file_inode(file);
 	struct cifsFileInfo *cfile = file->private_data;
@@ -971,7 +968,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 };
 
 const struct file_operations cifs_dir_ops = {
-	.readdir = cifs_readdir,
+	.iterate = cifs_readdir,
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
 	.unlocked_ioctl = cifs_ioctl,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0e32c3446ce9..d05b3028e3b9 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -101,7 +101,7 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
-extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
+extern int cifs_readdir(struct file *file, struct dir_context *ctx);
 
 /* Functions related to dir entries */
 extern const struct dentry_operations cifs_dentry_ops;
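The new prototype reflects the ->iterate contract that replaces ->readdir throughout this series: the filesystem emits entries through dir_emit(), advances ctx->pos itself, and returns quietly when the caller's buffer fills. A minimal sketch (not from the patch; "demofs" and its static name table are hypothetical):

#include <linux/fs.h>

static int demofs_iterate(struct file *file, struct dir_context *ctx)
{
	static const char * const names[] = { "alpha", "beta" };
	int i;

	if (!dir_emit_dots(file, ctx))	/* handles pos 0 (".") and 1 ("..") */
		return 0;
	for (i = ctx->pos - 2; i < 2; i++) {
		/* a false return means the getdents buffer is full */
		if (!dir_emit(ctx, names[i], strlen(names[i]),
			      1000 + i, DT_REG))
			return 0;
		ctx->pos++;
	}
	return 0;
}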
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 99eeaa17ee00..e3bc39bb9d12 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1061,6 +1061,7 @@ static int cifs_parse_security_flavors(char *value,
 #endif
 	case Opt_sec_none:
 		vol->nullauth = 1;
+		vol->secFlg |= CIFSSEC_MAY_NTLM;
 		break;
 	default:
 		cifs_dbg(VFS, "bad security option: %s\n", value);
@@ -1257,14 +1258,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 	vol->backupuid_specified = false; /* no backup intent for a user */
 	vol->backupgid_specified = false; /* no backup intent for a group */
 
-	/*
-	 * For now, we ignore -EINVAL errors under the assumption that the
-	 * unc= and prefixpath= options will be usable.
-	 */
-	if (cifs_parse_devname(devname, vol) == -ENOMEM) {
-		printk(KERN_ERR "CIFS: Unable to allocate memory to parse "
-				"device string.\n");
-		goto out_nomem;
+	switch (cifs_parse_devname(devname, vol)) {
+	case 0:
+		break;
+	case -ENOMEM:
+		cifs_dbg(VFS, "Unable to allocate memory for devname.\n");
+		goto cifs_parse_mount_err;
+	case -EINVAL:
+		cifs_dbg(VFS, "Malformed UNC in devname.\n");
+		goto cifs_parse_mount_err;
+	default:
+		cifs_dbg(VFS, "Unknown error parsing devname.\n");
+		goto cifs_parse_mount_err;
 	}
 
 	while ((data = strsep(&options, separator)) != NULL) {
@@ -1826,7 +1831,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 	}
 #endif
 	if (!vol->UNC) {
-		cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string or in unc= option!\n");
+		cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n");
 		goto cifs_parse_mount_err;
 	}
 
@@ -3274,8 +3279,8 @@ build_unc_path_to_root(const struct smb_vol *vol,
 	pos = full_path + unc_len;
 
 	if (pplen) {
-		*pos++ = CIFS_DIR_SEP(cifs_sb);
-		strncpy(pos, vol->prepath, pplen);
+		*pos = CIFS_DIR_SEP(cifs_sb);
+		strncpy(pos + 1, vol->prepath, pplen);
 		pos += pplen;
 	}
 
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5699b5036ed8..5175aebf6737 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -822,8 +822,7 @@ const struct dentry_operations cifs_dentry_ops = {
 /* d_delete: cifs_d_delete, */ /* not needed except for debugging */
 };
 
-static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
 {
 	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
 	unsigned long hash;
@@ -838,12 +837,10 @@ static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
 	return 0;
 }
 
-static int cifs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
+	struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
 
 	if ((name->len == len) &&
 	    (nls_strnicmp(codepage, name->name, str, len) == 0))
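The slimmed-down signature still boils down to a length check plus a codepage-aware case-insensitive compare. A userspace sketch of the same comparison, with plain ASCII strncasecmp standing in for nls_strnicmp (illustration only):

#include <stdio.h>
#include <string.h>
#include <strings.h>

static int ci_compare(const char *str, unsigned int len,
		      const char *name, unsigned int name_len)
{
	/* 0 means "match", mirroring d_compare's return convention */
	if (name_len == len && strncasecmp(name, str, len) == 0)
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", ci_compare("FILE.TXT", 8, "file.txt", 8)); /* 0 */
	return 0;
}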
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index e7512e497611..7ede7306599f 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -34,7 +34,7 @@
 
 /**
  * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
- * @unc: UNC path specifying the server
+ * @unc: UNC path specifying the server (with '/' as delimiter)
  * @ip_addr: Where to return the IP address.
  *
  * The IP address will be returned in string form, and the caller is
@@ -64,7 +64,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 	hostname = unc + 2;
 
 	/* Search for server name delimiter */
-	sep = memchr(hostname, '\\', len);
+	sep = memchr(hostname, '/', len);
 	if (sep)
 		len = sep - hostname;
 	else
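Since devnames now carry '/' delimiters, the hostname is whatever sits between the leading "//" and the next '/'. A runnable userspace sketch of that extraction (the UNC is a made-up example):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *unc = "//server.example.com/share/dir";
	const char *hostname = unc + 2;		/* skip the leading "//" */
	size_t len = strlen(hostname);
	const char *sep = memchr(hostname, '/', len);

	if (sep)
		len = sep - hostname;		/* cut at the delimiter */
	printf("%.*s\n", (int)len, hostname);	/* server.example.com */
	return 0;
}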
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 48b29d24c9f4..91d8629e69a2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -999,7 +999,7 @@ try_again:
 		rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
 		if (!rc)
 			goto try_again;
-		locks_delete_block(flock);
+		posix_unblock_lock(flock);
 	}
 	return rc;
 }
@@ -1092,6 +1092,7 @@ struct lock_to_push {
 static int
 cifs_push_posix_locks(struct cifsFileInfo *cfile)
 {
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	struct file_lock *flock, **before;
 	unsigned int count = 0, i = 0;
@@ -1102,12 +1103,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 
 	xid = get_xid();
 
-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		if ((*before)->fl_flags & FL_POSIX)
 			count++;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	INIT_LIST_HEAD(&locks_to_send);
 
@@ -1126,8 +1127,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 	}
 
 	el = locks_to_send.next;
-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		flock = *before;
 		if ((flock->fl_flags & FL_POSIX) == 0)
 			continue;
@@ -1152,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 		lck->offset = flock->fl_start;
 		el = el->next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
 		int stored_rc;
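The conversion keeps cifs's two-pass pattern intact: count under the (now per-inode) spinlock, allocate with the lock dropped because allocation may sleep, then retake the lock to copy the data out. A runnable userspace analog of that pattern, with hypothetical list and lock names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int items[] = { 10, 20, 30, 40 };
static int nitems = 4;

int main(void)
{
	int count = 0, i, *copy;

	pthread_mutex_lock(&list_lock);		/* pass 1: count only */
	for (i = 0; i < nitems; i++)
		count++;
	pthread_mutex_unlock(&list_lock);

	copy = malloc(count * sizeof(*copy));	/* allocate unlocked */
	if (!copy)
		return 1;

	pthread_mutex_lock(&list_lock);		/* pass 2: copy the data */
	for (i = 0; i < count && i < nitems; i++)
		copy[i] = items[i];
	pthread_mutex_unlock(&list_lock);

	printf("copied %d entries\n", count);
	free(copy);
	return 0;
}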
@@ -3546,11 +3547,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
 	return cifs_fscache_release_page(page, gfp);
 }
 
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index fc3025199cb3..20efd81266c6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -171,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 
 	if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
 		inode->i_flags |= S_AUTOMOUNT;
-	cifs_set_ops(inode);
+	if (inode->i_state & I_NEW)
+		cifs_set_ops(inode);
 }
 
 void
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 770d5a9781c1..f1213799de1a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -537,14 +537,14 @@ static int cifs_save_resume_key(const char *current_entry,
  * every entry (do not increment for . or .. entry).
  */
 static int
-find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
+find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
 		struct file *file, char **current_entry, int *num_to_ret)
 {
 	__u16 search_flags;
 	int rc = 0;
 	int pos_in_buf = 0;
 	loff_t first_entry_in_buffer;
-	loff_t index_to_find = file->f_pos;
+	loff_t index_to_find = pos;
 	struct cifsFileInfo *cfile = file->private_data;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	struct TCP_Server_Info *server = tcon->ses->server;
@@ -659,8 +659,9 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 	return rc;
 }
 
-static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
-		void *dirent, char *scratch_buf, unsigned int max_len)
+static int cifs_filldir(char *find_entry, struct file *file,
+		struct dir_context *ctx,
+		char *scratch_buf, unsigned int max_len)
 {
 	struct cifsFileInfo *file_info = file->private_data;
 	struct super_block *sb = file->f_path.dentry->d_sb;
@@ -740,13 +741,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
 	cifs_prime_dcache(file->f_dentry, &name, &fattr);
 
 	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
-	rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
-		     fattr.cf_dtype);
-	return rc;
+	return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
 }
 
 
-int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
+int cifs_readdir(struct file *file, struct dir_context *ctx)
 {
 	int rc = 0;
 	unsigned int xid;
@@ -772,103 +771,86 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		goto rddir2_exit;
 	}
 
-	switch ((int) file->f_pos) {
-	case 0:
-		if (filldir(direntry, ".", 1, file->f_pos,
-		     file_inode(file)->i_ino, DT_DIR) < 0) {
-			cifs_dbg(VFS, "Filldir for current dir failed\n");
-			rc = -ENOMEM;
-			break;
-		}
-		file->f_pos++;
-	case 1:
-		if (filldir(direntry, "..", 2, file->f_pos,
-		     parent_ino(file->f_path.dentry), DT_DIR) < 0) {
-			cifs_dbg(VFS, "Filldir for parent dir failed\n");
-			rc = -ENOMEM;
-			break;
-		}
-		file->f_pos++;
-	default:
-		/* 1) If search is active,
-			is in current search buffer?
-			if it before then restart search
-			if after then keep searching till find it */
-
-		if (file->private_data == NULL) {
-			rc = -EINVAL;
-			free_xid(xid);
-			return rc;
-		}
-		cifsFile = file->private_data;
-		if (cifsFile->srch_inf.endOfSearch) {
-			if (cifsFile->srch_inf.emptyDir) {
-				cifs_dbg(FYI, "End of search, empty dir\n");
-				rc = 0;
-				break;
-			}
-		} /* else {
-			cifsFile->invalidHandle = true;
-			tcon->ses->server->close(xid, tcon, &cifsFile->fid);
-		} */
+	if (!dir_emit_dots(file, ctx))
+		goto rddir2_exit;
 
-		tcon = tlink_tcon(cifsFile->tlink);
-		rc = find_cifs_entry(xid, tcon, file, &current_entry,
-				     &num_to_fill);
-		if (rc) {
-			cifs_dbg(FYI, "fce error %d\n", rc);
-			goto rddir2_exit;
-		} else if (current_entry != NULL) {
-			cifs_dbg(FYI, "entry %lld found\n", file->f_pos);
-		} else {
-			cifs_dbg(FYI, "could not find entry\n");
+	/* 1) If search is active,
+		is in current search buffer?
+		if it before then restart search
+		if after then keep searching till find it */
+
+	if (file->private_data == NULL) {
+		rc = -EINVAL;
+		goto rddir2_exit;
+	}
+	cifsFile = file->private_data;
+	if (cifsFile->srch_inf.endOfSearch) {
+		if (cifsFile->srch_inf.emptyDir) {
+			cifs_dbg(FYI, "End of search, empty dir\n");
+			rc = 0;
 			goto rddir2_exit;
 		}
-		cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
-			 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
-		max_len = tcon->ses->server->ops->calc_smb_size(
-				cifsFile->srch_inf.ntwrk_buf_start);
-		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
-
-		tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
-		if (tmp_buf == NULL) {
-			rc = -ENOMEM;
+	} /* else {
+		cifsFile->invalidHandle = true;
+		tcon->ses->server->close(xid, tcon, &cifsFile->fid);
+	} */
+
+	tcon = tlink_tcon(cifsFile->tlink);
+	rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
+			     &num_to_fill);
+	if (rc) {
+		cifs_dbg(FYI, "fce error %d\n", rc);
+		goto rddir2_exit;
+	} else if (current_entry != NULL) {
+		cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
+	} else {
+		cifs_dbg(FYI, "could not find entry\n");
+		goto rddir2_exit;
+	}
+	cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
+		 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
+	max_len = tcon->ses->server->ops->calc_smb_size(
+			cifsFile->srch_inf.ntwrk_buf_start);
+	end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
+	tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+	if (tmp_buf == NULL) {
+		rc = -ENOMEM;
+		goto rddir2_exit;
+	}
+
+	for (i = 0; i < num_to_fill; i++) {
+		if (current_entry == NULL) {
+			/* evaluate whether this case is an error */
+			cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
+				 num_to_fill, i);
 			break;
 		}
-
-		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
-			if (current_entry == NULL) {
-				/* evaluate whether this case is an error */
-				cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
-					 num_to_fill, i);
-				break;
-			}
-			/*
-			 * if buggy server returns . and .. late do we want to
-			 * check for that here?
-			 */
-			rc = cifs_filldir(current_entry, file, filldir,
-					  direntry, tmp_buf, max_len);
-			if (rc == -EOVERFLOW) {
+		/*
+		 * if buggy server returns . and .. late do we want to
+		 * check for that here?
+		 */
+		rc = cifs_filldir(current_entry, file, ctx,
+				  tmp_buf, max_len);
+		if (rc) {
+			if (rc > 0)
 				rc = 0;
-				break;
-			}
-
-			file->f_pos++;
-			if (file->f_pos ==
-				cifsFile->srch_inf.index_of_last_entry) {
-				cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
-					 file->f_pos, tmp_buf);
-				cifs_save_resume_key(current_entry, cifsFile);
-				break;
-			} else
-				current_entry =
-					nxt_dir_entry(current_entry, end_of_smb,
-						cifsFile->srch_inf.info_level);
-		}
-		kfree(tmp_buf);
-		break;
-	} /* end switch */
+			break;
+		}
+
+		ctx->pos++;
+		if (ctx->pos ==
+			cifsFile->srch_inf.index_of_last_entry) {
+			cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
+				 ctx->pos, tmp_buf);
+			cifs_save_resume_key(current_entry, cifsFile);
+			break;
+		} else
+			current_entry =
+				nxt_dir_entry(current_entry, end_of_smb,
+					cifsFile->srch_inf.info_level);
+	}
+	kfree(tmp_buf);
 
 rddir2_exit:
 	free_xid(xid);
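Note the inverted convention in the new loop: dir_emit() returns true while the caller's buffer still has room, so cifs_filldir's "return !dir_emit(...)" yields 0 on success and nonzero when iteration should stop. A compact runnable model of that contract (the buffer capacity is invented for the demo):

#include <stdbool.h>
#include <stdio.h>

static int space = 2;		/* pretend the getdents buffer fits 2 names */

static bool dir_emit_model(const char *name)
{
	if (space == 0)
		return false;	/* full: caller must stop iterating */
	space--;
	printf("emitted %s\n", name);
	return true;
}

int main(void)
{
	const char *names[] = { "a", "b", "c" };
	for (int i = 0; i < 3; i++)
		if (!dir_emit_model(names[i]))
			break;	/* "c" is retried on the next getdents call */
	return 0;
}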
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index b7d3a05c062c..14a14808320c 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -43,15 +43,14 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
 		       struct inode *new_inode, struct dentry *new_dentry);
 
 /* dir file-ops */
-static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
+static int coda_readdir(struct file *file, struct dir_context *ctx);
 
 /* dentry ops */
 static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
 static int coda_dentry_delete(const struct dentry *);
 
 /* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
-			      filldir_t filldir);
+static int coda_venus_readdir(struct file *, struct dir_context *);
 
 /* same as fs/bad_inode.c */
 static int coda_return_EIO(void)
@@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations =
 const struct file_operations coda_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= coda_readdir,
+	.iterate	= coda_readdir,
 	.open		= coda_open,
 	.release	= coda_release,
 	.fsync		= coda_fsync,
@@ -378,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 
 /* file operations for directories */
-static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
+static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
 {
 	struct coda_file_info *cfi;
 	struct file *host_file;
@@ -391,30 +390,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
 	if (!host_file->f_op)
 		return -ENOTDIR;
 
-	if (host_file->f_op->readdir)
-	{
-		/* potemkin case: we were handed a directory inode.
-		 * We can't use vfs_readdir because we have to keep the file
-		 * position in sync between the coda_file and the host_file.
-		 * and as such we need grab the inode mutex. */
+	if (host_file->f_op->iterate) {
 		struct inode *host_inode = file_inode(host_file);
-
 		mutex_lock(&host_inode->i_mutex);
-		host_file->f_pos = coda_file->f_pos;
-
 		ret = -ENOENT;
 		if (!IS_DEADDIR(host_inode)) {
-			ret = host_file->f_op->readdir(host_file, buf, filldir);
+			ret = host_file->f_op->iterate(host_file, ctx);
 			file_accessed(host_file);
 		}
-
-		coda_file->f_pos = host_file->f_pos;
 		mutex_unlock(&host_inode->i_mutex);
+		return ret;
 	}
-	else /* Venus: we must read Venus dirents from a file */
-		ret = coda_venus_readdir(coda_file, buf, filldir);
-
-	return ret;
+	/* Venus: we must read Venus dirents from a file */
+	return coda_venus_readdir(coda_file, ctx);
 }
 
 static inline unsigned int CDT2DT(unsigned char cdt)
@@ -437,10 +425,8 @@ static inline unsigned int CDT2DT(unsigned char cdt)
 }
 
 /* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
-			      filldir_t filldir)
+static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
 {
-	int result = 0; /* # of entries returned */
 	struct coda_file_info *cfi;
 	struct coda_inode_info *cii;
 	struct file *host_file;
@@ -462,23 +448,12 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
 	vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
 	if (!vdir) return -ENOMEM;
 
-	if (coda_file->f_pos == 0) {
-		ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR);
-		if (ret < 0)
-			goto out;
-		result++;
-		coda_file->f_pos++;
-	}
-	if (coda_file->f_pos == 1) {
-		ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR);
-		if (ret < 0)
-			goto out;
-		result++;
-		coda_file->f_pos++;
-	}
+	if (!dir_emit_dots(coda_file, ctx))
+		goto out;
+
 	while (1) {
 		/* read entries from the directory file */
-		ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir,
+		ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
 				  sizeof(*vdir));
 		if (ret < 0) {
 			printk(KERN_ERR "coda readdir: read dir %s failed %d\n",
@@ -507,32 +482,23 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
 
 		/* Make sure we skip '.' and '..', we already got those */
 		if (name.name[0] == '.' && (name.len == 1 ||
-		    (vdir->d_name[1] == '.' && name.len == 2)))
+		    (name.name[1] == '.' && name.len == 2)))
 			vdir->d_fileno = name.len = 0;
 
 		/* skip null entries */
 		if (vdir->d_fileno && name.len) {
-			/* try to look up this entry in the dcache, that way
-			 * userspace doesn't have to worry about breaking
-			 * getcwd by having mismatched inode numbers for
-			 * internal volume mountpoints. */
-			ino = find_inode_number(de, &name);
-			if (!ino) ino = vdir->d_fileno;
-
+			ino = vdir->d_fileno;
 			type = CDT2DT(vdir->d_type);
-			ret = filldir(buf, name.name, name.len,
-				      coda_file->f_pos, ino, type);
-			/* failure means no space for filling in this round */
-			if (ret < 0) break;
-			result++;
+			if (!dir_emit(ctx, name.name, name.len, ino, type))
+				break;
 		}
 		/* we'll always have progress because d_reclen is unsigned and
 		 * we've already established it is non-zero. */
-		coda_file->f_pos += vdir->d_reclen;
+		ctx->pos += vdir->d_reclen;
 	}
 out:
 	kfree(vdir);
-	return result ? result : ret;
+	return 0;
 }
 
 /* called when a cache lookup succeeds */
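coda keeps the usual "dots occupy positions 0 and 1" convention: after dir_emit_dots(), ctx->pos is at least 2, so the backing directory file is read at ctx->pos - 2. A trivially runnable model of that offset mapping:

#include <stdio.h>

int main(void)
{
	long long pos = 2;		/* first real entry after "." and ".." */
	long long file_offset = pos - 2;

	printf("directory pos %lld reads the host file at offset %lld\n",
	       pos, file_offset);
	return 0;
}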
diff --git a/fs/compat.c b/fs/compat.c
index fc3b55dce184..6af20de2c1a3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -832,6 +832,7 @@ struct compat_old_linux_dirent {
 };
 
 struct compat_readdir_callback {
+	struct dir_context ctx;
 	struct compat_old_linux_dirent __user *dirent;
 	int result;
 };
@@ -873,15 +874,15 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
 {
 	int error;
 	struct fd f = fdget(fd);
-	struct compat_readdir_callback buf;
+	struct compat_readdir_callback buf = {
+		.ctx.actor = compat_fillonedir,
+		.dirent = dirent
+	};
 
 	if (!f.file)
 		return -EBADF;
 
-	buf.result = 0;
-	buf.dirent = dirent;
-
-	error = vfs_readdir(f.file, compat_fillonedir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
@@ -897,6 +898,7 @@ struct compat_linux_dirent {
 };
 
 struct compat_getdents_callback {
+	struct dir_context ctx;
 	struct compat_linux_dirent __user *current_dir;
 	struct compat_linux_dirent __user *previous;
 	int count;
@@ -951,7 +953,11 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 {
 	struct fd f;
 	struct compat_linux_dirent __user * lastdirent;
-	struct compat_getdents_callback buf;
+	struct compat_getdents_callback buf = {
+		.ctx.actor = compat_filldir,
+		.current_dir = dirent,
+		.count = count
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -961,17 +967,12 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, compat_filldir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(f.file->f_pos, &lastdirent->d_off))
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
@@ -983,6 +984,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 #ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
 
 struct compat_getdents_callback64 {
+	struct dir_context ctx;
 	struct linux_dirent64 __user *current_dir;
 	struct linux_dirent64 __user *previous;
 	int count;
@@ -1036,7 +1038,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
-	struct compat_getdents_callback64 buf;
+	struct compat_getdents_callback64 buf = {
+		.ctx.actor = compat_filldir64,
+		.current_dir = dirent,
+		.count = count
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -1046,17 +1052,12 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, compat_filldir64, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = f.file->f_pos;
+		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
 		if (__put_user_unaligned(d_off, &lastdirent->d_off))
 			error = -EFAULT;
 		else
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 996cdc5abb85..5d19acfa7c6c 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -66,7 +66,6 @@
 #include <linux/gigaset_dev.h>
 
 #ifdef CONFIG_BLOCK
-#include <linux/loop.h>
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <scsi/scsi.h>
@@ -954,8 +953,6 @@ COMPATIBLE_IOCTL(MTIOCTOP)
 /* Socket level stuff */
 COMPATIBLE_IOCTL(FIOQSIZE)
 #ifdef CONFIG_BLOCK
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
 /* md calls this on random blockdevs */
 IGNORE_IOCTL(RAID_VERSION)
 /* qemu/qemu-img might call these two on plain files for probing */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7aabc6ad4e9b..64e5323cbbb0 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1532,84 +1532,66 @@ static inline unsigned char dt_type(struct configfs_dirent *sd)
 	return (sd->s_mode >> 12) & 15;
 }
 
-static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int configfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct super_block *sb = dentry->d_sb;
 	struct configfs_dirent * parent_sd = dentry->d_fsdata;
-	struct configfs_dirent *cursor = filp->private_data;
+	struct configfs_dirent *cursor = file->private_data;
 	struct list_head *p, *q = &cursor->s_sibling;
 	ino_t ino = 0;
-	int i = filp->f_pos;
 
-	switch (i) {
-		case 0:
-			ino = dentry->d_inode->i_ino;
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		default:
-			if (filp->f_pos == 2) {
-				spin_lock(&configfs_dirent_lock);
-				list_move(q, &parent_sd->s_children);
-				spin_unlock(&configfs_dirent_lock);
-			}
-			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
-				struct configfs_dirent *next;
-				const char * name;
-				int len;
-				struct inode *inode = NULL;
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	if (ctx->pos == 2) {
+		spin_lock(&configfs_dirent_lock);
+		list_move(q, &parent_sd->s_children);
+		spin_unlock(&configfs_dirent_lock);
+	}
+	for (p = q->next; p != &parent_sd->s_children; p = p->next) {
+		struct configfs_dirent *next;
+		const char *name;
+		int len;
+		struct inode *inode = NULL;
+
+		next = list_entry(p, struct configfs_dirent, s_sibling);
+		if (!next->s_element)
+			continue;
 
-				next = list_entry(p, struct configfs_dirent,
-						   s_sibling);
-				if (!next->s_element)
-					continue;
-
-				name = configfs_get_name(next);
-				len = strlen(name);
-
-				/*
-				 * We'll have a dentry and an inode for
-				 * PINNED items and for open attribute
-				 * files.  We lock here to prevent a race
-				 * with configfs_d_iput() clearing
-				 * s_dentry before calling iput().
-				 *
-				 * Why do we go to the trouble?  If
-				 * someone has an attribute file open,
-				 * the inode number should match until
-				 * they close it.  Beyond that, we don't
-				 * care.
-				 */
-				spin_lock(&configfs_dirent_lock);
-				dentry = next->s_dentry;
-				if (dentry)
-					inode = dentry->d_inode;
-				if (inode)
-					ino = inode->i_ino;
-				spin_unlock(&configfs_dirent_lock);
-				if (!inode)
-					ino = iunique(sb, 2);
+		name = configfs_get_name(next);
+		len = strlen(name);
+
+		/*
+		 * We'll have a dentry and an inode for
+		 * PINNED items and for open attribute
+		 * files.  We lock here to prevent a race
+		 * with configfs_d_iput() clearing
+		 * s_dentry before calling iput().
+		 *
+		 * Why do we go to the trouble?  If
+		 * someone has an attribute file open,
+		 * the inode number should match until
+		 * they close it.  Beyond that, we don't
+		 * care.
+		 */
+		spin_lock(&configfs_dirent_lock);
+		dentry = next->s_dentry;
+		if (dentry)
+			inode = dentry->d_inode;
+		if (inode)
+			ino = inode->i_ino;
+		spin_unlock(&configfs_dirent_lock);
+		if (!inode)
+			ino = iunique(sb, 2);
 
-				if (filldir(dirent, name, len, filp->f_pos, ino,
-						 dt_type(next)) < 0)
-					return 0;
+		if (!dir_emit(ctx, name, len, ino, dt_type(next)))
+			return 0;
 
-				spin_lock(&configfs_dirent_lock);
-				list_move(q, p);
-				spin_unlock(&configfs_dirent_lock);
-				p = q;
-				filp->f_pos++;
-			}
-	}
+		spin_lock(&configfs_dirent_lock);
+		list_move(q, p);
+		spin_unlock(&configfs_dirent_lock);
+		p = q;
+		ctx->pos++;
+	}
 	return 0;
 }
@@ -1661,7 +1643,7 @@ const struct file_operations configfs_dir_operations = {
 	.release	= configfs_dir_close,
 	.llseek		= configfs_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= configfs_readdir,
+	.iterate	= configfs_readdir,
 };
 
 int configfs_register_subsystem(struct configfs_subsystem *subsys)
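configfs records its readdir position as a cursor element spliced into the children list: after each dir_emit() the cursor is list_move()d just past the entry that was returned, so a later call resumes from q->next. A runnable userspace model of that cursor technique, using a tiny hand-rolled circular list (not the kernel's list.h):

#include <stdio.h>

struct node { struct node *prev, *next; const char *name; };

static void list_add_tail_demo(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static void list_move_demo(struct node *n, struct node *where)
{
	n->prev->next = n->next;	/* unlink */
	n->next->prev = n->prev;
	n->next = where->next;		/* relink right after 'where' */
	n->prev = where;
	where->next->prev = n;
	where->next = n;
}

int main(void)
{
	struct node head = { &head, &head, NULL };
	struct node a = { NULL, NULL, "a" }, b = { NULL, NULL, "b" };
	struct node cursor = { &cursor, &cursor, NULL };	/* self-linked */
	struct node *p;

	list_add_tail_demo(&a, &head);
	list_add_tail_demo(&b, &head);
	list_move_demo(&cursor, &head);		/* pos 2: front of the list */

	for (p = cursor.next; p != &head; p = p->next) {
		if (!p->name)
			continue;		/* skip the cursor itself */
		printf("emit %s\n", p->name);	/* dir_emit() stand-in */
		list_move_demo(&cursor, p);	/* remember where we are */
		p = &cursor;
	}
	return 0;
}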
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 35b1c7bd18b7..e501ac3a49ff 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -349,18 +349,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 /*
  * Read a cramfs directory entry.
  */
-static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int cramfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	char *buf;
 	unsigned int offset;
-	int copied;
 
 	/* Offset within the thing. */
-	offset = filp->f_pos;
-	if (offset >= inode->i_size)
+	if (ctx->pos >= inode->i_size)
 		return 0;
+	offset = ctx->pos;
 	/* Directory entries are always 4-byte aligned */
 	if (offset & 3)
 		return -EINVAL;
@@ -369,14 +368,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (!buf)
 		return -ENOMEM;
 
-	copied = 0;
 	while (offset < inode->i_size) {
 		struct cramfs_inode *de;
 		unsigned long nextoffset;
 		char *name;
 		ino_t ino;
 		umode_t mode;
-		int namelen, error;
+		int namelen;
 
 		mutex_lock(&read_mutex);
 		de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
@@ -402,13 +400,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				break;
 			namelen--;
 		}
-		error = filldir(dirent, buf, namelen, offset, ino, mode >> 12);
-		if (error)
+		if (!dir_emit(ctx, buf, namelen, ino, mode >> 12))
 			break;
 
-		offset = nextoffset;
-		filp->f_pos = offset;
-		copied++;
+		ctx->pos = offset = nextoffset;
 	}
 	kfree(buf);
 	return 0;
@@ -547,7 +542,7 @@ static const struct address_space_operations cramfs_aops = {
 static const struct file_operations cramfs_directory_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= cramfs_readdir,
+	.iterate	= cramfs_readdir,
 };
 
 static const struct inode_operations cramfs_dir_inode_operations = {
diff --git a/fs/dcache.c b/fs/dcache.c
index f09b9085f7d8..87bdb5329c3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1612,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias);
 * If a dentry was found and moved, then it is returned.  Otherwise NULL
 * is returned.  This matches the expected return value of ->lookup.
 *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
 */
 struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 {
@@ -1636,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
 		}
-	} else
-		d_add(dentry, inode);
+	} else {
+		d_instantiate(dentry, inode);
+		if (d_unhashed(dentry))
+			d_rehash(dentry);
+	}
 	return new;
 }
 EXPORT_SYMBOL(d_splice_alias);
@@ -1723,7 +1730,7 @@ EXPORT_SYMBOL(d_add_ci);
 * Do the slow-case of the dentry name compare.
 *
 * Unlike the dentry_cmp() function, we need to atomically
- * load the name, length and inode information, so that the
+ * load the name and length information, so that the
 * filesystem can rely on them, and can use the 'name' and
 * 'len' information without worrying about walking off the
 * end of memory etc.
@@ -1741,22 +1748,18 @@ enum slow_d_compare {
 
 static noinline enum slow_d_compare slow_dentry_cmp(
 		const struct dentry *parent,
-		struct inode *inode,
 		struct dentry *dentry,
 		unsigned int seq,
 		const struct qstr *name)
 {
 	int tlen = dentry->d_name.len;
 	const char *tname = dentry->d_name.name;
-	struct inode *i = dentry->d_inode;
 
 	if (read_seqcount_retry(&dentry->d_seq, seq)) {
 		cpu_relax();
 		return D_COMP_SEQRETRY;
 	}
-	if (parent->d_op->d_compare(parent, inode,
-				dentry, i,
-				tlen, tname, name))
+	if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
 		return D_COMP_NOMATCH;
 	return D_COMP_OK;
 }
@@ -1766,7 +1769,6 @@ static noinline enum slow_d_compare slow_dentry_cmp(
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * @seqp: returns d_seq value at the point where the dentry was found
- * @inode: returns dentry->d_inode when the inode was found valid.
 * Returns: dentry, or NULL
 *
 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
@@ -1793,7 +1795,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
 */
 struct dentry *__d_lookup_rcu(const struct dentry *parent,
 				const struct qstr *name,
-				unsigned *seqp, struct inode *inode)
+				unsigned *seqp)
 {
 	u64 hashlen = name->hash_len;
 	const unsigned char *str = name->name;
@@ -1827,11 +1829,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
 seqretry:
 	/*
 	 * The dentry sequence count protects us from concurrent
-	 * renames, and thus protects inode, parent and name fields.
+	 * renames, and thus protects parent and name fields.
 	 *
 	 * The caller must perform a seqcount check in order
-	 * to do anything useful with the returned dentry,
-	 * including using the 'd_inode' pointer.
+	 * to do anything useful with the returned dentry.
 	 *
 	 * NOTE! We do a "raw" seqcount_begin here. That means that
 	 * we don't wait for the sequence count to stabilize if it
@@ -1845,12 +1846,12 @@ seqretry:
 			continue;
 		if (d_unhashed(dentry))
 			continue;
-		*seqp = seq;
 
 		if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
 			if (dentry->d_name.hash != hashlen_hash(hashlen))
 				continue;
-			switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) {
+			*seqp = seq;
+			switch (slow_dentry_cmp(parent, dentry, seq, name)) {
 			case D_COMP_OK:
 				return dentry;
 			case D_COMP_NOMATCH:
@@ -1862,6 +1863,7 @@ seqretry:
 
 		if (dentry->d_name.hash_len != hashlen)
 			continue;
+		*seqp = seq;
 		if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
 			return dentry;
 	}
@@ -1959,9 +1961,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 		if (parent->d_flags & DCACHE_OP_COMPARE) {
 			int tlen = dentry->d_name.len;
 			const char *tname = dentry->d_name.name;
-			if (parent->d_op->d_compare(parent, parent->d_inode,
-						dentry, dentry->d_inode,
-						tlen, tname, name))
+			if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
 				goto next;
 		} else {
 			if (dentry->d_name.len != len)
@@ -1998,7 +1998,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
 	 */
 	name->hash = full_name_hash(name->name, name->len);
 	if (dir->d_flags & DCACHE_OP_HASH) {
-		int err = dir->d_op->d_hash(dir, dir->d_inode, name);
+		int err = dir->d_op->d_hash(dir, name);
 		if (unlikely(err < 0))
 			return ERR_PTR(err);
 	}
@@ -2968,34 +2968,21 @@ rename_retry:
 	goto again;
 }
 
-/**
- * find_inode_number - check for dentry with name
- * @dir: directory to check
- * @name:  Name to find.
- *
- * Check whether a dentry already exists for the given name,
- * and return the inode number if it has an inode.  Otherwise
- * 0 is returned.
- *
- * This routine is used to post-process directory listings for
- * filesystems using synthetic inode numbers, and is necessary
- * to keep getcwd() working.
- */
-
-ino_t find_inode_number(struct dentry *dir, struct qstr *name)
+void d_tmpfile(struct dentry *dentry, struct inode *inode)
 {
-	struct dentry * dentry;
-	ino_t ino = 0;
-
-	dentry = d_hash_and_lookup(dir, name);
-	if (!IS_ERR_OR_NULL(dentry)) {
-		if (dentry->d_inode)
-			ino = dentry->d_inode->i_ino;
-		dput(dentry);
-	}
-	return ino;
+	inode_dec_link_count(inode);
+	BUG_ON(dentry->d_name.name != dentry->d_iname ||
+		!hlist_unhashed(&dentry->d_alias) ||
+		!d_unlinked(dentry));
+	spin_lock(&dentry->d_parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
+				(unsigned long long)inode->i_ino);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&dentry->d_parent->d_lock);
+	d_instantiate(dentry, inode);
 }
-EXPORT_SYMBOL(find_inode_number);
+EXPORT_SYMBOL(d_tmpfile);
 
 static __initdata unsigned long dhash_entries;
 static int __init set_dhash_entries(char *str)
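d_tmpfile() gives the unlinked dentry a synthetic "#<inode>" name by formatting into the dentry's short-name buffer. A userspace model of just that name generation (the kernel buffer is d_iname of DNAME_INLINE_LEN bytes; 40 here is an arbitrary stand-in):

#include <stdio.h>

int main(void)
{
	char d_iname[40];
	unsigned long long ino = 131210;
	int len = sprintf(d_iname, "#%llu", ino);

	printf("name=%s len=%d\n", d_iname, len);	/* name=#131210 len=7 */
	return 0;
}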
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index c5ca6ae5a30c..63146295153b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -21,6 +21,7 @@
 #include <linux/debugfs.h>
 #include <linux/io.h>
 #include <linux/slab.h>
+#include <linux/atomic.h>
 
 static ssize_t default_read_file(struct file *file, char __user *buf,
 				 size_t count, loff_t *ppos)
@@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_size_t);
 
+static int debugfs_atomic_t_set(void *data, u64 val)
+{
+	atomic_set((atomic_t *)data, val);
+	return 0;
+}
+static int debugfs_atomic_t_get(void *data, u64 *val)
+{
+	*val = atomic_read((atomic_t *)data);
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+			debugfs_atomic_t_set, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
+
+/**
+ * debugfs_create_atomic_t - create a debugfs file that is used to read and
+ * write an atomic_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
+				 struct dentry *parent, atomic_t *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value,
+					&fops_atomic_t_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value,
+					&fops_atomic_t_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
 
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos)
@@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
 	if (copy_from_user(buf, user_buf, buf_size))
 		return -EFAULT;
 
+	buf[buf_size] = '\0';
 	if (strtobool(buf, &bv) == 0)
 		*val = bv;
 
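A plausible use of the new helper from a module's init path; the "demo" directory and counter are hypothetical, not part of the patch:

#include <linux/atomic.h>
#include <linux/debugfs.h>
#include <linux/module.h>

static atomic_t demo_hits = ATOMIC_INIT(0);
static struct dentry *demo_dir;

static int __init demo_init(void)
{
	demo_dir = debugfs_create_dir("demo", NULL);
	/* mode 0644 has both read and write bits, so we get the rw fops */
	debugfs_create_atomic_t("hits", 0644, demo_dir, &demo_hits);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");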
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 7d58d5b112b5..76feb4b60fa6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
 static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
 					  const char *buf, size_t len)
 {
-	strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
-	strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
+	strlcpy(dlm_config.ci_cluster_name, buf,
+				sizeof(dlm_config.ci_cluster_name));
+	strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
 	return len;
 }
 
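Why the switch to strlcpy matters: strncpy does not NUL-terminate when the source fills the buffer, while strlcpy always terminates (and truncates). glibc ships no strlcpy, so this runnable demo includes a minimal local version:

#include <stdio.h>
#include <string.h>

static size_t demo_strlcpy(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (size) {
		size_t n = len < size - 1 ? len : size - 1;
		memcpy(dst, src, n);
		dst[n] = '\0';		/* always terminated */
	}
	return len;
}

int main(void)
{
	char a[4], b[4];

	strncpy(a, "abcdef", sizeof(a));	/* a is NOT NUL-terminated */
	demo_strlcpy(b, "abcdef", sizeof(b));	/* b == "abc", terminated */
	printf("%.4s %s\n", a, b);
	return 0;
}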
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1b1146670c4b..e223a911a834 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
2038 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; 2038 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
2039 if (b == 1) { 2039 if (b == 1) {
2040 int len = receive_extralen(ms); 2040 int len = receive_extralen(ms);
2041 if (len > DLM_RESNAME_MAXLEN) 2041 if (len > r->res_ls->ls_lvblen)
2042 len = DLM_RESNAME_MAXLEN; 2042 len = r->res_ls->ls_lvblen;
2043 memcpy(lkb->lkb_lvbptr, ms->m_extra, len); 2043 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2044 lkb->lkb_lvbseq = ms->m_lvbseq; 2044 lkb->lkb_lvbseq = ms->m_lvbseq;
2045 } 2045 }
@@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
3893 if (!lkb->lkb_lvbptr) 3893 if (!lkb->lkb_lvbptr)
3894 return -ENOMEM; 3894 return -ENOMEM;
3895 len = receive_extralen(ms); 3895 len = receive_extralen(ms);
3896 if (len > DLM_RESNAME_MAXLEN) 3896 if (len > ls->ls_lvblen)
3897 len = DLM_RESNAME_MAXLEN; 3897 len = ls->ls_lvblen;
3898 memcpy(lkb->lkb_lvbptr, ms->m_extra, len); 3898 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
3899 } 3899 }
3900 return 0; 3900 return 0;
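Both hunks fix the same bounds bug: lkb_lvbptr is allocated from the lockspace's configured LVB size, so clamping the copy to DLM_RESNAME_MAXLEN (a resource-name limit, not an LVB limit) could write past the buffer on lockspaces with a smaller lvblen. A compact equivalent of the corrected clamp, as a sketch only:

	len = min_t(int, receive_extralen(ms), ls->ls_lvblen);
	memcpy(lkb->lkb_lvbptr, ms->m_extra, len);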
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3ca79d3253b9..88556dc0458e 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force)
883void dlm_stop_lockspaces(void) 883void dlm_stop_lockspaces(void)
884{ 884{
885 struct dlm_ls *ls; 885 struct dlm_ls *ls;
886 int count;
886 887
887 restart: 888 restart:
889 count = 0;
888 spin_lock(&lslist_lock); 890 spin_lock(&lslist_lock);
889 list_for_each_entry(ls, &lslist, ls_list) { 891 list_for_each_entry(ls, &lslist, ls_list) {
890 if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) 892 if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
893 count++;
891 continue; 894 continue;
895 }
892 spin_unlock(&lslist_lock); 896 spin_unlock(&lslist_lock);
893 log_error(ls, "no userland control daemon, stopping lockspace"); 897 log_error(ls, "no userland control daemon, stopping lockspace");
894 dlm_ls_stop(ls); 898 dlm_ls_stop(ls);
895 goto restart; 899 goto restart;
896 } 900 }
897 spin_unlock(&lslist_lock); 901 spin_unlock(&lslist_lock);
902
903 if (count)
904 log_print("dlm user daemon left %d lockspaces", count);
898} 905}
899 906
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d0ccd2fd79eb..d90909ec6aa6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -52,7 +52,6 @@
52#include <linux/mutex.h> 52#include <linux/mutex.h>
53#include <linux/sctp.h> 53#include <linux/sctp.h>
54#include <linux/slab.h> 54#include <linux/slab.h>
55#include <linux/sctp.h>
56#include <net/sctp/sctp.h> 55#include <net/sctp/sctp.h>
57#include <net/ipv6.h> 56#include <net/ipv6.h>
58 57
@@ -126,6 +125,7 @@ struct connection {
126 struct connection *othercon; 125 struct connection *othercon;
127 struct work_struct rwork; /* Receive workqueue */ 126 struct work_struct rwork; /* Receive workqueue */
128 struct work_struct swork; /* Send workqueue */ 127 struct work_struct swork; /* Send workqueue */
128 bool try_new_addr;
129}; 129};
130#define sock2con(x) ((struct connection *)(x)->sk_user_data) 130#define sock2con(x) ((struct connection *)(x)->sk_user_data)
131 131
@@ -144,6 +144,7 @@ struct dlm_node_addr {
144 struct list_head list; 144 struct list_head list;
145 int nodeid; 145 int nodeid;
146 int addr_count; 146 int addr_count;
147 int curr_addr_index;
147 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; 148 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
148}; 149};
149 150
@@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
310} 311}
311 312
312static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, 313static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
313 struct sockaddr *sa_out) 314 struct sockaddr *sa_out, bool try_new_addr)
314{ 315{
315 struct sockaddr_storage sas; 316 struct sockaddr_storage sas;
316 struct dlm_node_addr *na; 317 struct dlm_node_addr *na;
@@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
320 321
321 spin_lock(&dlm_node_addrs_spin); 322 spin_lock(&dlm_node_addrs_spin);
322 na = find_node_addr(nodeid); 323 na = find_node_addr(nodeid);
323 if (na && na->addr_count) 324 if (na && na->addr_count) {
324 memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage)); 325 if (try_new_addr) {
326 na->curr_addr_index++;
327 if (na->curr_addr_index == na->addr_count)
328 na->curr_addr_index = 0;
329 }
330
331 memcpy(&sas, na->addr[na->curr_addr_index],
332 sizeof(struct sockaddr_storage));
333 }
325 spin_unlock(&dlm_node_addrs_spin); 334 spin_unlock(&dlm_node_addrs_spin);
326 335
327 if (!na) 336 if (!na)
@@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
353{ 362{
354 struct dlm_node_addr *na; 363 struct dlm_node_addr *na;
355 int rv = -EEXIST; 364 int rv = -EEXIST;
365 int addr_i;
356 366
357 spin_lock(&dlm_node_addrs_spin); 367 spin_lock(&dlm_node_addrs_spin);
358 list_for_each_entry(na, &dlm_node_addrs, list) { 368 list_for_each_entry(na, &dlm_node_addrs, list) {
359 if (!na->addr_count) 369 if (!na->addr_count)
360 continue; 370 continue;
361 371
362 if (!addr_compare(na->addr[0], addr)) 372 for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
363 continue; 373 if (addr_compare(na->addr[addr_i], addr)) {
364 374 *nodeid = na->nodeid;
365 *nodeid = na->nodeid; 375 rv = 0;
366 rv = 0; 376 goto unlock;
367 break; 377 }
378 }
368 } 379 }
380unlock:
369 spin_unlock(&dlm_node_addrs_spin); 381 spin_unlock(&dlm_node_addrs_spin);
370 return rv; 382 return rv;
371} 383}
@@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
561 573
562static void sctp_init_failed_foreach(struct connection *con) 574static void sctp_init_failed_foreach(struct connection *con)
563{ 575{
576
577 /*
578 * Don't try to recover base con and handle race where the
579 * other node's assoc init creates an assoc and we get that
580 * notification, then we get a notification that our attempt
581 * failed. This happens when we are still trying the primary
582 * address, but the other node has already tried secondary addrs
583 * and found one that worked.
584 */
585 if (!con->nodeid || con->sctp_assoc)
586 return;
587
588 log_print("Retrying SCTP association init for node %d\n", con->nodeid);
589
590 con->try_new_addr = true;
564 con->sctp_assoc = 0; 591 con->sctp_assoc = 0;
565 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { 592 if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
566 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) 593 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
567 queue_work(send_workqueue, &con->swork); 594 queue_work(send_workqueue, &con->swork);
568 } 595 }
@@ -579,15 +606,56 @@ static void sctp_init_failed(void)
579 mutex_unlock(&connections_lock); 606 mutex_unlock(&connections_lock);
580} 607}
581 608
609static void retry_failed_sctp_send(struct connection *recv_con,
610 struct sctp_send_failed *sn_send_failed,
611 char *buf)
612{
613 int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
614 struct dlm_mhandle *mh;
615 struct connection *con;
616 char *retry_buf;
617 int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
618
619 log_print("Retry sending %d bytes to node id %d", len, nodeid);
620
621 con = nodeid2con(nodeid, 0);
622 if (!con) {
623 log_print("Could not look up con for nodeid %d\n",
624 nodeid);
625 return;
626 }
627
628 mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
629 if (!mh) {
630 log_print("Could not allocate buf for retry.");
631 return;
632 }
633 memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
634 dlm_lowcomms_commit_buffer(mh);
635
636 /*
637 * If we got a assoc changed event before the send failed event then
638 * we only need to retry the send.
639 */
640 if (con->sctp_assoc) {
641 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
642 queue_work(send_workqueue, &con->swork);
643 } else
644 sctp_init_failed_foreach(con);
645}
646
582/* Something happened to an association */ 647/* Something happened to an association */
583static void process_sctp_notification(struct connection *con, 648static void process_sctp_notification(struct connection *con,
584 struct msghdr *msg, char *buf) 649 struct msghdr *msg, char *buf)
585{ 650{
586 union sctp_notification *sn = (union sctp_notification *)buf; 651 union sctp_notification *sn = (union sctp_notification *)buf;
587 652
588 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { 653 switch (sn->sn_header.sn_type) {
654 case SCTP_SEND_FAILED:
655 retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
656 break;
657 case SCTP_ASSOC_CHANGE:
589 switch (sn->sn_assoc_change.sac_state) { 658 switch (sn->sn_assoc_change.sac_state) {
590
591 case SCTP_COMM_UP: 659 case SCTP_COMM_UP:
592 case SCTP_RESTART: 660 case SCTP_RESTART:
593 { 661 {
@@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con,
662 log_print("connecting to %d sctp association %d", 730 log_print("connecting to %d sctp association %d",
663 nodeid, (int)sn->sn_assoc_change.sac_assoc_id); 731 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
664 732
733 new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
734 new_con->try_new_addr = false;
665 /* Send any pending writes */ 735 /* Send any pending writes */
666 clear_bit(CF_CONNECT_PENDING, &new_con->flags); 736 clear_bit(CF_CONNECT_PENDING, &new_con->flags);
667 clear_bit(CF_INIT_PENDING, &con->flags); 737 clear_bit(CF_INIT_PENDING, &new_con->flags);
668 if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { 738 if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
669 queue_work(send_workqueue, &new_con->swork); 739 queue_work(send_workqueue, &new_con->swork);
670 } 740 }
@@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con,
683 } 753 }
684 break; 754 break;
685 755
686 /* We don't know which INIT failed, so clear the PENDING flags
687 * on them all. if assoc_id is zero then it will then try
688 * again */
689
690 case SCTP_CANT_STR_ASSOC: 756 case SCTP_CANT_STR_ASSOC:
691 { 757 {
758 /* Will retry init when we get the send failed notification */
692 log_print("Can't start SCTP association - retrying"); 759 log_print("Can't start SCTP association - retrying");
693 sctp_init_failed();
694 } 760 }
695 break; 761 break;
696 762
@@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con,
699 (int)sn->sn_assoc_change.sac_assoc_id, 765 (int)sn->sn_assoc_change.sac_assoc_id,
700 sn->sn_assoc_change.sac_state); 766 sn->sn_assoc_change.sac_state);
701 } 767 }
768 default:
769 ; /* fall through */
702 } 770 }
703} 771}
704 772
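SCTP only delivers SCTP_SEND_FAILED (and the other notifications handled above) if they were subscribed on the socket; lowcomms enables this on the listening socket. A sketch of the mechanism, assuming the standard sctp_event_subscribe interface, with sock being that listening socket:

	struct sctp_event_subscribe events;

	memset(&events, 0, sizeof(events));
	events.sctp_association_event = 1;	/* SCTP_ASSOC_CHANGE */
	events.sctp_send_failure_event = 1;	/* SCTP_SEND_FAILED */
	kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
			  (char *)&events, sizeof(events));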
@@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e)
958 kfree(e); 1026 kfree(e);
959} 1027}
960 1028
1029/*
1030 * writequeue_entry_complete - try to delete and free write queue entry
1031 * @e: write queue entry to try to delete
1032 * @completed: bytes completed
1033 *
1034 * writequeue_lock must be held.
1035 */
1036static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
1037{
1038 e->offset += completed;
1039 e->len -= completed;
1040
1041 if (e->len == 0 && e->users == 0) {
1042 list_del(&e->list);
1043 free_entry(e);
1044 }
1045}
1046
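The helper factors out the advance-and-reap logic previously duplicated in sctp_init_assoc() and send_to_sock(); as its comment says, callers must hold writequeue_lock. The caller pattern, sketched from the hunks below (con, e, msg, iov and len as in send_to_sock()):

	ret = kernel_sendmsg(con->sock, &msg, iov, 1, len);
	if (ret >= 0) {
		spin_lock(&con->writequeue_lock);
		writequeue_entry_complete(e, ret);
		spin_unlock(&con->writequeue_lock);
	}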
961/* Initiate an SCTP association. 1047/* Initiate an SCTP association.
962 This is a special case of send_to_sock() in that we don't yet have a 1048 This is a special case of send_to_sock() in that we don't yet have a
963 peeled-off socket for this association, so we use the listening socket 1049 peeled-off socket for this association, so we use the listening socket
@@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con)
977 int addrlen; 1063 int addrlen;
978 struct kvec iov[1]; 1064 struct kvec iov[1];
979 1065
1066 mutex_lock(&con->sock_mutex);
980 if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) 1067 if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
981 return; 1068 goto unlock;
982
983 if (con->retries++ > MAX_CONNECT_RETRIES)
984 return;
985 1069
986 if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) { 1070 if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
1071 con->try_new_addr)) {
987 log_print("no address for nodeid %d", con->nodeid); 1072 log_print("no address for nodeid %d", con->nodeid);
988 return; 1073 goto unlock;
989 } 1074 }
990 base_con = nodeid2con(0, 0); 1075 base_con = nodeid2con(0, 0);
991 BUG_ON(base_con == NULL); 1076 BUG_ON(base_con == NULL);
@@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con)
1003 if (list_empty(&con->writequeue)) { 1088 if (list_empty(&con->writequeue)) {
1004 spin_unlock(&con->writequeue_lock); 1089 spin_unlock(&con->writequeue_lock);
1005 log_print("writequeue empty for nodeid %d", con->nodeid); 1090 log_print("writequeue empty for nodeid %d", con->nodeid);
1006 return; 1091 goto unlock;
1007 } 1092 }
1008 1093
1009 e = list_first_entry(&con->writequeue, struct writequeue_entry, list); 1094 e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
1010 len = e->len; 1095 len = e->len;
1011 offset = e->offset; 1096 offset = e->offset;
1012 spin_unlock(&con->writequeue_lock);
1013 1097
1014 /* Send the first block off the write queue */ 1098 /* Send the first block off the write queue */
1015 iov[0].iov_base = page_address(e->page)+offset; 1099 iov[0].iov_base = page_address(e->page)+offset;
1016 iov[0].iov_len = len; 1100 iov[0].iov_len = len;
1101 spin_unlock(&con->writequeue_lock);
1102
1103 if (rem_addr.ss_family == AF_INET) {
1104 struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
1105 log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
1106 } else {
1107 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
1108 log_print("Trying to connect to %pI6", &sin6->sin6_addr);
1109 }
1017 1110
1018 cmsg = CMSG_FIRSTHDR(&outmessage); 1111 cmsg = CMSG_FIRSTHDR(&outmessage);
1019 cmsg->cmsg_level = IPPROTO_SCTP; 1112 cmsg->cmsg_level = IPPROTO_SCTP;
@@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con)
1021 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); 1114 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
1022 sinfo = CMSG_DATA(cmsg); 1115 sinfo = CMSG_DATA(cmsg);
1023 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); 1116 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
1024 sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); 1117 sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
1025 outmessage.msg_controllen = cmsg->cmsg_len; 1118 outmessage.msg_controllen = cmsg->cmsg_len;
1119 sinfo->sinfo_flags |= SCTP_ADDR_OVER;
1026 1120
1027 ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); 1121 ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
1028 if (ret < 0) { 1122 if (ret < 0) {
@@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con)
1035 } 1129 }
1036 else { 1130 else {
1037 spin_lock(&con->writequeue_lock); 1131 spin_lock(&con->writequeue_lock);
1038 e->offset += ret; 1132 writequeue_entry_complete(e, ret);
1039 e->len -= ret;
1040
1041 if (e->len == 0 && e->users == 0) {
1042 list_del(&e->list);
1043 free_entry(e);
1044 }
1045 spin_unlock(&con->writequeue_lock); 1133 spin_unlock(&con->writequeue_lock);
1046 } 1134 }
1135
1136unlock:
1137 mutex_unlock(&con->sock_mutex);
1047} 1138}
1048 1139
1049/* Connect a new socket to its peer */ 1140/* Connect a new socket to its peer */
@@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con)
1075 goto out_err; 1166 goto out_err;
1076 1167
1077 memset(&saddr, 0, sizeof(saddr)); 1168 memset(&saddr, 0, sizeof(saddr));
1078 result = nodeid_to_addr(con->nodeid, &saddr, NULL); 1169 result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
1079 if (result < 0) { 1170 if (result < 0) {
1080 log_print("no address for nodeid %d", con->nodeid); 1171 log_print("no address for nodeid %d", con->nodeid);
1081 goto out_err; 1172 goto out_err;
@@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void)
1254 int result = -EINVAL, num = 1, i, addr_len; 1345 int result = -EINVAL, num = 1, i, addr_len;
1255 struct connection *con = nodeid2con(0, GFP_NOFS); 1346 struct connection *con = nodeid2con(0, GFP_NOFS);
1256 int bufsize = NEEDED_RMEM; 1347 int bufsize = NEEDED_RMEM;
1348 int one = 1;
1257 1349
1258 if (!con) 1350 if (!con)
1259 return -ENOMEM; 1351 return -ENOMEM;
@@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void)
1288 goto create_delsock; 1380 goto create_delsock;
1289 } 1381 }
1290 1382
1383 result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
1384 sizeof(one));
1385 if (result < 0)
1386 log_print("Could not set SCTP NODELAY error %d\n", result);
1387
1291 /* Init con struct */ 1388 /* Init con struct */
1292 sock->sk->sk_user_data = con; 1389 sock->sk->sk_user_data = con;
1293 con->sock = sock; 1390 con->sock = sock;
@@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con)
1493 } 1590 }
1494 1591
1495 spin_lock(&con->writequeue_lock); 1592 spin_lock(&con->writequeue_lock);
1496 e->offset += ret; 1593 writequeue_entry_complete(e, ret);
1497 e->len -= ret;
1498
1499 if (e->len == 0 && e->users == 0) {
1500 list_del(&e->list);
1501 free_entry(e);
1502 }
1503 } 1594 }
1504 spin_unlock(&con->writequeue_lock); 1595 spin_unlock(&con->writequeue_lock);
1505out: 1596out:
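Taken together, the lowcomms changes give SCTP multi-homing real failover: after a failed init, try_new_addr makes nodeid_to_addr() rotate through the node's stored addresses. The rotation in isolation, as a sketch (the caller holds dlm_node_addrs_spin, as above):

static struct sockaddr_storage *pick_addr(struct dlm_node_addr *na,
					  bool try_new_addr)
{
	/* advance only on retry, wrapping at the end of the list */
	if (try_new_addr && ++na->curr_addr_index == na->addr_count)
		na->curr_addr_index = 0;
	return na->addr[na->curr_addr_index];
}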
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f71ec125290d..cfa109a4d5a2 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -2243,12 +2243,11 @@ out:
2243 */ 2243 */
2244int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, 2244int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2245 size_t *plaintext_name_size, 2245 size_t *plaintext_name_size,
2246 struct dentry *ecryptfs_dir_dentry, 2246 struct super_block *sb,
2247 const char *name, size_t name_size) 2247 const char *name, size_t name_size)
2248{ 2248{
2249 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 2249 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2250 &ecryptfs_superblock_to_private( 2250 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
2251 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2252 char *decoded_name; 2251 char *decoded_name;
2253 size_t decoded_name_size; 2252 size_t decoded_name_size;
2254 size_t packet_size; 2253 size_t packet_size;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index f622a733f7ad..df19d34a033b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -575,7 +575,7 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
575 struct inode *ecryptfs_inode); 575 struct inode *ecryptfs_inode);
576int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 576int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
577 size_t *decrypted_name_size, 577 size_t *decrypted_name_size,
578 struct dentry *ecryptfs_dentry, 578 struct super_block *sb,
579 const char *name, size_t name_size); 579 const char *name, size_t name_size);
580int ecryptfs_fill_zeros(struct file *file, loff_t new_length); 580int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
581int ecryptfs_encrypt_and_encode_filename( 581int ecryptfs_encrypt_and_encode_filename(
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 201f0a0d6b0a..24f1105fda3a 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -68,9 +68,9 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
68} 68}
69 69
70struct ecryptfs_getdents_callback { 70struct ecryptfs_getdents_callback {
71 void *dirent; 71 struct dir_context ctx;
72 struct dentry *dentry; 72 struct dir_context *caller;
73 filldir_t filldir; 73 struct super_block *sb;
74 int filldir_called; 74 int filldir_called;
75 int entries_written; 75 int entries_written;
76}; 76};
@@ -88,7 +88,7 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
88 88
89 buf->filldir_called++; 89 buf->filldir_called++;
90 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, 90 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
91 buf->dentry, lower_name, 91 buf->sb, lower_name,
92 lower_namelen); 92 lower_namelen);
93 if (rc) { 93 if (rc) {
94 printk(KERN_ERR "%s: Error attempting to decode and decrypt " 94 printk(KERN_ERR "%s: Error attempting to decode and decrypt "
@@ -96,9 +96,10 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
96 rc); 96 rc);
97 goto out; 97 goto out;
98 } 98 }
99 rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); 99 buf->caller->pos = buf->ctx.pos;
100 rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
100 kfree(name); 101 kfree(name);
101 if (rc >= 0) 102 if (!rc)
102 buf->entries_written++; 103 buf->entries_written++;
103out: 104out:
104 return rc; 105 return rc;
@@ -107,27 +108,22 @@ out:
107/** 108/**
108 * ecryptfs_readdir 109 * ecryptfs_readdir
109 * @file: The eCryptfs directory file 110 * @file: The eCryptfs directory file
110 * @dirent: Directory entry handle 111 * @ctx: The actor to feed the entries to
111 * @filldir: The filldir callback function
112 */ 112 */
113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) 113static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
114{ 114{
115 int rc; 115 int rc;
116 struct file *lower_file; 116 struct file *lower_file;
117 struct inode *inode; 117 struct inode *inode = file_inode(file);
118 struct ecryptfs_getdents_callback buf; 118 struct ecryptfs_getdents_callback buf = {
119 119 .ctx.actor = ecryptfs_filldir,
120 .caller = ctx,
121 .sb = inode->i_sb,
122 };
120 lower_file = ecryptfs_file_to_lower(file); 123 lower_file = ecryptfs_file_to_lower(file);
121 lower_file->f_pos = file->f_pos; 124 lower_file->f_pos = ctx->pos;
122 inode = file_inode(file); 125 rc = iterate_dir(lower_file, &buf.ctx);
123 memset(&buf, 0, sizeof(buf)); 126 ctx->pos = buf.ctx.pos;
124 buf.dirent = dirent;
125 buf.dentry = file->f_path.dentry;
126 buf.filldir = filldir;
127 buf.filldir_called = 0;
128 buf.entries_written = 0;
129 rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
130 file->f_pos = lower_file->f_pos;
131 if (rc < 0) 127 if (rc < 0)
132 goto out; 128 goto out;
133 if (buf.filldir_called && !buf.entries_written) 129 if (buf.filldir_called && !buf.entries_written)
@@ -295,6 +291,12 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
295static int 291static int
296ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) 292ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
297{ 293{
294 int rc;
295
296 rc = filemap_write_and_wait(file->f_mapping);
297 if (rc)
298 return rc;
299
298 return vfs_fsync(ecryptfs_file_to_lower(file), datasync); 300 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
299} 301}
300 302
@@ -338,7 +340,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
338#endif 340#endif
339 341
340const struct file_operations ecryptfs_dir_fops = { 342const struct file_operations ecryptfs_dir_fops = {
341 .readdir = ecryptfs_readdir, 343 .iterate = ecryptfs_readdir,
342 .read = generic_read_dir, 344 .read = generic_read_dir,
343 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 345 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
344#ifdef CONFIG_COMPAT 346#ifdef CONFIG_COMPAT
@@ -359,7 +361,7 @@ const struct file_operations ecryptfs_main_fops = {
359 .aio_read = ecryptfs_read_update_atime, 361 .aio_read = ecryptfs_read_update_atime,
360 .write = do_sync_write, 362 .write = do_sync_write,
361 .aio_write = generic_file_aio_write, 363 .aio_write = generic_file_aio_write,
362 .readdir = ecryptfs_readdir, 364 .iterate = ecryptfs_readdir,
363 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 365 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
364#ifdef CONFIG_COMPAT 366#ifdef CONFIG_COMPAT
365 .compat_ioctl = ecryptfs_compat_ioctl, 367 .compat_ioctl = ecryptfs_compat_ioctl,
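This conversion shows the general shape for stacked filesystems under the new iterate API: embed a struct dir_context first in the callback buffer, run the lower directory with iterate_dir(), and have the actor rewrite each name before forwarding it with dir_emit(). A generic sketch with hypothetical names, using the filldir_t signature of this era:

struct wrap_callback {
	struct dir_context ctx;		/* must come first */
	struct dir_context *caller;	/* the context we forward into */
};

static int wrap_filldir(void *__buf, const char *name, int namelen,
			loff_t offset, u64 ino, unsigned int d_type)
{
	struct wrap_callback *buf = __buf;

	buf->caller->pos = buf->ctx.pos;
	/* dir_emit() returns false once the user buffer is full */
	return !dir_emit(buf->caller, name, namelen, ino, d_type);
}

It would be wired up as .ctx.actor = wrap_filldir and driven by iterate_dir(lower_file, &buf.ctx), exactly as ecryptfs_readdir() does above.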
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5eab400e2590..a2f2bb2c256d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -679,7 +679,7 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
679 set_fs(old_fs); 679 set_fs(old_fs);
680 if (rc < 0) 680 if (rc < 0)
681 goto out; 681 goto out;
682 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, 682 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb,
683 lower_buf, rc); 683 lower_buf, rc);
684out: 684out:
685 kfree(lower_buf); 685 kfree(lower_buf);
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index bfb531564319..8dd524f32284 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -44,8 +44,11 @@ static ssize_t efivarfs_file_write(struct file *file,
44 44
45 bytes = efivar_entry_set_get_size(var, attributes, &datasize, 45 bytes = efivar_entry_set_get_size(var, attributes, &datasize,
46 data, &set); 46 data, &set);
47 if (!set && bytes) 47 if (!set && bytes) {
48 if (bytes == -ENOENT)
49 bytes = -EIO;
48 goto out; 50 goto out;
51 }
49 52
50 if (bytes == -ENOENT) { 53 if (bytes == -ENOENT) {
51 drop_nlink(inode); 54 drop_nlink(inode);
@@ -76,7 +79,14 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
76 int err; 79 int err;
77 80
78 err = efivar_entry_size(var, &datasize); 81 err = efivar_entry_size(var, &datasize);
79 if (err) 82
83 /*
84 * efivarfs represents uncommitted variables with
85 * zero-length files. Reading them should return EOF.
86 */
87 if (err == -ENOENT)
88 return 0;
89 else if (err)
80 return err; 90 return err;
81 91
82 data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); 92 data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 141aee31884f..a8766b880c07 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -45,8 +45,8 @@ static struct super_block *efivarfs_sb;
45 * So we need to perform a case-sensitive match on part 1 and a 45 * So we need to perform a case-sensitive match on part 1 and a
46 * case-insensitive match on part 2. 46 * case-insensitive match on part 2.
47 */ 47 */
48static int efivarfs_d_compare(const struct dentry *parent, const struct inode *pinode, 48static int efivarfs_d_compare(const struct dentry *parent,
49 const struct dentry *dentry, const struct inode *inode, 49 const struct dentry *dentry,
50 unsigned int len, const char *str, 50 unsigned int len, const char *str,
51 const struct qstr *name) 51 const struct qstr *name)
52{ 52{
@@ -63,8 +63,7 @@ static int efivarfs_d_compare(const struct dentry *parent, const struct inode *p
63 return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); 63 return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN);
64} 64}
65 65
66static int efivarfs_d_hash(const struct dentry *dentry, 66static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
67 const struct inode *inode, struct qstr *qstr)
68{ 67{
69 unsigned long hash = init_name_hash(); 68 unsigned long hash = init_name_hash();
70 const unsigned char *s = qstr->name; 69 const unsigned char *s = qstr->name;
@@ -108,7 +107,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
108 q.name = name; 107 q.name = name;
109 q.len = strlen(name); 108 q.len = strlen(name);
110 109
111 err = efivarfs_d_hash(NULL, NULL, &q); 110 err = efivarfs_d_hash(NULL, &q);
112 if (err) 111 if (err)
113 return ERR_PTR(err); 112 return ERR_PTR(err);
114 113
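The signature change here is tree-wide: the unused inode arguments were dropped from ->d_hash() and ->d_compare(). Under the new contract the wiring looks like this (sketch, reusing the functions above):

static const struct dentry_operations efivarfs_d_ops = {
	/* new: (const struct dentry *, struct qstr *) */
	.d_hash    = efivarfs_d_hash,
	/* new: (parent, dentry, len, str, name) - no inodes */
	.d_compare = efivarfs_d_compare,
};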
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 055a9e9ca747..b72307ccdf7a 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -7,40 +7,38 @@
7#include <linux/buffer_head.h> 7#include <linux/buffer_head.h>
8#include "efs.h" 8#include "efs.h"
9 9
10static int efs_readdir(struct file *, void *, filldir_t); 10static int efs_readdir(struct file *, struct dir_context *);
11 11
12const struct file_operations efs_dir_operations = { 12const struct file_operations efs_dir_operations = {
13 .llseek = generic_file_llseek, 13 .llseek = generic_file_llseek,
14 .read = generic_read_dir, 14 .read = generic_read_dir,
15 .readdir = efs_readdir, 15 .iterate = efs_readdir,
16}; 16};
17 17
18const struct inode_operations efs_dir_inode_operations = { 18const struct inode_operations efs_dir_inode_operations = {
19 .lookup = efs_lookup, 19 .lookup = efs_lookup,
20}; 20};
21 21
22static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { 22static int efs_readdir(struct file *file, struct dir_context *ctx)
23 struct inode *inode = file_inode(filp); 23{
24 struct buffer_head *bh; 24 struct inode *inode = file_inode(file);
25
26 struct efs_dir *dirblock;
27 struct efs_dentry *dirslot;
28 efs_ino_t inodenum;
29 efs_block_t block; 25 efs_block_t block;
30 int slot, namelen; 26 int slot;
31 char *nameptr;
32 27
33 if (inode->i_size & (EFS_DIRBSIZE-1)) 28 if (inode->i_size & (EFS_DIRBSIZE-1))
34 printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); 29 printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
35 30
36 /* work out where this entry can be found */ 31 /* work out where this entry can be found */
37 block = filp->f_pos >> EFS_DIRBSIZE_BITS; 32 block = ctx->pos >> EFS_DIRBSIZE_BITS;
38 33
39 /* each block contains at most 256 slots */ 34 /* each block contains at most 256 slots */
40 slot = filp->f_pos & 0xff; 35 slot = ctx->pos & 0xff;
41 36
42 /* look at all blocks */ 37 /* look at all blocks */
43 while (block < inode->i_blocks) { 38 while (block < inode->i_blocks) {
39 struct efs_dir *dirblock;
40 struct buffer_head *bh;
41
44 /* read the dir block */ 42 /* read the dir block */
45 bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); 43 bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
46 44
@@ -57,11 +55,14 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
57 break; 55 break;
58 } 56 }
59 57
60 while (slot < dirblock->slots) { 58 for (; slot < dirblock->slots; slot++) {
61 if (dirblock->space[slot] == 0) { 59 struct efs_dentry *dirslot;
62 slot++; 60 efs_ino_t inodenum;
61 const char *nameptr;
62 int namelen;
63
64 if (dirblock->space[slot] == 0)
63 continue; 65 continue;
64 }
65 66
66 dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); 67 dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
67 68
@@ -72,39 +73,29 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
72#ifdef DEBUG 73#ifdef DEBUG
73 printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); 74 printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen);
74#endif 75#endif
75 if (namelen > 0) { 76 if (!namelen)
76 /* found the next entry */ 77 continue;
77 filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; 78 /* found the next entry */
78 79 ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
79 /* copy filename and data in dirslot */ 80
80 filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN); 81 /* sanity check */
81 82 if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
82 /* sanity check */ 83 printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
83 if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { 84 continue;
84 printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); 85 }
85 slot++; 86
86 continue; 87 /* copy filename and data in dirslot */
87 } 88 if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) {
88
89 /* store position of next slot */
90 if (++slot == dirblock->slots) {
91 slot = 0;
92 block++;
93 }
94 brelse(bh); 89 brelse(bh);
95 filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; 90 return 0;
96 goto out;
97 } 91 }
98 slot++;
99 } 92 }
100 brelse(bh); 93 brelse(bh);
101 94
102 slot = 0; 95 slot = 0;
103 block++; 96 block++;
104 } 97 }
105 98 ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
106 filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
107out:
108 return 0; 99 return 0;
109} 100}
110 101
diff --git a/fs/exec.c b/fs/exec.c
index 643019585574..03b907cfd765 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -110,13 +110,14 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
110 static const struct open_flags uselib_flags = { 110 static const struct open_flags uselib_flags = {
111 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 111 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
112 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, 112 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
113 .intent = LOOKUP_OPEN 113 .intent = LOOKUP_OPEN,
114 .lookup_flags = LOOKUP_FOLLOW,
114 }; 115 };
115 116
116 if (IS_ERR(tmp)) 117 if (IS_ERR(tmp))
117 goto out; 118 goto out;
118 119
119 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); 120 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
120 putname(tmp); 121 putname(tmp);
121 error = PTR_ERR(file); 122 error = PTR_ERR(file);
122 if (IS_ERR(file)) 123 if (IS_ERR(file))
@@ -756,10 +757,11 @@ struct file *open_exec(const char *name)
756 static const struct open_flags open_exec_flags = { 757 static const struct open_flags open_exec_flags = {
757 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 758 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
758 .acc_mode = MAY_EXEC | MAY_OPEN, 759 .acc_mode = MAY_EXEC | MAY_OPEN,
759 .intent = LOOKUP_OPEN 760 .intent = LOOKUP_OPEN,
761 .lookup_flags = LOOKUP_FOLLOW,
760 }; 762 };
761 763
762 file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW); 764 file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags);
763 if (IS_ERR(file)) 765 if (IS_ERR(file))
764 goto out; 766 goto out;
765 767
@@ -1135,13 +1137,6 @@ void setup_new_exec(struct linux_binprm * bprm)
1135 set_dumpable(current->mm, suid_dumpable); 1137 set_dumpable(current->mm, suid_dumpable);
1136 } 1138 }
1137 1139
1138 /*
1139 * Flush performance counters when crossing a
1140 * security domain:
1141 */
1142 if (!get_dumpable(current->mm))
1143 perf_event_exit_task(current);
1144
1145 /* An exec changes our domain. We are no longer part of the thread 1140 /* An exec changes our domain. We are no longer part of the thread
1146 group */ 1141 group */
1147 1142
@@ -1205,6 +1200,15 @@ void install_exec_creds(struct linux_binprm *bprm)
1205 1200
1206 commit_creds(bprm->cred); 1201 commit_creds(bprm->cred);
1207 bprm->cred = NULL; 1202 bprm->cred = NULL;
1203
1204 /*
1205 * Disable monitoring for regular users
1206 * when executing setuid binaries. Must
1207 * wait until new credentials are committed
1208 * by commit_creds() above
1209 */
1210 if (get_dumpable(current->mm) != SUID_DUMP_USER)
1211 perf_event_exit_task(current);
1208 /* 1212 /*
1209 * cred_guard_mutex must be held at least to this point to prevent 1213 * cred_guard_mutex must be held at least to this point to prevent
1210 * ptrace_attach() from altering our determination of the task's 1214 * ptrace_attach() from altering our determination of the task's
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 46375896cfc0..49f51ab4caac 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
239} 239}
240 240
241static int 241static int
242exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) 242exofs_readdir(struct file *file, struct dir_context *ctx)
243{ 243{
244 loff_t pos = filp->f_pos; 244 loff_t pos = ctx->pos;
245 struct inode *inode = file_inode(filp); 245 struct inode *inode = file_inode(file);
246 unsigned int offset = pos & ~PAGE_CACHE_MASK; 246 unsigned int offset = pos & ~PAGE_CACHE_MASK;
247 unsigned long n = pos >> PAGE_CACHE_SHIFT; 247 unsigned long n = pos >> PAGE_CACHE_SHIFT;
248 unsigned long npages = dir_pages(inode); 248 unsigned long npages = dir_pages(inode);
249 unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); 249 unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
250 unsigned char *types = NULL; 250 int need_revalidate = (file->f_version != inode->i_version);
251 int need_revalidate = (filp->f_version != inode->i_version);
252 251
253 if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) 252 if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
254 return 0; 253 return 0;
255 254
256 types = exofs_filetype_table;
257
258 for ( ; n < npages; n++, offset = 0) { 255 for ( ; n < npages; n++, offset = 0) {
259 char *kaddr, *limit; 256 char *kaddr, *limit;
260 struct exofs_dir_entry *de; 257 struct exofs_dir_entry *de;
@@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
263 if (IS_ERR(page)) { 260 if (IS_ERR(page)) {
264 EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", 261 EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
265 inode->i_ino); 262 inode->i_ino);
266 filp->f_pos += PAGE_CACHE_SIZE - offset; 263 ctx->pos += PAGE_CACHE_SIZE - offset;
267 return PTR_ERR(page); 264 return PTR_ERR(page);
268 } 265 }
269 kaddr = page_address(page); 266 kaddr = page_address(page);
@@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
271 if (offset) { 268 if (offset) {
272 offset = exofs_validate_entry(kaddr, offset, 269 offset = exofs_validate_entry(kaddr, offset,
273 chunk_mask); 270 chunk_mask);
274 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; 271 ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
275 } 272 }
276 filp->f_version = inode->i_version; 273 file->f_version = inode->i_version;
277 need_revalidate = 0; 274 need_revalidate = 0;
278 } 275 }
279 de = (struct exofs_dir_entry *)(kaddr + offset); 276 de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
288 return -EIO; 285 return -EIO;
289 } 286 }
290 if (de->inode_no) { 287 if (de->inode_no) {
291 int over; 288 unsigned char t;
292 unsigned char d_type = DT_UNKNOWN;
293 289
294 if (types && de->file_type < EXOFS_FT_MAX) 290 if (de->file_type < EXOFS_FT_MAX)
295 d_type = types[de->file_type]; 291 t = exofs_filetype_table[de->file_type];
292 else
293 t = DT_UNKNOWN;
296 294
297 offset = (char *)de - kaddr; 295 if (!dir_emit(ctx, de->name, de->name_len,
298 over = filldir(dirent, de->name, de->name_len,
299 (n<<PAGE_CACHE_SHIFT) | offset,
300 le64_to_cpu(de->inode_no), 296 le64_to_cpu(de->inode_no),
301 d_type); 297 t)) {
302 if (over) {
303 exofs_put_page(page); 298 exofs_put_page(page);
304 return 0; 299 return 0;
305 } 300 }
306 } 301 }
307 filp->f_pos += le16_to_cpu(de->rec_len); 302 ctx->pos += le16_to_cpu(de->rec_len);
308 } 303 }
309 exofs_put_page(page); 304 exofs_put_page(page);
310 } 305 }
311
312 return 0; 306 return 0;
313} 307}
314 308
@@ -669,5 +663,5 @@ not_empty:
669const struct file_operations exofs_dir_operations = { 663const struct file_operations exofs_dir_operations = {
670 .llseek = generic_file_llseek, 664 .llseek = generic_file_llseek,
671 .read = generic_read_dir, 665 .read = generic_read_dir,
672 .readdir = exofs_readdir, 666 .iterate = exofs_readdir,
673}; 667};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd8828..2ec8eb1ab269 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
953 return 0; 953 return 0;
954} 954}
955 955
956static void exofs_invalidatepage(struct page *page, unsigned long offset) 956static void exofs_invalidatepage(struct page *page, unsigned int offset,
957 unsigned int length)
957{ 958{
958 EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); 959 EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
960 page->index, offset, length);
959 WARN_ON(1); 961 WARN_ON(1);
960} 962}
961 963
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 262fc9940982..293bc2e47a73 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -212,6 +212,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
212} 212}
213 213
214struct getdents_callback { 214struct getdents_callback {
215 struct dir_context ctx;
215 char *name; /* name that was found. It already points to a 216 char *name; /* name that was found. It already points to a
216 buffer NAME_MAX+1 is size */ 217 buffer NAME_MAX+1 is size */
217 unsigned long ino; /* the inum we are looking for */ 218 unsigned long ino; /* the inum we are looking for */
@@ -254,7 +255,11 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
254 struct inode *dir = path->dentry->d_inode; 255 struct inode *dir = path->dentry->d_inode;
255 int error; 256 int error;
256 struct file *file; 257 struct file *file;
257 struct getdents_callback buffer; 258 struct getdents_callback buffer = {
259 .ctx.actor = filldir_one,
260 .name = name,
261 .ino = child->d_inode->i_ino
262 };
258 263
259 error = -ENOTDIR; 264 error = -ENOTDIR;
260 if (!dir || !S_ISDIR(dir->i_mode)) 265 if (!dir || !S_ISDIR(dir->i_mode))
@@ -271,17 +276,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
271 goto out; 276 goto out;
272 277
273 error = -EINVAL; 278 error = -EINVAL;
274 if (!file->f_op->readdir) 279 if (!file->f_op->iterate)
275 goto out_close; 280 goto out_close;
276 281
277 buffer.name = name;
278 buffer.ino = child->d_inode->i_ino;
279 buffer.found = 0;
280 buffer.sequence = 0; 282 buffer.sequence = 0;
281 while (1) { 283 while (1) {
282 int old_seq = buffer.sequence; 284 int old_seq = buffer.sequence;
283 285
284 error = vfs_readdir(file, filldir_one, &buffer); 286 error = iterate_dir(file, &buffer.ctx);
285 if (buffer.found) { 287 if (buffer.found) {
286 error = 0; 288 error = 0;
287 break; 289 break;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 4237722bfd27..6e1d4ab09d72 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -287,17 +287,17 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
287} 287}
288 288
289static int 289static int
290ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) 290ext2_readdir(struct file *file, struct dir_context *ctx)
291{ 291{
292 loff_t pos = filp->f_pos; 292 loff_t pos = ctx->pos;
293 struct inode *inode = file_inode(filp); 293 struct inode *inode = file_inode(file);
294 struct super_block *sb = inode->i_sb; 294 struct super_block *sb = inode->i_sb;
295 unsigned int offset = pos & ~PAGE_CACHE_MASK; 295 unsigned int offset = pos & ~PAGE_CACHE_MASK;
296 unsigned long n = pos >> PAGE_CACHE_SHIFT; 296 unsigned long n = pos >> PAGE_CACHE_SHIFT;
297 unsigned long npages = dir_pages(inode); 297 unsigned long npages = dir_pages(inode);
298 unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); 298 unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
299 unsigned char *types = NULL; 299 unsigned char *types = NULL;
300 int need_revalidate = filp->f_version != inode->i_version; 300 int need_revalidate = file->f_version != inode->i_version;
301 301
302 if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) 302 if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
303 return 0; 303 return 0;
@@ -314,16 +314,16 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
314 ext2_error(sb, __func__, 314 ext2_error(sb, __func__,
315 "bad page in #%lu", 315 "bad page in #%lu",
316 inode->i_ino); 316 inode->i_ino);
317 filp->f_pos += PAGE_CACHE_SIZE - offset; 317 ctx->pos += PAGE_CACHE_SIZE - offset;
318 return PTR_ERR(page); 318 return PTR_ERR(page);
319 } 319 }
320 kaddr = page_address(page); 320 kaddr = page_address(page);
321 if (unlikely(need_revalidate)) { 321 if (unlikely(need_revalidate)) {
322 if (offset) { 322 if (offset) {
323 offset = ext2_validate_entry(kaddr, offset, chunk_mask); 323 offset = ext2_validate_entry(kaddr, offset, chunk_mask);
324 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; 324 ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
325 } 325 }
326 filp->f_version = inode->i_version; 326 file->f_version = inode->i_version;
327 need_revalidate = 0; 327 need_revalidate = 0;
328 } 328 }
329 de = (ext2_dirent *)(kaddr+offset); 329 de = (ext2_dirent *)(kaddr+offset);
@@ -336,22 +336,19 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
336 return -EIO; 336 return -EIO;
337 } 337 }
338 if (de->inode) { 338 if (de->inode) {
339 int over;
340 unsigned char d_type = DT_UNKNOWN; 339 unsigned char d_type = DT_UNKNOWN;
341 340
342 if (types && de->file_type < EXT2_FT_MAX) 341 if (types && de->file_type < EXT2_FT_MAX)
343 d_type = types[de->file_type]; 342 d_type = types[de->file_type];
344 343
345 offset = (char *)de - kaddr; 344 if (!dir_emit(ctx, de->name, de->name_len,
346 over = filldir(dirent, de->name, de->name_len, 345 le32_to_cpu(de->inode),
347 (n<<PAGE_CACHE_SHIFT) | offset, 346 d_type)) {
348 le32_to_cpu(de->inode), d_type);
349 if (over) {
350 ext2_put_page(page); 347 ext2_put_page(page);
351 return 0; 348 return 0;
352 } 349 }
353 } 350 }
354 filp->f_pos += ext2_rec_len_from_disk(de->rec_len); 351 ctx->pos += ext2_rec_len_from_disk(de->rec_len);
355 } 352 }
356 ext2_put_page(page); 353 ext2_put_page(page);
357 } 354 }
@@ -724,7 +721,7 @@ not_empty:
724const struct file_operations ext2_dir_operations = { 721const struct file_operations ext2_dir_operations = {
725 .llseek = generic_file_llseek, 722 .llseek = generic_file_llseek,
726 .read = generic_read_dir, 723 .read = generic_read_dir,
727 .readdir = ext2_readdir, 724 .iterate = ext2_readdir,
728 .unlocked_ioctl = ext2_ioctl, 725 .unlocked_ioctl = ext2_ioctl,
729#ifdef CONFIG_COMPAT 726#ifdef CONFIG_COMPAT
730 .compat_ioctl = ext2_compat_ioctl, 727 .compat_ioctl = ext2_compat_ioctl,
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 73b0d9519836..256dd5f4c1c4 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -119,6 +119,29 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
119 return ext2_add_nondir(dentry, inode); 119 return ext2_add_nondir(dentry, inode);
120} 120}
121 121
122static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
123{
124 struct inode *inode = ext2_new_inode(dir, mode, NULL);
125 if (IS_ERR(inode))
126 return PTR_ERR(inode);
127
128 inode->i_op = &ext2_file_inode_operations;
129 if (ext2_use_xip(inode->i_sb)) {
130 inode->i_mapping->a_ops = &ext2_aops_xip;
131 inode->i_fop = &ext2_xip_file_operations;
132 } else if (test_opt(inode->i_sb, NOBH)) {
133 inode->i_mapping->a_ops = &ext2_nobh_aops;
134 inode->i_fop = &ext2_file_operations;
135 } else {
136 inode->i_mapping->a_ops = &ext2_aops;
137 inode->i_fop = &ext2_file_operations;
138 }
139 mark_inode_dirty(inode);
140 d_tmpfile(dentry, inode);
141 unlock_new_inode(inode);
142 return 0;
143}
144
122static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) 145static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
123{ 146{
124 struct inode * inode; 147 struct inode * inode;
@@ -398,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
398#endif 421#endif
399 .setattr = ext2_setattr, 422 .setattr = ext2_setattr,
400 .get_acl = ext2_get_acl, 423 .get_acl = ext2_get_acl,
424 .tmpfile = ext2_tmpfile,
401}; 425};
402 426
403const struct inode_operations ext2_special_inode_operations = { 427const struct inode_operations ext2_special_inode_operations = {
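From userspace, the new ->tmpfile hook backs O_TMPFILE: open an unnamed inode in a directory, write to it, then optionally link it into the namespace. A sketch, assuming a kernel from this cycle and a hypothetical ext2 mount at /mnt/ext2:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	int fd = open("/mnt/ext2", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("O_TMPFILE");
		return 1;
	}
	/* give the anonymous inode a name via its /proc handle */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	linkat(AT_FDCWD, path, AT_FDCWD, "/mnt/ext2/now-visible",
	       AT_SYMLINK_FOLLOW);
	close(fd);
	return 0;
}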
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 87eccbbca255..f522425aaa24 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -28,8 +28,7 @@ static unsigned char ext3_filetype_table[] = {
28 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 28 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
29}; 29};
30 30
31static int ext3_dx_readdir(struct file * filp, 31static int ext3_dx_readdir(struct file *, struct dir_context *);
32 void * dirent, filldir_t filldir);
33 32
34static unsigned char get_dtype(struct super_block *sb, int filetype) 33static unsigned char get_dtype(struct super_block *sb, int filetype)
35{ 34{
@@ -91,36 +90,30 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
91 return error_msg == NULL ? 1 : 0; 90 return error_msg == NULL ? 1 : 0;
92} 91}
93 92
94static int ext3_readdir(struct file * filp, 93static int ext3_readdir(struct file *file, struct dir_context *ctx)
95 void * dirent, filldir_t filldir)
96{ 94{
97 int error = 0;
98 unsigned long offset; 95 unsigned long offset;
99 int i, stored; 96 int i;
100 struct ext3_dir_entry_2 *de; 97 struct ext3_dir_entry_2 *de;
101 int err; 98 int err;
102 struct inode *inode = file_inode(filp); 99 struct inode *inode = file_inode(file);
103 struct super_block *sb = inode->i_sb; 100 struct super_block *sb = inode->i_sb;
104 int ret = 0;
105 int dir_has_error = 0; 101 int dir_has_error = 0;
106 102
107 if (is_dx_dir(inode)) { 103 if (is_dx_dir(inode)) {
108 err = ext3_dx_readdir(filp, dirent, filldir); 104 err = ext3_dx_readdir(file, ctx);
109 if (err != ERR_BAD_DX_DIR) { 105 if (err != ERR_BAD_DX_DIR)
110 ret = err; 106 return err;
111 goto out;
112 }
113 /* 107 /*
114 * We don't set the inode dirty flag since it's not 108 * We don't set the inode dirty flag since it's not
115 * critical that it get flushed back to the disk. 109 * critical that it get flushed back to the disk.
116 */ 110 */
117 EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL; 111 EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
118 } 112 }
119 stored = 0; 113 offset = ctx->pos & (sb->s_blocksize - 1);
120 offset = filp->f_pos & (sb->s_blocksize - 1);
121 114
122 while (!error && !stored && filp->f_pos < inode->i_size) { 115 while (ctx->pos < inode->i_size) {
123 unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); 116 unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
124 struct buffer_head map_bh; 117 struct buffer_head map_bh;
125 struct buffer_head *bh = NULL; 118 struct buffer_head *bh = NULL;
126 119
@@ -129,12 +122,12 @@ static int ext3_readdir(struct file * filp,
129 if (err > 0) { 122 if (err > 0) {
130 pgoff_t index = map_bh.b_blocknr >> 123 pgoff_t index = map_bh.b_blocknr >>
131 (PAGE_CACHE_SHIFT - inode->i_blkbits); 124 (PAGE_CACHE_SHIFT - inode->i_blkbits);
132 if (!ra_has_index(&filp->f_ra, index)) 125 if (!ra_has_index(&file->f_ra, index))
133 page_cache_sync_readahead( 126 page_cache_sync_readahead(
134 sb->s_bdev->bd_inode->i_mapping, 127 sb->s_bdev->bd_inode->i_mapping,
135 &filp->f_ra, filp, 128 &file->f_ra, file,
136 index, 1); 129 index, 1);
137 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 130 file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
138 bh = ext3_bread(NULL, inode, blk, 0, &err); 131 bh = ext3_bread(NULL, inode, blk, 0, &err);
139 } 132 }
140 133
@@ -146,22 +139,21 @@ static int ext3_readdir(struct file * filp,
146 if (!dir_has_error) { 139 if (!dir_has_error) {
147 ext3_error(sb, __func__, "directory #%lu " 140 ext3_error(sb, __func__, "directory #%lu "
148 "contains a hole at offset %lld", 141 "contains a hole at offset %lld",
149 inode->i_ino, filp->f_pos); 142 inode->i_ino, ctx->pos);
150 dir_has_error = 1; 143 dir_has_error = 1;
151 } 144 }
152 /* corrupt size? Maybe no more blocks to read */ 145 /* corrupt size? Maybe no more blocks to read */
153 if (filp->f_pos > inode->i_blocks << 9) 146 if (ctx->pos > inode->i_blocks << 9)
154 break; 147 break;
155 filp->f_pos += sb->s_blocksize - offset; 148 ctx->pos += sb->s_blocksize - offset;
156 continue; 149 continue;
157 } 150 }
158 151
159revalidate:
160 /* If the dir block has changed since the last call to 152 /* If the dir block has changed since the last call to
161 * readdir(2), then we might be pointing to an invalid 153 * readdir(2), then we might be pointing to an invalid
162 * dirent right now. Scan from the start of the block 154 * dirent right now. Scan from the start of the block
163 * to make sure. */ 155 * to make sure. */
164 if (filp->f_version != inode->i_version) { 156 if (offset && file->f_version != inode->i_version) {
165 for (i = 0; i < sb->s_blocksize && i < offset; ) { 157 for (i = 0; i < sb->s_blocksize && i < offset; ) {
166 de = (struct ext3_dir_entry_2 *) 158 de = (struct ext3_dir_entry_2 *)
167 (bh->b_data + i); 159 (bh->b_data + i);
@@ -177,53 +169,40 @@ revalidate:
177 i += ext3_rec_len_from_disk(de->rec_len); 169 i += ext3_rec_len_from_disk(de->rec_len);
178 } 170 }
179 offset = i; 171 offset = i;
180 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 172 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
181 | offset; 173 | offset;
182 filp->f_version = inode->i_version; 174 file->f_version = inode->i_version;
183 } 175 }
184 176
185 while (!error && filp->f_pos < inode->i_size 177 while (ctx->pos < inode->i_size
186 && offset < sb->s_blocksize) { 178 && offset < sb->s_blocksize) {
187 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); 179 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
188 if (!ext3_check_dir_entry ("ext3_readdir", inode, de, 180 if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
189 bh, offset)) { 181 bh, offset)) {
190 /* On error, skip the f_pos to the 182 /* On error, skip to the
191 next block. */ 183 next block. */
192 filp->f_pos = (filp->f_pos | 184 ctx->pos = (ctx->pos |
193 (sb->s_blocksize - 1)) + 1; 185 (sb->s_blocksize - 1)) + 1;
194 brelse (bh); 186 break;
195 ret = stored;
196 goto out;
197 } 187 }
198 offset += ext3_rec_len_from_disk(de->rec_len); 188 offset += ext3_rec_len_from_disk(de->rec_len);
199 if (le32_to_cpu(de->inode)) { 189 if (le32_to_cpu(de->inode)) {
200 /* We might block in the next section 190 if (!dir_emit(ctx, de->name, de->name_len,
201 * if the data destination is 191 le32_to_cpu(de->inode),
202 * currently swapped out. So, use a 192 get_dtype(sb, de->file_type))) {
203 * version stamp to detect whether or 193 brelse(bh);
204 * not the directory has been modified 194 return 0;
205 * during the copy operation. 195 }
206 */
207 u64 version = filp->f_version;
208
209 error = filldir(dirent, de->name,
210 de->name_len,
211 filp->f_pos,
212 le32_to_cpu(de->inode),
213 get_dtype(sb, de->file_type));
214 if (error)
215 break;
216 if (version != filp->f_version)
217 goto revalidate;
218 stored ++;
219 } 196 }
220 filp->f_pos += ext3_rec_len_from_disk(de->rec_len); 197 ctx->pos += ext3_rec_len_from_disk(de->rec_len);
221 } 198 }
222 offset = 0; 199 offset = 0;
223 brelse (bh); 200 brelse (bh);
201 if (ctx->pos < inode->i_size)
202 if (!dir_relax(inode))
203 return 0;
224 } 204 }
225out: 205 return 0;
226 return ret;
227} 206}
228 207
229static inline int is_32bit_api(void) 208static inline int is_32bit_api(void)
@@ -452,62 +431,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
452 * for all entries on the fname linked list. (Normally there is only 431 * for all entries on the fname linked list. (Normally there is only
453 * one entry on the linked list, unless there are 62 bit hash collisions.) 432 * one entry on the linked list, unless there are 62 bit hash collisions.)
454 */ 433 */
455static int call_filldir(struct file * filp, void * dirent, 434static bool call_filldir(struct file *file, struct dir_context *ctx,
456 filldir_t filldir, struct fname *fname) 435 struct fname *fname)
457{ 436{
458 struct dir_private_info *info = filp->private_data; 437 struct dir_private_info *info = file->private_data;
459 loff_t curr_pos; 438 struct inode *inode = file_inode(file);
460 struct inode *inode = file_inode(filp); 439 struct super_block *sb = inode->i_sb;
461 struct super_block * sb;
462 int error;
463
464 sb = inode->i_sb;
465 440
466 if (!fname) { 441 if (!fname) {
467 printk("call_filldir: called with null fname?!?\n"); 442 printk("call_filldir: called with null fname?!?\n");
468 return 0; 443 return true;
469 } 444 }
470 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); 445 ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
471 while (fname) { 446 while (fname) {
472 error = filldir(dirent, fname->name, 447 if (!dir_emit(ctx, fname->name, fname->name_len,
473 fname->name_len, curr_pos,
474 fname->inode, 448 fname->inode,
475 get_dtype(sb, fname->file_type)); 449 get_dtype(sb, fname->file_type))) {
476 if (error) {
477 filp->f_pos = curr_pos;
478 info->extra_fname = fname; 450 info->extra_fname = fname;
479 return error; 451 return false;
480 } 452 }
481 fname = fname->next; 453 fname = fname->next;
482 } 454 }
483 return 0; 455 return true;
484} 456}
485 457
486static int ext3_dx_readdir(struct file * filp, 458static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
487 void * dirent, filldir_t filldir)
488{ 459{
489 struct dir_private_info *info = filp->private_data; 460 struct dir_private_info *info = file->private_data;
490 struct inode *inode = file_inode(filp); 461 struct inode *inode = file_inode(file);
491 struct fname *fname; 462 struct fname *fname;
492 int ret; 463 int ret;
493 464
494 if (!info) { 465 if (!info) {
495 info = ext3_htree_create_dir_info(filp, filp->f_pos); 466 info = ext3_htree_create_dir_info(file, ctx->pos);
496 if (!info) 467 if (!info)
497 return -ENOMEM; 468 return -ENOMEM;
498 filp->private_data = info; 469 file->private_data = info;
499 } 470 }
500 471
501 if (filp->f_pos == ext3_get_htree_eof(filp)) 472 if (ctx->pos == ext3_get_htree_eof(file))
502 return 0; /* EOF */ 473 return 0; /* EOF */
503 474
504 /* Someone has messed with f_pos; reset the world */ 475 /* Someone has messed with f_pos; reset the world */
505 if (info->last_pos != filp->f_pos) { 476 if (info->last_pos != ctx->pos) {
506 free_rb_tree_fname(&info->root); 477 free_rb_tree_fname(&info->root);
507 info->curr_node = NULL; 478 info->curr_node = NULL;
508 info->extra_fname = NULL; 479 info->extra_fname = NULL;
509 info->curr_hash = pos2maj_hash(filp, filp->f_pos); 480 info->curr_hash = pos2maj_hash(file, ctx->pos);
510 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); 481 info->curr_minor_hash = pos2min_hash(file, ctx->pos);
511 } 482 }
512 483
513 /* 484 /*
@@ -515,7 +486,7 @@ static int ext3_dx_readdir(struct file * filp,
515 * chain, return them first. 486 * chain, return them first.
516 */ 487 */
517 if (info->extra_fname) { 488 if (info->extra_fname) {
518 if (call_filldir(filp, dirent, filldir, info->extra_fname)) 489 if (!call_filldir(file, ctx, info->extra_fname))
519 goto finished; 490 goto finished;
520 info->extra_fname = NULL; 491 info->extra_fname = NULL;
521 goto next_node; 492 goto next_node;
@@ -529,17 +500,17 @@ static int ext3_dx_readdir(struct file * filp,
529 * cached entries. 500 * cached entries.
530 */ 501 */
531 if ((!info->curr_node) || 502 if ((!info->curr_node) ||
532 (filp->f_version != inode->i_version)) { 503 (file->f_version != inode->i_version)) {
533 info->curr_node = NULL; 504 info->curr_node = NULL;
534 free_rb_tree_fname(&info->root); 505 free_rb_tree_fname(&info->root);
535 filp->f_version = inode->i_version; 506 file->f_version = inode->i_version;
536 ret = ext3_htree_fill_tree(filp, info->curr_hash, 507 ret = ext3_htree_fill_tree(file, info->curr_hash,
537 info->curr_minor_hash, 508 info->curr_minor_hash,
538 &info->next_hash); 509 &info->next_hash);
539 if (ret < 0) 510 if (ret < 0)
540 return ret; 511 return ret;
541 if (ret == 0) { 512 if (ret == 0) {
542 filp->f_pos = ext3_get_htree_eof(filp); 513 ctx->pos = ext3_get_htree_eof(file);
543 break; 514 break;
544 } 515 }
545 info->curr_node = rb_first(&info->root); 516 info->curr_node = rb_first(&info->root);
@@ -548,7 +519,7 @@ static int ext3_dx_readdir(struct file * filp,
548 fname = rb_entry(info->curr_node, struct fname, rb_hash); 519 fname = rb_entry(info->curr_node, struct fname, rb_hash);
549 info->curr_hash = fname->hash; 520 info->curr_hash = fname->hash;
550 info->curr_minor_hash = fname->minor_hash; 521 info->curr_minor_hash = fname->minor_hash;
551 if (call_filldir(filp, dirent, filldir, fname)) 522 if (!call_filldir(file, ctx, fname))
552 break; 523 break;
553 next_node: 524 next_node:
554 info->curr_node = rb_next(info->curr_node); 525 info->curr_node = rb_next(info->curr_node);
@@ -559,7 +530,7 @@ static int ext3_dx_readdir(struct file * filp,
559 info->curr_minor_hash = fname->minor_hash; 530 info->curr_minor_hash = fname->minor_hash;
560 } else { 531 } else {
561 if (info->next_hash == ~0) { 532 if (info->next_hash == ~0) {
562 filp->f_pos = ext3_get_htree_eof(filp); 533 ctx->pos = ext3_get_htree_eof(file);
563 break; 534 break;
564 } 535 }
565 info->curr_hash = info->next_hash; 536 info->curr_hash = info->next_hash;
@@ -567,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
567 } 538 }
568 } 539 }
569finished: 540finished:
570 info->last_pos = filp->f_pos; 541 info->last_pos = ctx->pos;
571 return 0; 542 return 0;
572} 543}
573 544
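For htree directories the file position is not a byte offset at all: ext3_dx_readdir() encodes the (major, minor) hash pair into ctx->pos via hash2pos(), and pos2maj_hash()/pos2min_hash() invert it after a seek through ext3_dir_llseek(). A standalone round-trip of that packing, with the bit layout written from my reading of those helpers (64-bit, non-32-bit-API case), so treat it as illustrative rather than definitive:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t hash2pos(uint32_t major, uint32_t minor)
{
	/* major hash keeps its 31 significant bits; the low bit drops */
	return ((uint64_t)(major >> 1) << 32) | minor;
}

static uint32_t pos2maj_hash(uint64_t pos) { return (pos >> 32) << 1; }
static uint32_t pos2min_hash(uint64_t pos) { return pos & 0xffffffff; }

int main(void)
{
	uint32_t major = 0x8a2b3c4eu & ~1u, minor = 0x01020304u;
	uint64_t pos = hash2pos(major, minor);

	assert(pos2maj_hash(pos) == major);
	assert(pos2min_hash(pos) == minor);
	printf("pos=%#llx major=%#x minor=%#x\n",
	       (unsigned long long)pos, major, minor);
	return 0;
}

End of directory is then a dedicated all-ones position, which is why ext3_get_htree_eof() appears wherever the scan finishes above.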
@@ -582,7 +553,7 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
582const struct file_operations ext3_dir_operations = { 553const struct file_operations ext3_dir_operations = {
583 .llseek = ext3_dir_llseek, 554 .llseek = ext3_dir_llseek,
584 .read = generic_read_dir, 555 .read = generic_read_dir,
585 .readdir = ext3_readdir, 556 .iterate = ext3_readdir,
586 .unlocked_ioctl = ext3_ioctl, 557 .unlocked_ioctl = ext3_ioctl,
587#ifdef CONFIG_COMPAT 558#ifdef CONFIG_COMPAT
588 .compat_ioctl = ext3_compat_ioctl, 559 .compat_ioctl = ext3_compat_ioctl,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c712825640..f67668f724ba 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
1825 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); 1825 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1826} 1826}
1827 1827
1828static void ext3_invalidatepage(struct page *page, unsigned long offset) 1828static void ext3_invalidatepage(struct page *page, unsigned int offset,
1829 unsigned int length)
1829{ 1830{
1830 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1831 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1831 1832
1832 trace_ext3_invalidatepage(page, offset); 1833 trace_ext3_invalidatepage(page, offset, length);
1833 1834
1834 /* 1835 /*
1835 * If it's a full truncate we just forget about the pending dirtying 1836 * If it's a full truncate we just forget about the pending dirtying
1836 */ 1837 */
1837 if (offset == 0) 1838 if (offset == 0 && length == PAGE_CACHE_SIZE)
1838 ClearPageChecked(page); 1839 ClearPageChecked(page);
1839 1840
1840 journal_invalidatepage(journal, page, offset); 1841 journal_invalidatepage(journal, page, offset, length);
1841} 1842}
1842 1843
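The extra length argument exists so that callers such as hole punching can invalidate a sub-range of a page; "invalidate everything" is now offset == 0 && length == PAGE_CACHE_SIZE rather than just offset == 0. A hedged sketch of the semantics (demo_invalidatepage is hypothetical, not the ext3 code):

static void demo_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	/* punching 1k at byte 2048 of a 4k page: offset=2048, length=1024 */
	if (offset == 0 && length == PAGE_CACHE_SIZE) {
		/* only a full-page invalidation may drop per-page
		 * journalling state such as PageChecked */
		ClearPageChecked(page);
	}
	/* a partial range must leave the page's dirty bookkeeping alone */
}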
1843static int ext3_releasepage(struct page *page, gfp_t wait) 1844static int ext3_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 692de13e3596..998ea111e537 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
576 if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 576 if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
577 (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) 577 (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
578 +((char *)de - bh->b_data))) { 578 +((char *)de - bh->b_data))) {
579 /* On error, skip the f_pos to the next block. */ 579 /* silently ignore the rest of the block */
580 dir_file->f_pos = (dir_file->f_pos | 580 break;
581 (dir->i_sb->s_blocksize - 1)) + 1;
582 brelse (bh);
583 return count;
584 } 581 }
585 ext3fs_dirhash(de->name, de->name_len, hinfo); 582 ext3fs_dirhash(de->name, de->name_len, hinfo);
586 if ((hinfo->hash < start_hash) || 583 if ((hinfo->hash < start_hash) ||
@@ -1762,6 +1759,45 @@ retry:
1762 return err; 1759 return err;
1763} 1760}
1764 1761
1762static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
1763{
1764 handle_t *handle;
1765 struct inode *inode;
1766 int err, retries = 0;
1767
1768 dquot_initialize(dir);
1769
1770retry:
1771 handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
1772 4 + EXT3_XATTR_TRANS_BLOCKS);
1773
1774 if (IS_ERR(handle))
1775 return PTR_ERR(handle);
1776
1777 inode = ext3_new_inode (handle, dir, NULL, mode);
1778 err = PTR_ERR(inode);
1779 if (!IS_ERR(inode)) {
1780 inode->i_op = &ext3_file_inode_operations;
1781 inode->i_fop = &ext3_file_operations;
1782 ext3_set_aops(inode);
1783 err = ext3_orphan_add(handle, inode);
1784 if (err)
1785 goto err_drop_inode;
1786 mark_inode_dirty(inode);
1787 d_tmpfile(dentry, inode);
1788 unlock_new_inode(inode);
1789 }
1790 ext3_journal_stop(handle);
1791 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1792 goto retry;
1793 return err;
1794err_drop_inode:
1795 ext3_journal_stop(handle);
1796 unlock_new_inode(inode);
1797 iput(inode);
1798 return err;
1799}
1800
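ext3_new_inode() plus ext3_orphan_add() without a directory entry is the kernel side of open(2)'s new O_TMPFILE: the inode sits on the orphan list so it is reclaimed after a crash, and the i_nlink == 1 case added to ext3_link() below is what takes it off that list when the file is finally given a name. A userspace sketch of that lifecycle (needs a kernel and filesystem with O_TMPFILE support; paths are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("O_TMPFILE");		/* older kernel or fs */
		return 1;
	}
	if (write(fd, "data", 4) != 4)
		perror("write");
	/* materialize the anonymous inode under a visible name */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
		   AT_SYMLINK_FOLLOW) < 0)
		perror("linkat");
	close(fd);
	return 0;
}

Until the linkat() succeeds the file is invisible and crash-safe, which is exactly the property the orphan-list handling buys.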
1765static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) 1801static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
1766{ 1802{
1767 handle_t *handle; 1803 handle_t *handle;
@@ -2303,7 +2339,7 @@ static int ext3_link (struct dentry * old_dentry,
2303 2339
2304retry: 2340retry:
2305 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2341 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2306 EXT3_INDEX_EXTRA_TRANS_BLOCKS); 2342 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
2307 if (IS_ERR(handle)) 2343 if (IS_ERR(handle))
2308 return PTR_ERR(handle); 2344 return PTR_ERR(handle);
2309 2345
@@ -2317,6 +2353,11 @@ retry:
2317 err = ext3_add_entry(handle, dentry, inode); 2353 err = ext3_add_entry(handle, dentry, inode);
2318 if (!err) { 2354 if (!err) {
2319 ext3_mark_inode_dirty(handle, inode); 2355 ext3_mark_inode_dirty(handle, inode);
2356 /* this can happen only for tmpfile being
2357 * linked the first time
2358 */
2359 if (inode->i_nlink == 1)
2360 ext3_orphan_del(handle, inode);
2320 d_instantiate(dentry, inode); 2361 d_instantiate(dentry, inode);
2321 } else { 2362 } else {
2322 drop_nlink(inode); 2363 drop_nlink(inode);
@@ -2519,6 +2560,7 @@ const struct inode_operations ext3_dir_inode_operations = {
2519 .mkdir = ext3_mkdir, 2560 .mkdir = ext3_mkdir,
2520 .rmdir = ext3_rmdir, 2561 .rmdir = ext3_rmdir,
2521 .mknod = ext3_mknod, 2562 .mknod = ext3_mknod,
2563 .tmpfile = ext3_tmpfile,
2522 .rename = ext3_rename, 2564 .rename = ext3_rename,
2523 .setattr = ext3_setattr, 2565 .setattr = ext3_setattr,
2524#ifdef CONFIG_EXT3_FS_XATTR 2566#ifdef CONFIG_EXT3_FS_XATTR
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..58339393fa6e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
682 682
683static inline int test_root(ext4_group_t a, int b) 683static inline int test_root(ext4_group_t a, int b)
684{ 684{
685 int num = b; 685 while (1) {
686 686 if (a < b)
687 while (a > num) 687 return 0;
688 num *= b; 688 if (a == b)
689 return num == a; 689 return 1;
690 if ((a % b) != 0)
691 return 0;
692 a = a / b;
693 }
690} 694}
691 695
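The rewrite replaces a repeated signed multiply, which can exceed INT_MAX (undefined behavior) once group numbers get large, with comparisons and division that stay well defined for any 32-bit group. A standalone comparison of the two forms (plain C, compile with cc -O2):

#include <stdio.h>

static int test_root_old(unsigned int a, int b)
{
	int num = b;

	while (a > (unsigned int)num)
		num *= b;		/* overflows int for large a */
	return (unsigned int)num == a;
}

static int test_root_new(unsigned int a, int b)
{
	while (1) {
		if (a < (unsigned int)b)
			return 0;
		if (a == (unsigned int)b)
			return 1;
		if ((a % b) != 0)
			return 0;
		a = a / b;
	}
}

int main(void)
{
	unsigned int a;

	/* the two agree wherever the old form stays in range */
	for (a = 1; a < 100000; a++)
		if (test_root_old(a, 7) != test_root_new(a, 7))
			printf("disagree at %u\n", a);
	/* 3^20 = 3486784401 > INT_MAX: the old multiply is UB there,
	 * while the division form answers correctly */
	printf("new(3486784401, 3) = %d\n", test_root_new(3486784401u, 3));
	return 0;
}

The new form also bails out early on the first non-multiple, so a typical non-power group costs one modulo instead of a chain of multiplies.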
692static int ext4_group_sparse(ext4_group_t group) 696static int ext4_group_sparse(ext4_group_t group)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f8d56e4254e0..3c7d288ae94c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -29,8 +29,7 @@
29#include "ext4.h" 29#include "ext4.h"
30#include "xattr.h" 30#include "xattr.h"
31 31
32static int ext4_dx_readdir(struct file *filp, 32static int ext4_dx_readdir(struct file *, struct dir_context *);
33 void *dirent, filldir_t filldir);
34 33
35/** 34/**
36 * Check if the given dir-inode refers to an htree-indexed directory 35 * Check if the given dir-inode refers to an htree-indexed directory
@@ -103,60 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
103 return 1; 102 return 1;
104} 103}
105 104
106static int ext4_readdir(struct file *filp, 105static int ext4_readdir(struct file *file, struct dir_context *ctx)
107 void *dirent, filldir_t filldir)
108{ 106{
109 int error = 0;
110 unsigned int offset; 107 unsigned int offset;
111 int i, stored; 108 int i, stored;
112 struct ext4_dir_entry_2 *de; 109 struct ext4_dir_entry_2 *de;
113 int err; 110 int err;
114 struct inode *inode = file_inode(filp); 111 struct inode *inode = file_inode(file);
115 struct super_block *sb = inode->i_sb; 112 struct super_block *sb = inode->i_sb;
116 int ret = 0;
117 int dir_has_error = 0; 113 int dir_has_error = 0;
118 114
119 if (is_dx_dir(inode)) { 115 if (is_dx_dir(inode)) {
120 err = ext4_dx_readdir(filp, dirent, filldir); 116 err = ext4_dx_readdir(file, ctx);
121 if (err != ERR_BAD_DX_DIR) { 117 if (err != ERR_BAD_DX_DIR) {
122 ret = err; 118 return err;
123 goto out;
124 } 119 }
125 /* 120 /*
126 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
127 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
128 */ 123 */
129 ext4_clear_inode_flag(file_inode(filp), 124 ext4_clear_inode_flag(file_inode(file),
130 EXT4_INODE_INDEX); 125 EXT4_INODE_INDEX);
131 } 126 }
132 127
133 if (ext4_has_inline_data(inode)) { 128 if (ext4_has_inline_data(inode)) {
134 int has_inline_data = 1; 129 int has_inline_data = 1;
135 ret = ext4_read_inline_dir(filp, dirent, filldir, 130 int ret = ext4_read_inline_dir(file, ctx,
136 &has_inline_data); 131 &has_inline_data);
137 if (has_inline_data) 132 if (has_inline_data)
138 return ret; 133 return ret;
139 } 134 }
140 135
141 stored = 0; 136 stored = 0;
142 offset = filp->f_pos & (sb->s_blocksize - 1); 137 offset = ctx->pos & (sb->s_blocksize - 1);
143 138
144 while (!error && !stored && filp->f_pos < inode->i_size) { 139 while (ctx->pos < inode->i_size) {
145 struct ext4_map_blocks map; 140 struct ext4_map_blocks map;
146 struct buffer_head *bh = NULL; 141 struct buffer_head *bh = NULL;
147 142
148 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 143 map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
149 map.m_len = 1; 144 map.m_len = 1;
150 err = ext4_map_blocks(NULL, inode, &map, 0); 145 err = ext4_map_blocks(NULL, inode, &map, 0);
151 if (err > 0) { 146 if (err > 0) {
152 pgoff_t index = map.m_pblk >> 147 pgoff_t index = map.m_pblk >>
153 (PAGE_CACHE_SHIFT - inode->i_blkbits); 148 (PAGE_CACHE_SHIFT - inode->i_blkbits);
154 if (!ra_has_index(&filp->f_ra, index)) 149 if (!ra_has_index(&file->f_ra, index))
155 page_cache_sync_readahead( 150 page_cache_sync_readahead(
156 sb->s_bdev->bd_inode->i_mapping, 151 sb->s_bdev->bd_inode->i_mapping,
157 &filp->f_ra, filp, 152 &file->f_ra, file,
158 index, 1); 153 index, 1);
159 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 154 file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
160 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); 155 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
161 } 156 }
162 157
@@ -166,16 +161,16 @@ static int ext4_readdir(struct file *filp,
166 */ 161 */
167 if (!bh) { 162 if (!bh) {
168 if (!dir_has_error) { 163 if (!dir_has_error) {
169 EXT4_ERROR_FILE(filp, 0, 164 EXT4_ERROR_FILE(file, 0,
170 "directory contains a " 165 "directory contains a "
171 "hole at offset %llu", 166 "hole at offset %llu",
172 (unsigned long long) filp->f_pos); 167 (unsigned long long) ctx->pos);
173 dir_has_error = 1; 168 dir_has_error = 1;
174 } 169 }
175 /* corrupt size? Maybe no more blocks to read */ 170 /* corrupt size? Maybe no more blocks to read */
176 if (filp->f_pos > inode->i_blocks << 9) 171 if (ctx->pos > inode->i_blocks << 9)
177 break; 172 break;
178 filp->f_pos += sb->s_blocksize - offset; 173 ctx->pos += sb->s_blocksize - offset;
179 continue; 174 continue;
180 } 175 }
181 176
@@ -183,21 +178,20 @@ static int ext4_readdir(struct file *filp,
183 if (!buffer_verified(bh) && 178 if (!buffer_verified(bh) &&
184 !ext4_dirent_csum_verify(inode, 179 !ext4_dirent_csum_verify(inode,
185 (struct ext4_dir_entry *)bh->b_data)) { 180 (struct ext4_dir_entry *)bh->b_data)) {
186 EXT4_ERROR_FILE(filp, 0, "directory fails checksum " 181 EXT4_ERROR_FILE(file, 0, "directory fails checksum "
187 "at offset %llu", 182 "at offset %llu",
188 (unsigned long long)filp->f_pos); 183 (unsigned long long)ctx->pos);
189 filp->f_pos += sb->s_blocksize - offset; 184 ctx->pos += sb->s_blocksize - offset;
190 brelse(bh); 185 brelse(bh);
191 continue; 186 continue;
192 } 187 }
193 set_buffer_verified(bh); 188 set_buffer_verified(bh);
194 189
195revalidate:
196 /* If the dir block has changed since the last call to 190 /* If the dir block has changed since the last call to
197 * readdir(2), then we might be pointing to an invalid 191 * readdir(2), then we might be pointing to an invalid
198 * dirent right now. Scan from the start of the block 192 * dirent right now. Scan from the start of the block
199 * to make sure. */ 193 * to make sure. */
200 if (filp->f_version != inode->i_version) { 194 if (file->f_version != inode->i_version) {
201 for (i = 0; i < sb->s_blocksize && i < offset; ) { 195 for (i = 0; i < sb->s_blocksize && i < offset; ) {
202 de = (struct ext4_dir_entry_2 *) 196 de = (struct ext4_dir_entry_2 *)
203 (bh->b_data + i); 197 (bh->b_data + i);
@@ -214,57 +208,46 @@ revalidate:
214 sb->s_blocksize); 208 sb->s_blocksize);
215 } 209 }
216 offset = i; 210 offset = i;
217 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 211 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
218 | offset; 212 | offset;
219 filp->f_version = inode->i_version; 213 file->f_version = inode->i_version;
220 } 214 }
221 215
222 while (!error && filp->f_pos < inode->i_size 216 while (ctx->pos < inode->i_size
223 && offset < sb->s_blocksize) { 217 && offset < sb->s_blocksize) {
224 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 218 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
225 if (ext4_check_dir_entry(inode, filp, de, bh, 219 if (ext4_check_dir_entry(inode, file, de, bh,
226 bh->b_data, bh->b_size, 220 bh->b_data, bh->b_size,
227 offset)) { 221 offset)) {
228 /* 222 /*
229 * On error, skip the f_pos to the next block 223 * On error, skip to the next block
230 */ 224 */
231 filp->f_pos = (filp->f_pos | 225 ctx->pos = (ctx->pos |
232 (sb->s_blocksize - 1)) + 1; 226 (sb->s_blocksize - 1)) + 1;
233 brelse(bh); 227 break;
234 ret = stored;
235 goto out;
236 } 228 }
237 offset += ext4_rec_len_from_disk(de->rec_len, 229 offset += ext4_rec_len_from_disk(de->rec_len,
238 sb->s_blocksize); 230 sb->s_blocksize);
239 if (le32_to_cpu(de->inode)) { 231 if (le32_to_cpu(de->inode)) {
240 /* We might block in the next section 232 if (!dir_emit(ctx, de->name,
241 * if the data destination is
242 * currently swapped out. So, use a
243 * version stamp to detect whether or
244 * not the directory has been modified
245 * during the copy operation.
246 */
247 u64 version = filp->f_version;
248
249 error = filldir(dirent, de->name,
250 de->name_len, 233 de->name_len,
251 filp->f_pos,
252 le32_to_cpu(de->inode), 234 le32_to_cpu(de->inode),
253 get_dtype(sb, de->file_type)); 235 get_dtype(sb, de->file_type))) {
254 if (error) 236 brelse(bh);
255 break; 237 return 0;
256 if (version != filp->f_version) 238 }
257 goto revalidate;
258 stored++;
259 } 239 }
260 filp->f_pos += ext4_rec_len_from_disk(de->rec_len, 240 ctx->pos += ext4_rec_len_from_disk(de->rec_len,
261 sb->s_blocksize); 241 sb->s_blocksize);
262 } 242 }
263 offset = 0; 243 offset = 0;
264 brelse(bh); 244 brelse(bh);
245 if (ctx->pos < inode->i_size) {
246 if (!dir_relax(inode))
247 return 0;
248 }
265 } 249 }
266out: 250 return 0;
267 return ret;
268} 251}
269 252
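The dir_relax() call between blocks is new with this series: it drops and reacquires the directory's i_mutex so a long scan no longer blocks lookups and creates for its whole duration, and it returns zero if the directory was removed while unlocked. The pattern, sketched with a hypothetical per-block helper:

static int demo_iterate_blocks(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);

	while (ctx->pos < inode->i_size) {
		/* hypothetical: emit one block's entries via dir_emit() */
		if (!demo_emit_one_block(file, ctx))
			return 0;	/* user buffer full */
		/* let concurrent directory operations in between blocks */
		if (ctx->pos < inode->i_size && !dir_relax(inode))
			return 0;	/* directory went away meanwhile */
	}
	return 0;
}

Entries added or removed while the lock was dropped are caught by the existing f_version check at the top of the loop, which rescans the block from its start.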
270static inline int is_32bit_api(void) 253static inline int is_32bit_api(void)
@@ -492,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
492 * for all entries on the fname linked list. (Normally there is only 475 * for all entries on the fname linked list. (Normally there is only
493 * one entry on the linked list, unless there are 62 bit hash collisions.) 476 * one entry on the linked list, unless there are 62 bit hash collisions.)
494 */ 477 */
495static int call_filldir(struct file *filp, void *dirent, 478static int call_filldir(struct file *file, struct dir_context *ctx,
496 filldir_t filldir, struct fname *fname) 479 struct fname *fname)
497{ 480{
498 struct dir_private_info *info = filp->private_data; 481 struct dir_private_info *info = file->private_data;
499 loff_t curr_pos; 482 struct inode *inode = file_inode(file);
500 struct inode *inode = file_inode(filp); 483 struct super_block *sb = inode->i_sb;
501 struct super_block *sb;
502 int error;
503
504 sb = inode->i_sb;
505 484
506 if (!fname) { 485 if (!fname) {
507 ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " 486 ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
@@ -509,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent,
509 inode->i_ino, current->comm); 488 inode->i_ino, current->comm);
510 return 0; 489 return 0;
511 } 490 }
512 curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); 491 ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
513 while (fname) { 492 while (fname) {
514 error = filldir(dirent, fname->name, 493 if (!dir_emit(ctx, fname->name,
515 fname->name_len, curr_pos, 494 fname->name_len,
516 fname->inode, 495 fname->inode,
517 get_dtype(sb, fname->file_type)); 496 get_dtype(sb, fname->file_type))) {
518 if (error) {
519 filp->f_pos = curr_pos;
520 info->extra_fname = fname; 497 info->extra_fname = fname;
521 return error; 498 return 1;
522 } 499 }
523 fname = fname->next; 500 fname = fname->next;
524 } 501 }
525 return 0; 502 return 0;
526} 503}
527 504
528static int ext4_dx_readdir(struct file *filp, 505static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
529 void *dirent, filldir_t filldir)
530{ 506{
531 struct dir_private_info *info = filp->private_data; 507 struct dir_private_info *info = file->private_data;
532 struct inode *inode = file_inode(filp); 508 struct inode *inode = file_inode(file);
533 struct fname *fname; 509 struct fname *fname;
534 int ret; 510 int ret;
535 511
536 if (!info) { 512 if (!info) {
537 info = ext4_htree_create_dir_info(filp, filp->f_pos); 513 info = ext4_htree_create_dir_info(file, ctx->pos);
538 if (!info) 514 if (!info)
539 return -ENOMEM; 515 return -ENOMEM;
540 filp->private_data = info; 516 file->private_data = info;
541 } 517 }
542 518
543 if (filp->f_pos == ext4_get_htree_eof(filp)) 519 if (ctx->pos == ext4_get_htree_eof(file))
544 return 0; /* EOF */ 520 return 0; /* EOF */
545 521
546 /* Someone has messed with f_pos; reset the world */ 522 /* Someone has messed with f_pos; reset the world */
547 if (info->last_pos != filp->f_pos) { 523 if (info->last_pos != ctx->pos) {
548 free_rb_tree_fname(&info->root); 524 free_rb_tree_fname(&info->root);
549 info->curr_node = NULL; 525 info->curr_node = NULL;
550 info->extra_fname = NULL; 526 info->extra_fname = NULL;
551 info->curr_hash = pos2maj_hash(filp, filp->f_pos); 527 info->curr_hash = pos2maj_hash(file, ctx->pos);
552 info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); 528 info->curr_minor_hash = pos2min_hash(file, ctx->pos);
553 } 529 }
554 530
555 /* 531 /*
@@ -557,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp,
557 * chain, return them first. 533 * chain, return them first.
558 */ 534 */
559 if (info->extra_fname) { 535 if (info->extra_fname) {
560 if (call_filldir(filp, dirent, filldir, info->extra_fname)) 536 if (call_filldir(file, ctx, info->extra_fname))
561 goto finished; 537 goto finished;
562 info->extra_fname = NULL; 538 info->extra_fname = NULL;
563 goto next_node; 539 goto next_node;
@@ -571,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp,
571 * cached entries. 547 * cached entries.
572 */ 548 */
573 if ((!info->curr_node) || 549 if ((!info->curr_node) ||
574 (filp->f_version != inode->i_version)) { 550 (file->f_version != inode->i_version)) {
575 info->curr_node = NULL; 551 info->curr_node = NULL;
576 free_rb_tree_fname(&info->root); 552 free_rb_tree_fname(&info->root);
577 filp->f_version = inode->i_version; 553 file->f_version = inode->i_version;
578 ret = ext4_htree_fill_tree(filp, info->curr_hash, 554 ret = ext4_htree_fill_tree(file, info->curr_hash,
579 info->curr_minor_hash, 555 info->curr_minor_hash,
580 &info->next_hash); 556 &info->next_hash);
581 if (ret < 0) 557 if (ret < 0)
582 return ret; 558 return ret;
583 if (ret == 0) { 559 if (ret == 0) {
584 filp->f_pos = ext4_get_htree_eof(filp); 560 ctx->pos = ext4_get_htree_eof(file);
585 break; 561 break;
586 } 562 }
587 info->curr_node = rb_first(&info->root); 563 info->curr_node = rb_first(&info->root);
@@ -590,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp,
590 fname = rb_entry(info->curr_node, struct fname, rb_hash); 566 fname = rb_entry(info->curr_node, struct fname, rb_hash);
591 info->curr_hash = fname->hash; 567 info->curr_hash = fname->hash;
592 info->curr_minor_hash = fname->minor_hash; 568 info->curr_minor_hash = fname->minor_hash;
593 if (call_filldir(filp, dirent, filldir, fname)) 569 if (call_filldir(file, ctx, fname))
594 break; 570 break;
595 next_node: 571 next_node:
596 info->curr_node = rb_next(info->curr_node); 572 info->curr_node = rb_next(info->curr_node);
@@ -601,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp,
601 info->curr_minor_hash = fname->minor_hash; 577 info->curr_minor_hash = fname->minor_hash;
602 } else { 578 } else {
603 if (info->next_hash == ~0) { 579 if (info->next_hash == ~0) {
604 filp->f_pos = ext4_get_htree_eof(filp); 580 ctx->pos = ext4_get_htree_eof(file);
605 break; 581 break;
606 } 582 }
607 info->curr_hash = info->next_hash; 583 info->curr_hash = info->next_hash;
@@ -609,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp,
609 } 585 }
610 } 586 }
611finished: 587finished:
612 info->last_pos = filp->f_pos; 588 info->last_pos = ctx->pos;
613 return 0; 589 return 0;
614} 590}
615 591
@@ -624,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
624const struct file_operations ext4_dir_operations = { 600const struct file_operations ext4_dir_operations = {
625 .llseek = ext4_dir_llseek, 601 .llseek = ext4_dir_llseek,
626 .read = generic_read_dir, 602 .read = generic_read_dir,
627 .readdir = ext4_readdir, 603 .iterate = ext4_readdir,
628 .unlocked_ioctl = ext4_ioctl, 604 .unlocked_ioctl = ext4_ioctl,
629#ifdef CONFIG_COMPAT 605#ifdef CONFIG_COMPAT
630 .compat_ioctl = ext4_compat_ioctl, 606 .compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0aabb344b02e..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,33 +177,22 @@ struct ext4_map_blocks {
177}; 177};
178 178
179/* 179/*
180 * For delayed allocation tracking
181 */
182struct mpage_da_data {
183 struct inode *inode;
184 sector_t b_blocknr; /* start block number of extent */
185 size_t b_size; /* size of extent */
186 unsigned long b_state; /* state of the extent */
187 unsigned long first_page, next_page; /* extent of pages */
188 struct writeback_control *wbc;
189 int io_done;
190 int pages_written;
191 int retval;
192};
193
194/*
195 * Flags for ext4_io_end->flags 180 * Flags for ext4_io_end->flags
196 */ 181 */
197#define EXT4_IO_END_UNWRITTEN 0x0001 182#define EXT4_IO_END_UNWRITTEN 0x0001
198#define EXT4_IO_END_ERROR 0x0002 183#define EXT4_IO_END_DIRECT 0x0002
199#define EXT4_IO_END_DIRECT 0x0004
200 184
201/* 185/*
202 * For converting uninitialized extents on a work queue. 186 * For converting uninitialized extents on a work queue. 'handle' is used for
187 * buffered writeback.
203 */ 188 */
204typedef struct ext4_io_end { 189typedef struct ext4_io_end {
205 struct list_head list; /* per-file finished IO list */ 190 struct list_head list; /* per-file finished IO list */
191 handle_t *handle; /* handle reserved for extent
192 * conversion */
206 struct inode *inode; /* file being written to */ 193 struct inode *inode; /* file being written to */
194 struct bio *bio; /* Linked list of completed
195 * bios covering the extent */
207 unsigned int flag; /* unwritten or not */ 196 unsigned int flag; /* unwritten or not */
208 loff_t offset; /* offset in the file */ 197 loff_t offset; /* offset in the file */
209 ssize_t size; /* size of the extent */ 198 ssize_t size; /* size of the extent */
@@ -582,11 +571,6 @@ enum {
582#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 571#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
583 572
584/* 573/*
585 * Flags used by ext4_discard_partial_page_buffers
586 */
587#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001
588
589/*
590 * ioctl commands 574 * ioctl commands
591 */ 575 */
592#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS 576#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
@@ -880,6 +864,7 @@ struct ext4_inode_info {
880 rwlock_t i_es_lock; 864 rwlock_t i_es_lock;
881 struct list_head i_es_lru; 865 struct list_head i_es_lru;
882 unsigned int i_es_lru_nr; /* protected by i_es_lock */ 866 unsigned int i_es_lru_nr; /* protected by i_es_lock */
867 unsigned long i_touch_when; /* jiffies of last accessing */
883 868
884 /* ialloc */ 869 /* ialloc */
885 ext4_group_t i_last_alloc_group; 870 ext4_group_t i_last_alloc_group;
@@ -904,12 +889,22 @@ struct ext4_inode_info {
904 qsize_t i_reserved_quota; 889 qsize_t i_reserved_quota;
905#endif 890#endif
906 891
907 /* completed IOs that might need unwritten extents handling */ 892 /* Lock protecting lists below */
908 struct list_head i_completed_io_list;
909 spinlock_t i_completed_io_lock; 893 spinlock_t i_completed_io_lock;
894 /*
895 * Completed IOs that need unwritten extents handling and have
896 * transaction reserved
897 */
898 struct list_head i_rsv_conversion_list;
899 /*
900 * Completed IOs that need unwritten extents handling and don't have
901 * transaction reserved
902 */
903 struct list_head i_unrsv_conversion_list;
910 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 904 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
911 atomic_t i_unwritten; /* Nr. of inflight conversions pending */ 905 atomic_t i_unwritten; /* Nr. of inflight conversions pending */
912 struct work_struct i_unwritten_work; /* deferred extent conversion */ 906 struct work_struct i_rsv_conversion_work;
907 struct work_struct i_unrsv_conversion_work;
913 908
914 spinlock_t i_block_reservation_lock; 909 spinlock_t i_block_reservation_lock;
915 910
@@ -1246,7 +1241,6 @@ struct ext4_sb_info {
1246 unsigned int s_mb_stats; 1241 unsigned int s_mb_stats;
1247 unsigned int s_mb_order2_reqs; 1242 unsigned int s_mb_order2_reqs;
1248 unsigned int s_mb_group_prealloc; 1243 unsigned int s_mb_group_prealloc;
1249 unsigned int s_max_writeback_mb_bump;
1250 unsigned int s_max_dir_size_kb; 1244 unsigned int s_max_dir_size_kb;
1251 /* where last allocation was done - for stream allocation */ 1245 /* where last allocation was done - for stream allocation */
1252 unsigned long s_mb_last_group; 1246 unsigned long s_mb_last_group;
@@ -1282,8 +1276,10 @@ struct ext4_sb_info {
1282 struct flex_groups *s_flex_groups; 1276 struct flex_groups *s_flex_groups;
1283 ext4_group_t s_flex_groups_allocated; 1277 ext4_group_t s_flex_groups_allocated;
1284 1278
1285 /* workqueue for dio unwritten */ 1279 /* workqueue for unreserved extent conversions (dio) */
1286 struct workqueue_struct *dio_unwritten_wq; 1280 struct workqueue_struct *unrsv_conversion_wq;
1281 /* workqueue for reserved extent conversions (buffered io) */
1282 struct workqueue_struct *rsv_conversion_wq;
1287 1283
1288 /* timer for periodic error stats printing */ 1284 /* timer for periodic error stats printing */
1289 struct timer_list s_err_report; 1285 struct timer_list s_err_report;
@@ -1308,6 +1304,7 @@ struct ext4_sb_info {
1308 /* Reclaim extents from extent status tree */ 1304 /* Reclaim extents from extent status tree */
1309 struct shrinker s_es_shrinker; 1305 struct shrinker s_es_shrinker;
1310 struct list_head s_es_lru; 1306 struct list_head s_es_lru;
1307 unsigned long s_es_last_sorted;
1311 struct percpu_counter s_extent_cache_cnt; 1308 struct percpu_counter s_extent_cache_cnt;
1312 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1309 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
1313}; 1310};
@@ -1343,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
1343 struct ext4_io_end *io_end) 1340 struct ext4_io_end *io_end)
1344{ 1341{
1345 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 1342 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
1343 /* Writeback has to have conversion transaction reserved */
1344 WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
1345 !(io_end->flag & EXT4_IO_END_DIRECT));
1346 io_end->flag |= EXT4_IO_END_UNWRITTEN; 1346 io_end->flag |= EXT4_IO_END_UNWRITTEN;
1347 atomic_inc(&EXT4_I(inode)->i_unwritten); 1347 atomic_inc(&EXT4_I(inode)->i_unwritten);
1348 } 1348 }
@@ -2000,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
2000 2000
2001/* fsync.c */ 2001/* fsync.c */
2002extern int ext4_sync_file(struct file *, loff_t, loff_t, int); 2002extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
2003extern int ext4_flush_unwritten_io(struct inode *);
2004 2003
2005/* hash.c */ 2004/* hash.c */
2006extern int ext4fs_dirhash(const char *name, int len, struct 2005extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2089,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
2089extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 2088extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
2090extern int ext4_can_truncate(struct inode *inode); 2089extern int ext4_can_truncate(struct inode *inode);
2091extern void ext4_truncate(struct inode *); 2090extern void ext4_truncate(struct inode *);
2092extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); 2091extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
2093extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); 2092extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
2094extern void ext4_set_inode_flags(struct inode *); 2093extern void ext4_set_inode_flags(struct inode *);
2095extern void ext4_get_inode_flags(struct ext4_inode_info *); 2094extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2097,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
2097extern void ext4_set_aops(struct inode *inode); 2096extern void ext4_set_aops(struct inode *inode);
2098extern int ext4_writepage_trans_blocks(struct inode *); 2097extern int ext4_writepage_trans_blocks(struct inode *);
2099extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 2098extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
2100extern int ext4_discard_partial_page_buffers(handle_t *handle, 2099extern int ext4_block_truncate_page(handle_t *handle,
2101 struct address_space *mapping, loff_t from, 2100 struct address_space *mapping, loff_t from);
2102 loff_t length, int flags); 2101extern int ext4_block_zero_page_range(handle_t *handle,
2102 struct address_space *mapping, loff_t from, loff_t length);
2103extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
2104 loff_t lstart, loff_t lend);
2103extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2105extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2104extern qsize_t *ext4_get_reserved_space(struct inode *inode); 2106extern qsize_t *ext4_get_reserved_space(struct inode *inode);
2105extern void ext4_da_update_reserve_space(struct inode *inode, 2107extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2112,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
2112 const struct iovec *iov, loff_t offset, 2114 const struct iovec *iov, loff_t offset,
2113 unsigned long nr_segs); 2115 unsigned long nr_segs);
2114extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 2116extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
2115extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); 2117extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
2116extern void ext4_ind_truncate(handle_t *, struct inode *inode); 2118extern void ext4_ind_truncate(handle_t *, struct inode *inode);
2117extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 2119extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
2118 ext4_lblk_t first, ext4_lblk_t stop); 2120 ext4_lblk_t first, ext4_lblk_t stop);
@@ -2167,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
2167 ext4_group_t ngroup); 2169 ext4_group_t ngroup);
2168extern const char *ext4_decode_error(struct super_block *sb, int errno, 2170extern const char *ext4_decode_error(struct super_block *sb, int errno,
2169 char nbuf[16]); 2171 char nbuf[16]);
2172
2170extern __printf(4, 5) 2173extern __printf(4, 5)
2171void __ext4_error(struct super_block *, const char *, unsigned int, 2174void __ext4_error(struct super_block *, const char *, unsigned int,
2172 const char *, ...); 2175 const char *, ...);
2173#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
2174 __LINE__, ## message)
2175extern __printf(5, 6) 2176extern __printf(5, 6)
2176void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, 2177void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
2177 const char *, ...); 2178 const char *, ...);
2178extern __printf(5, 6) 2179extern __printf(5, 6)
2179void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, 2180void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
2180 const char *, ...); 2181 const char *, ...);
2181extern void __ext4_std_error(struct super_block *, const char *, 2182extern void __ext4_std_error(struct super_block *, const char *,
2182 unsigned int, int); 2183 unsigned int, int);
2183extern __printf(4, 5) 2184extern __printf(4, 5)
2184void __ext4_abort(struct super_block *, const char *, unsigned int, 2185void __ext4_abort(struct super_block *, const char *, unsigned int,
2185 const char *, ...); 2186 const char *, ...);
2186#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
2187 __LINE__, ## message)
2188extern __printf(4, 5) 2187extern __printf(4, 5)
2189void __ext4_warning(struct super_block *, const char *, unsigned int, 2188void __ext4_warning(struct super_block *, const char *, unsigned int,
2190 const char *, ...); 2189 const char *, ...);
2191#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
2192 __LINE__, ## message)
2193extern __printf(3, 4) 2190extern __printf(3, 4)
2194void ext4_msg(struct super_block *, const char *, const char *, ...); 2191void __ext4_msg(struct super_block *, const char *, const char *, ...);
2195extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, 2192extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
2196 const char *, unsigned int, const char *); 2193 const char *, unsigned int, const char *);
2197#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
2198 __LINE__, msg)
2199extern __printf(7, 8) 2194extern __printf(7, 8)
2200void __ext4_grp_locked_error(const char *, unsigned int, 2195void __ext4_grp_locked_error(const char *, unsigned int,
2201 struct super_block *, ext4_group_t, 2196 struct super_block *, ext4_group_t,
2202 unsigned long, ext4_fsblk_t, 2197 unsigned long, ext4_fsblk_t,
2203 const char *, ...); 2198 const char *, ...);
2204#define ext4_grp_locked_error(sb, grp, message...) \ 2199
2205 __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) 2200#ifdef CONFIG_PRINTK
2201
2202#define ext4_error_inode(inode, func, line, block, fmt, ...) \
2203 __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
2204#define ext4_error_file(file, func, line, block, fmt, ...) \
2205 __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
2206#define ext4_error(sb, fmt, ...) \
2207 __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
2208#define ext4_abort(sb, fmt, ...) \
2209 __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
2210#define ext4_warning(sb, fmt, ...) \
2211 __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
2212#define ext4_msg(sb, level, fmt, ...) \
2213 __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
2214#define dump_mmp_msg(sb, mmp, msg) \
2215 __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
2216#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
2217 __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
2218 fmt, ##__VA_ARGS__)
2219
2220#else
2221
2222#define ext4_error_inode(inode, func, line, block, fmt, ...) \
2223do { \
2224 no_printk(fmt, ##__VA_ARGS__); \
2225 __ext4_error_inode(inode, "", 0, block, " "); \
2226} while (0)
2227#define ext4_error_file(file, func, line, block, fmt, ...) \
2228do { \
2229 no_printk(fmt, ##__VA_ARGS__); \
2230 __ext4_error_file(file, "", 0, block, " "); \
2231} while (0)
2232#define ext4_error(sb, fmt, ...) \
2233do { \
2234 no_printk(fmt, ##__VA_ARGS__); \
2235 __ext4_error(sb, "", 0, " "); \
2236} while (0)
2237#define ext4_abort(sb, fmt, ...) \
2238do { \
2239 no_printk(fmt, ##__VA_ARGS__); \
2240 __ext4_abort(sb, "", 0, " "); \
2241} while (0)
2242#define ext4_warning(sb, fmt, ...) \
2243do { \
2244 no_printk(fmt, ##__VA_ARGS__); \
2245 __ext4_warning(sb, "", 0, " "); \
2246} while (0)
2247#define ext4_msg(sb, level, fmt, ...) \
2248do { \
2249 no_printk(fmt, ##__VA_ARGS__); \
2250 __ext4_msg(sb, "", " "); \
2251} while (0)
2252#define dump_mmp_msg(sb, mmp, msg) \
2253 __dump_mmp_msg(sb, mmp, "", 0, "")
2254#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
2255do { \
2256 no_printk(fmt, ##__VA_ARGS__); \
2257 __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \
2258} while (0)
2259
2260#endif
2261
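When CONFIG_PRINTK is off, each macro still routes the format string through no_printk(), which generates no code but keeps gcc's printf-style type checking, and then calls the bare __ext4_* function so the side effects (marking the filesystem as having errors, aborting the journal, remounting read-only) survive even in a printk-less kernel. The idiom in isolation, as a standalone program:

#include <stdio.h>

#define DEBUG 0

__attribute__((format(printf, 1, 2)))
static inline int no_printk(const char *fmt, ...)
{
	(void)fmt;
	return 0;
}

#define pr_debug(fmt, ...)				\
do {							\
	if (DEBUG)					\
		printf(fmt, ##__VA_ARGS__);		\
	else						\
		no_printk(fmt, ##__VA_ARGS__);		\
} while (0)

int main(void)
{
	int nr = 42;

	pr_debug("inode %d\n", nr);	/* compiled out, still checked */
	/* pr_debug("inode %d\n");  -- would warn: too few arguments */
	return 0;
}

Dropping the arguments entirely would be smaller still, but then a format/argument mismatch would only be caught in CONFIG_PRINTK builds.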
2206extern void ext4_update_dynamic_rev(struct super_block *sb); 2262extern void ext4_update_dynamic_rev(struct super_block *sb);
2207extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 2263extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
2208 __u32 compat); 2264 __u32 compat);
@@ -2313,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
2313{ 2369{
2314 struct ext4_group_info ***grp_info; 2370 struct ext4_group_info ***grp_info;
2315 long indexv, indexh; 2371 long indexv, indexh;
2372 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
2316 grp_info = EXT4_SB(sb)->s_group_info; 2373 grp_info = EXT4_SB(sb)->s_group_info;
2317 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); 2374 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
2318 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); 2375 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2516,7 +2573,7 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
2516 struct inode *parent, 2573 struct inode *parent,
2517 struct inode *inode); 2574 struct inode *inode);
2518extern int ext4_read_inline_dir(struct file *filp, 2575extern int ext4_read_inline_dir(struct file *filp,
2519 void *dirent, filldir_t filldir, 2576 struct dir_context *ctx,
2520 int *has_inline_data); 2577 int *has_inline_data);
2521extern int htree_inlinedir_to_tree(struct file *dir_file, 2578extern int htree_inlinedir_to_tree(struct file *dir_file,
2522 struct inode *dir, ext4_lblk_t block, 2579 struct inode *dir, ext4_lblk_t block,
@@ -2599,8 +2656,7 @@ struct ext4_extent;
2599 2656
2600extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 2657extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
2601extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 2658extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
2602extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 2659extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
2603 int chunk);
2604extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 2660extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
2605 struct ext4_map_blocks *map, int flags); 2661 struct ext4_map_blocks *map, int flags);
2606extern void ext4_ext_truncate(handle_t *, struct inode *); 2662extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2610,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);
2610extern void ext4_ext_release(struct super_block *); 2666extern void ext4_ext_release(struct super_block *);
2611extern long ext4_fallocate(struct file *file, int mode, loff_t offset, 2667extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
2612 loff_t len); 2668 loff_t len);
2613extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 2669extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
2614 ssize_t len); 2670 loff_t offset, ssize_t len);
2615extern int ext4_map_blocks(handle_t *handle, struct inode *inode, 2671extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
2616 struct ext4_map_blocks *map, int flags); 2672 struct ext4_map_blocks *map, int flags);
2617extern int ext4_ext_calc_metadata_amount(struct inode *inode, 2673extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2652,14 +2708,14 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2652/* page-io.c */ 2708/* page-io.c */
2653extern int __init ext4_init_pageio(void); 2709extern int __init ext4_init_pageio(void);
2654extern void ext4_exit_pageio(void); 2710extern void ext4_exit_pageio(void);
2655extern void ext4_ioend_shutdown(struct inode *);
2656extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2711extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2657extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); 2712extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
2658extern int ext4_put_io_end(ext4_io_end_t *io_end); 2713extern int ext4_put_io_end(ext4_io_end_t *io_end);
2659extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); 2714extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
2660extern void ext4_io_submit_init(struct ext4_io_submit *io, 2715extern void ext4_io_submit_init(struct ext4_io_submit *io,
2661 struct writeback_control *wbc); 2716 struct writeback_control *wbc);
2662extern void ext4_end_io_work(struct work_struct *work); 2717extern void ext4_end_io_rsv_work(struct work_struct *work);
2718extern void ext4_end_io_unrsv_work(struct work_struct *work);
2663extern void ext4_io_submit(struct ext4_io_submit *io); 2719extern void ext4_io_submit(struct ext4_io_submit *io);
2664extern int ext4_bio_write_page(struct ext4_io_submit *io, 2720extern int ext4_bio_write_page(struct ext4_io_submit *io,
2665 struct page *page, 2721 struct page *page,
@@ -2672,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
2672extern int ext4_mmp_csum_verify(struct super_block *sb, 2728extern int ext4_mmp_csum_verify(struct super_block *sb,
2673 struct mmp_struct *mmp); 2729 struct mmp_struct *mmp);
2674 2730
2675/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2731/*
2732 * Note that these flags will never ever appear in a buffer_head's state flag.
2733 * See EXT4_MAP_... to see where this is used.
2734 */
2676enum ext4_state_bits { 2735enum ext4_state_bits {
2677 BH_Uninit /* blocks are allocated but uninitialized on disk */ 2736 BH_Uninit /* blocks are allocated but uninitialized on disk */
2678 = BH_JBDPrivateStart, 2737 = BH_JBDPrivateStart,
2679 BH_AllocFromCluster, /* allocated blocks were part of already 2738 BH_AllocFromCluster, /* allocated blocks were part of already
2680 * allocated cluster. Note that this flag will 2739 * allocated cluster. */
2681 * never, ever appear in a buffer_head's state
2682 * flag. See EXT4_MAP_FROM_CLUSTER to see where
2683 * this is used. */
2684}; 2740};
2685 2741
2686BUFFER_FNS(Uninit, uninit)
2687TAS_BUFFER_FNS(Uninit, uninit)
2688
2689/* 2742/*
2690 * Add new method to test whether block and inode bitmaps are properly 2743 * Add new method to test whether block and inode bitmaps are properly
2691 * initialized. With uninit_bg reading the block from disk is not enough 2744 * initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
38/* 38/*
39 * Wrappers for jbd2_journal_start/end. 39 * Wrappers for jbd2_journal_start/end.
40 */ 40 */
41handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, 41static int ext4_journal_check_start(struct super_block *sb)
42 int type, int nblocks)
43{ 42{
44 journal_t *journal; 43 journal_t *journal;
45 44
46 might_sleep(); 45 might_sleep();
47
48 trace_ext4_journal_start(sb, nblocks, _RET_IP_);
49 if (sb->s_flags & MS_RDONLY) 46 if (sb->s_flags & MS_RDONLY)
50 return ERR_PTR(-EROFS); 47 return -EROFS;
51
52 WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); 48 WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
53 journal = EXT4_SB(sb)->s_journal; 49 journal = EXT4_SB(sb)->s_journal;
54 if (!journal)
55 return ext4_get_nojournal();
56 /* 50 /*
57 * Special case here: if the journal has aborted behind our 51 * Special case here: if the journal has aborted behind our
58 * backs (eg. EIO in the commit thread), then we still need to 52 * backs (eg. EIO in the commit thread), then we still need to
59 * take the FS itself readonly cleanly. 53 * take the FS itself readonly cleanly.
60 */ 54 */
61 if (is_journal_aborted(journal)) { 55 if (journal && is_journal_aborted(journal)) {
62 ext4_abort(sb, "Detected aborted journal"); 56 ext4_abort(sb, "Detected aborted journal");
63 return ERR_PTR(-EROFS); 57 return -EROFS;
64 } 58 }
65 return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line); 59 return 0;
60}
61
62handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
63 int type, int blocks, int rsv_blocks)
64{
65 journal_t *journal;
66 int err;
67
68 trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
69 err = ext4_journal_check_start(sb);
70 if (err < 0)
71 return ERR_PTR(err);
72
73 journal = EXT4_SB(sb)->s_journal;
74 if (!journal)
75 return ext4_get_nojournal();
76 return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
77 type, line);
66} 78}
67 79
68int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) 80int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
86 return err; 98 return err;
87} 99}
88 100
101handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
102 int type)
103{
104 struct super_block *sb;
105 int err;
106
107 if (!ext4_handle_valid(handle))
108 return ext4_get_nojournal();
109
110 sb = handle->h_journal->j_private;
111 trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
112 _RET_IP_);
113 err = ext4_journal_check_start(sb);
114 if (err < 0) {
115 jbd2_journal_free_reserved(handle);
116 return ERR_PTR(err);
117 }
118
119 err = jbd2_journal_start_reserved(handle, type, line);
120 if (err < 0)
121 return ERR_PTR(err);
122 return handle;
123}
124
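A hedged schematic of how this pairs with ext4_journal_start_with_reserve() from ext4_jbd2.h below: writeback reserves the credits for the later extent conversion while it may still block, parks them in the io_end, and the end_io path promotes the reservation into a running handle. This is a simplified sketch, not the real ext4 code paths; my_io_end stands in for ext4_io_end_t, and h_rsv_handle is the jbd2 handle field (per the matching jbd2 changes) that carries the parked reservation:

static void demo_writeback_setup(struct inode *inode,
				 ext4_io_end_t *my_io_end,
				 int needed, int rsv)
{
	handle_t *handle;

	handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE,
						 needed, rsv);
	if (IS_ERR(handle))
		return;
	/* park the reservation for the extent conversion at end_io */
	my_io_end->handle = handle->h_rsv_handle;
	handle->h_rsv_handle = NULL;
	ext4_journal_stop(handle);
}

static void demo_end_io(struct inode *inode, ext4_io_end_t *my_io_end)
{
	handle_t *handle;

	/* promote the parked reservation into a running handle */
	handle = ext4_journal_start_reserved(my_io_end->handle,
					     EXT4_HT_EXT_CONVERT);
	if (IS_ERR(handle))
		return;
	ext4_convert_unwritten_extents(handle, inode, my_io_end->offset,
				       my_io_end->size);
	ext4_journal_stop(handle);
}

If the write fails before the conversion is ever needed, the reservation is released with ext4_journal_free_reserved() instead.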
89void ext4_journal_abort_handle(const char *caller, unsigned int line, 125void ext4_journal_abort_handle(const char *caller, unsigned int line,
90 const char *err_fn, struct buffer_head *bh, 126 const char *err_fn, struct buffer_head *bh,
91 handle_t *handle, int err) 127 handle_t *handle, int err)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
134#define EXT4_HT_MIGRATE 8 134#define EXT4_HT_MIGRATE 8
135#define EXT4_HT_MOVE_EXTENTS 9 135#define EXT4_HT_MOVE_EXTENTS 9
136#define EXT4_HT_XATTR 10 136#define EXT4_HT_XATTR 10
137#define EXT4_HT_MAX 11 137#define EXT4_HT_EXT_CONVERT 11
138#define EXT4_HT_MAX 12
138 139
139/** 140/**
140 * struct ext4_journal_cb_entry - Base structure for callback information. 141 * struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
265 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) 266 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
266 267
267handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, 268handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
268 int type, int nblocks); 269 int type, int blocks, int rsv_blocks);
269int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); 270int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
270 271
271#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) 272#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
300} 301}
301 302
302#define ext4_journal_start_sb(sb, type, nblocks) \ 303#define ext4_journal_start_sb(sb, type, nblocks) \
303 __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks)) 304 __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
304 305
305#define ext4_journal_start(inode, type, nblocks) \ 306#define ext4_journal_start(inode, type, nblocks) \
306 __ext4_journal_start((inode), __LINE__, (type), (nblocks)) 307 __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
308
309#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
310 __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
307 311
308static inline handle_t *__ext4_journal_start(struct inode *inode, 312static inline handle_t *__ext4_journal_start(struct inode *inode,
309 unsigned int line, int type, 313 unsigned int line, int type,
310 int nblocks) 314 int blocks, int rsv_blocks)
311{ 315{
312 return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks); 316 return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
317 rsv_blocks);
313} 318}
314 319
315#define ext4_journal_stop(handle) \ 320#define ext4_journal_stop(handle) \
316 __ext4_journal_stop(__func__, __LINE__, (handle)) 321 __ext4_journal_stop(__func__, __LINE__, (handle))
317 322
323#define ext4_journal_start_reserved(handle, type) \
324 __ext4_journal_start_reserved((handle), __LINE__, (type))
325
326handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
327 int type);
328
329static inline void ext4_journal_free_reserved(handle_t *handle)
330{
331 if (ext4_handle_valid(handle))
332 jbd2_journal_free_reserved(handle);
333}
334
318static inline handle_t *ext4_journal_current_handle(void) 335static inline handle_t *ext4_journal_current_handle(void)
319{ 336{
320 return journal_current_handle(); 337 return journal_current_handle();
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 107936db244e..7097b0f680e6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
2125 next_del = ext4_find_delayed_extent(inode, &es); 2125 next_del = ext4_find_delayed_extent(inode, &es);
2126 if (!exists && next_del) { 2126 if (!exists && next_del) {
2127 exists = 1; 2127 exists = 1;
2128 flags |= FIEMAP_EXTENT_DELALLOC; 2128 flags |= (FIEMAP_EXTENT_DELALLOC |
2129 FIEMAP_EXTENT_UNKNOWN);
2129 } 2130 }
2130 up_read(&EXT4_I(inode)->i_data_sem); 2131 up_read(&EXT4_I(inode)->i_data_sem);
2131 2132
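Reporting delalloc extents as DELALLOC|UNKNOWN matches their real state: the data is dirty in memory but no physical block has been assigned yet. A userspace probe for the combination; write to a file and query it before writeback so the range is still delayed (passing FIEMAP_FLAG_SYNC would flush it first):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	size_t sz = sizeof(struct fiemap) + 32 * sizeof(struct fiemap_extent);
	struct fiemap *fm = calloc(1, sz);
	unsigned int i;
	int fd;

	if (!fm || argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	fm->fm_length = ~0ULL;		/* whole file */
	fm->fm_extent_count = 32;	/* room for 32 extent records */
	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;
	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("%llu+%llu delalloc=%d unknown=%d\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_length,
		       !!(fe->fe_flags & FIEMAP_EXTENT_DELALLOC),
		       !!(fe->fe_flags & FIEMAP_EXTENT_UNKNOWN));
	}
	return 0;
}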
@@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2328} 2329}
2329 2330
2330/* 2331/*
2331 * How many index/leaf blocks need to change/allocate to modify nrblocks? 2332 * How many index/leaf blocks need to change/allocate to add @extents extents?
2332 * 2333 *
2333 * if nrblocks fit in a single extent (chunk flag is 1), then 2334 * If we add a single extent, then in the worst case, each tree level
2334 * in the worst case, each tree level 2335 * index/leaf needs to be changed if the tree splits.
2335 * if the tree split due to insert a new extent, then the old tree
2336 * index/leaf need to be updated too
2337 * 2336 *
2338 * If the nrblocks are discontiguous, they could cause 2337 * If more extents are inserted, they could cause the whole tree to split more
2339 * the whole tree split more than once, but this is really rare. 2338 * than once, but this is really rare.
2340 */ 2339 */
2341int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 2340int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2342{ 2341{
2343 int index; 2342 int index;
2344 int depth; 2343 int depth;
@@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2349 2348
2350 depth = ext_depth(inode); 2349 depth = ext_depth(inode);
2351 2350
2352 if (chunk) 2351 if (extents <= 1)
2353 index = depth * 2; 2352 index = depth * 2;
2354 else 2353 else
2355 index = depth * 3; 2354 index = depth * 3;
@@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 	return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = 0;
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-	else if (ext4_should_journal_data(inode))
-		flags |= EXT4_FREE_BLOCKS_FORGET;
+	int flags = get_default_free_blocks_flags(inode);
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
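get_default_free_blocks_flags() centralizes a flag computation that this series previously open-coded at three call sites (here, in ext4_ext_rm_leaf(), and in ext4_ext_remove_space()). The mapping it encodes, shown as a sketch of a typical call (values from the hunk; the comment summarizes the flag semantics as commonly described, so treat it as an interpretation rather than authoritative):

	/*
	 * dir/symlink blocks: METADATA (revoke on free) + FORGET;
	 * data-journalled files: FORGET only; everything else: 0.
	 */
	int flags = get_default_free_blocks_flags(inode);

	ext4_free_blocks(handle, inode, NULL, pblk, num, flags);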
@@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
@@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free the partial cluster at the end of
+		 * the extent, except for the situation when the cluster is
+		 * still used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete it if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store a
+		 * negative cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-	} else {
-		printk(KERN_INFO "strange request: removal(2) "
-		       "%u-%u from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), ee_len);
-	}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
 
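The partial_cluster bookkeeping now uses the sign of a long long as a three-way state: 0 means no candidate, a positive cluster number means "may still need freeing", and a negative value -N means "cluster N is known to be referenced, never free it". A self-contained userspace sketch of that encoding (helper names are hypothetical):

#include <stdio.h>

static void mark_candidate(long long *pc, unsigned long long cluster)
{
	*pc = (long long)cluster;	/* > 0: free later if unused */
}

static void mark_in_use(long long *pc, unsigned long long cluster)
{
	*pc = -(long long)cluster;	/* < 0: still referenced */
}

static int may_free(long long pc, unsigned long long cluster)
{
	return pc > 0 && (unsigned long long)pc == cluster;
}

int main(void)
{
	long long pc = 0;

	mark_candidate(&pc, 7);
	printf("%d\n", may_free(pc, 7));	/* 1 */
	mark_in_use(&pc, 7);
	printf("%d\n", may_free(pc, 7));	/* 0 */
	return 0;
}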
@@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
  * @handle: The journal handle
  * @inode:  The files inode
  * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   have been released from it. It gets negative in case
+ *                   the cluster is still used.
  * @start:  The first block to remove
  * @end:    The last block to remove
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+		 struct ext4_ext_path *path,
+		 long long *partial_cluster,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		return -EIO;
 	}
 	/* find where to start removing */
-	ex = EXT_LAST_EXTENT(eh);
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 		/* If this extent is beyond the end of the hole, skip it */
 		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so if this extent is not cluster aligned we have
+			 * to mark the current cluster as used to avoid
+			 * accidentally freeing it later on
+			 */
+			pblk = ext4_ext_pblock(ex);
+			if (pblk & (sbi->s_cluster_ratio - 1))
+				*partial_cluster =
+					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 				sizeof(struct ext4_extent));
 		}
 		le16_add_cpu(&eh->eh_entries, -1);
-	} else
+	} else if (*partial_cluster > 0)
 		*partial_cluster = 0;
 
 	err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * If there is still a entry in the leaf node, check to see if
-	 * it references the partial cluster.  This is the only place
-	 * where it could; if it doesn't, we can free the cluster.
+	 * Free the partial cluster only if the current extent does not
+	 * reference it. Otherwise we might free a used cluster.
 	 */
-	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	if (*partial_cluster > 0 &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
@@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	ext4_fsblk_t partial_cluster = 0;
+	long long partial_cluster = 0;
 	handle_t *handle;
 	int i = 0, err = 0;
 
@@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		return PTR_ERR(handle);
 
 again:
-	trace_ext4_ext_remove_space(inode, start, depth);
+	trace_ext4_ext_remove_space(inode, start, end, depth);
 
 	/*
 	 * Check if we are removing extents inside the extent tree. If that
@@ -2844,17 +2866,14 @@ again:
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-			path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
 
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
-	if (partial_cluster && path->p_hdr->eh_entries == 0) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -3642,7 +3661,7 @@ int ext4_find_delalloc_range(struct inode *inode,
 {
 	struct extent_status es;
 
-	ext4_es_find_delayed_extent(inode, lblk_start, &es);
+	ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
 	if (es.es_len == 0)
 		return 0; /* there is no delay extent in this tree */
 	else if (es.es_lblk <= lblk_start &&
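Callers now pass an end block so the status-tree walk stops at the range boundary instead of scanning to the end of the file. The same bounded probe, isolated (a sketch using the renamed API from this hunk):

	struct extent_status es;

	/* Look for a delayed extent only within [lblk_start, lblk_end]. */
	ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
	if (es.es_len == 0)
		return 0;	/* nothing delayed inside the range */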
@@ -4363,7 +4382,7 @@ out2:
 	}
 
 out3:
-	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+	trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
 
 	return err ? err : allocated;
 }
@@ -4446,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return ext4_punch_hole(file, offset, len);
+		return ext4_punch_hole(inode, offset, len);
 
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
@@ -4548,10 +4567,9 @@ retry:
  * function, to convert the fallocated extents after IO is completed.
  * Returns 0 on success.
  */
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				   ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+				   loff_t offset, ssize_t len)
 {
-	handle_t *handle;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
@@ -4566,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
 	/*
-	 * credits to insert 1 extent into extent tree
+	 * This is somewhat ugly but the idea is clear: when a transaction is
+	 * reserved, everything goes into it. Otherwise we rather start several
+	 * smaller transactions for conversion of each extent separately.
 	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		credits = 0;
+	} else {
+		/*
+		 * credits to insert 1 extent into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	}
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
+		if (credits) {
+			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+						    credits);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				break;
+			}
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 				    inode->i_ino, map.m_lblk,
 				    map.m_len, ret);
 		ext4_mark_inode_dirty(handle, inode);
-		ret2 = ext4_journal_stop(handle);
-		if (ret <= 0 || ret2 )
+		if (credits)
+			ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
 			break;
 	}
+	if (!credits)
+		ret2 = ext4_journal_stop(handle);
 	return ret > 0 ? ret2 : ret;
 }
 
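credits doubles as the mode flag here: zero means "a reserved handle already covers everything", non-zero means "start and stop a small transaction per extent". The control flow, stripped to its skeleton (convert_one_extent() is an illustrative placeholder; error paths elided):

	if (handle) {		/* caller supplied a reserved handle */
		handle = ext4_journal_start_reserved(handle, EXT4_HT_EXT_CONVERT);
		credits = 0;	/* one big transaction, no per-loop start/stop */
	} else {
		credits = ext4_chunk_trans_blocks(inode, max_blocks);
	}

	while (/* extents remain */) {
		if (credits)
			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
						    credits);
		convert_one_extent(handle);	/* placeholder */
		if (credits)
			ext4_journal_stop(handle);
	}
	if (!credits)
		ext4_journal_stop(handle);	/* close the reserved handle once */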
@@ -4608,9 +4645,10 @@ static int ext4_find_delayed_extent(struct inode *inode,
 	struct extent_status es;
 	ext4_lblk_t block, next_del;
 
-	ext4_es_find_delayed_extent(inode, newes->es_lblk, &es);
-
 	if (newes->es_pblk == 0) {
+		ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
+				newes->es_lblk + newes->es_len - 1, &es);
+
 		/*
 		 * No extent in extent-tree contains block @newes->es_pblk,
 		 * then the block may stay in 1)a hole or 2)delayed-extent.
@@ -4630,7 +4668,7 @@ static int ext4_find_delayed_extent(struct inode *inode,
 	}
 
 	block = newes->es_lblk + newes->es_len;
-	ext4_es_find_delayed_extent(inode, block, &es);
+	ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
 	if (es.es_len == 0)
 		next_del = EXT_MAX_BLOCKS;
 	else
@@ -4658,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
 		return error;
-	physical = iloc.bh->b_blocknr << blockbits;
+	physical = (__u64)iloc.bh->b_blocknr << blockbits;
 	offset = EXT4_GOOD_OLD_INODE_SIZE +
 			EXT4_I(inode)->i_extra_isize;
 	physical += offset;
@@ -4666,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		flags |= FIEMAP_EXTENT_DATA_INLINE;
 		brelse(iloc.bh);
 	} else { /* external block */
-		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
 		length = inode->i_sb->s_blocksize;
 	}
 
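Both casts above fix the same class of bug: a 32-bit block number shifted left by the block-size bits is evaluated in 32-bit arithmetic and silently truncated before the 64-bit assignment (the loff_t casts in fs/ext4/file.c later in this section are the same fix). A self-contained demonstration:

#include <stdio.h>

int main(void)
{
	unsigned int blocknr = 0x00400000;	/* plausible on a large fs */
	int blockbits = 12;			/* 4 KiB blocks */

	unsigned long long wrong = blocknr << blockbits;  /* 32-bit shift */
	unsigned long long right = (unsigned long long)blocknr << blockbits;

	printf("wrong = %#llx\n", wrong);	/* 0x0 - high bits lost */
	printf("right = %#llx\n", right);	/* 0x400000000 */
	return 0;
}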
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index fe3337a85ede..ee018d5f397e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
  * Ext4 extents status tree core functions.
  */
 #include <linux/rbtree.h>
+#include <linux/list_sort.h>
 #include "ext4.h"
 #include "extents_status.h"
 #include "ext4_extents.h"
@@ -232,14 +233,16 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
 }
 
 /*
- * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk
- * if it exists, otherwise, the next extent after @es->lblk.
+ * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
+ * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
  *
  * @inode: the inode which owns delayed extents
  * @lblk: the offset where we start to search
+ * @end: the offset where we stop to search
  * @es: delayed extent that we found
  */
-void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+void ext4_es_find_delayed_extent_range(struct inode *inode,
+				 ext4_lblk_t lblk, ext4_lblk_t end,
 				 struct extent_status *es)
 {
 	struct ext4_es_tree *tree = NULL;
@@ -247,7 +250,8 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
 	struct rb_node *node;
 
 	BUG_ON(es == NULL);
-	trace_ext4_es_find_delayed_extent_enter(inode, lblk);
+	BUG_ON(end < lblk);
+	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
 
 	read_lock(&EXT4_I(inode)->i_es_lock);
 	tree = &EXT4_I(inode)->i_es_tree;
@@ -270,6 +274,10 @@ out:
 	if (es1 && !ext4_es_is_delayed(es1)) {
 		while ((node = rb_next(&es1->rb_node)) != NULL) {
 			es1 = rb_entry(node, struct extent_status, rb_node);
+			if (es1->es_lblk > end) {
+				es1 = NULL;
+				break;
+			}
 			if (ext4_es_is_delayed(es1))
 				break;
 		}
@@ -284,8 +292,7 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
-	trace_ext4_es_find_delayed_extent_exit(inode, es);
+	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
 static struct extent_status *
@@ -665,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
 
 	return err;
@@ -727,7 +733,6 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
 	return found;
 }
@@ -871,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
 				      EXTENT_STATUS_WRITTEN);
 }
 
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+				     struct list_head *b)
+{
+	struct ext4_inode_info *eia, *eib;
+	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+	if (eia->i_touch_when == eib->i_touch_when)
+		return 0;
+	if (time_after(eia->i_touch_when, eib->i_touch_when))
+		return 1;
+	else
+		return -1;
+}
+
 static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct ext4_sb_info *sbi = container_of(shrink,
 				struct ext4_sb_info, s_es_shrinker);
 	struct ext4_inode_info *ei;
-	struct list_head *cur, *tmp, scanned;
+	struct list_head *cur, *tmp;
+	LIST_HEAD(skipped);
 	int nr_to_scan = sc->nr_to_scan;
 	int ret, nr_shrunk = 0;
 
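list_sort() wants a three-way comparator, and this one orders inodes oldest-first by i_touch_when, using time_after() so jiffies wraparound is handled. The same ordering rule in a self-contained userspace sketch (after() approximates the kernel's time_after() for unsigned tick counters):

#include <stdio.h>

static int after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;	/* wrap-safe "a is later than b" */
}

static int touch_time_cmp(unsigned long a, unsigned long b)
{
	if (a == b)
		return 0;
	return after(a, b) ? 1 : -1;	/* newer sorts later: oldest-first */
}

int main(void)
{
	printf("%d\n", touch_time_cmp(100, 200));	/* -1 */
	printf("%d\n", touch_time_cmp(200, 100));	/*  1 */
	printf("%d\n", touch_time_cmp(5, 5));		/*  0 */
	return 0;
}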
@@ -886,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	if (!nr_to_scan)
 		return ret;
 
-	INIT_LIST_HEAD(&scanned);
-
 	spin_lock(&sbi->s_es_lru_lock);
+
+	/*
+	 * If the inode at the head of the LRU list is newer than the
+	 * last_sorted time, we need to sort the list.
+	 */
+	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+	if (sbi->s_es_last_sorted < ei->i_touch_when) {
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+	}
+
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
-		list_move_tail(cur, &scanned);
+		/*
+		 * If we have already reclaimed all extents from the extent
+		 * status tree, just stop the loop immediately.
+		 */
+		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+			break;
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		read_lock(&ei->i_es_lock);
-		if (ei->i_es_lru_nr == 0) {
-			read_unlock(&ei->i_es_lock);
+		/* Skip inodes that are newer than the last_sorted time */
+		if (sbi->s_es_last_sorted < ei->i_touch_when) {
+			list_move_tail(cur, &skipped);
 			continue;
 		}
-		read_unlock(&ei->i_es_lock);
+
+		if (ei->i_es_lru_nr == 0)
+			continue;
 
 		write_lock(&ei->i_es_lock);
 		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		if (ei->i_es_lru_nr == 0)
+			list_del_init(&ei->i_es_lru);
 		write_unlock(&ei->i_es_lock);
 
 		nr_shrunk += ret;
@@ -910,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		if (nr_to_scan == 0)
 			break;
 	}
-	list_splice_tail(&scanned, &sbi->s_es_lru);
+
+	/* Move the newer inodes to the tail of the LRU list. */
+	list_splice_tail(&skipped, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 
 	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
@@ -918,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	return ret;
 }
 
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 {
-	struct ext4_sb_info *sbi;
-
-	sbi = EXT4_SB(sb);
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
+	sbi->s_es_last_sorted = 0;
 	sbi->s_es_shrinker.shrink = ext4_es_shrink;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&sbi->s_es_shrinker);
 }
 
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
-	unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+	unregister_shrinker(&sbi->s_es_shrinker);
 }
 
 void ext4_es_lru_add(struct inode *inode)
@@ -940,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode)
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
+	ei->i_touch_when = jiffies;
+
+	if (!list_empty(&ei->i_es_lru))
+		return;
+
 	spin_lock(&sbi->s_es_lru_lock);
 	if (list_empty(&ei->i_es_lru))
 		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
-	else
-		list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 }
 
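ext4_es_lru_add() now timestamps the inode unconditionally and takes the spinlock only when the inode is not yet queued; ordering is deferred to the shrinker's list_sort() instead of a list_move_tail() on every call. The list_empty() test is repeated under the lock because the unlocked check is only an optimization - the classic double-checked pattern, shown in isolation:

	ei->i_touch_when = jiffies;	/* lock-free timestamp update */

	if (!list_empty(&ei->i_es_lru))	/* unlocked fast path: already queued */
		return;

	spin_lock(&sbi->s_es_lru_lock);
	if (list_empty(&ei->i_es_lru))	/* re-check under the lock */
		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
	spin_unlock(&sbi->s_es_lru_lock);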
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index d8e2d4dc311e..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
 				 EXTENT_STATUS_DELAYED | \
 				 EXTENT_STATUS_HOLE)
 
+struct ext4_sb_info;
 struct ext4_extent;
 
 struct extent_status {
@@ -62,7 +63,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 				 unsigned long long status);
 extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 				 ext4_lblk_t len);
-extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+extern void ext4_es_find_delayed_extent_range(struct inode *inode,
+					ext4_lblk_t lblk, ext4_lblk_t end,
 					struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 				 struct extent_status *es);
@@ -118,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
 	es->es_pblk = block;
 }
 
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
 extern void ext4_es_lru_del(struct inode *inode);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4959e29573b6..6f4cc567c382 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 	blkbits = inode->i_sb->s_blocksize_bits;
 	startoff = *offset;
 	lastoff = startoff;
-	endoff = (map->m_lblk + map->m_len) << blkbits;
+	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
 
 	index = startoff >> PAGE_CACHE_SHIFT;
 	end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -465,10 +465,10 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		 * If there is a delay extent at this offset,
 		 * it will be as a data.
 		 */
-		ext4_es_find_delayed_extent(inode, last, &es);
+		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		}
 
 		last++;
-		dataoff = last << blkbits;
+		dataoff = (loff_t)last << blkbits;
 	} while (last <= end);
 
 	mutex_unlock(&inode->i_mutex);
@@ -494,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 	if (dataoff > isize)
 		return -ENXIO;
 
-	if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (dataoff > maxsize)
-		return -EINVAL;
-
-	if (dataoff != file->f_pos) {
-		file->f_pos = dataoff;
-		file->f_version = 0;
-	}
-
-	return dataoff;
+	return vfs_setpos(file, dataoff, maxsize);
 }
 
 /*
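The ten deleted lines are exactly what the generic helper does: vfs_setpos() clamps the offset and updates f_pos/f_version in one place. Roughly (an approximation of the VFS helper for orientation, not a verbatim copy):

static loff_t vfs_setpos_sketch(struct file *file, loff_t offset, loff_t maxsize)
{
	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
		return -EINVAL;
	if (offset > maxsize)
		return -EINVAL;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	return offset;
}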
@@ -540,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			last += ret;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -548,10 +538,10 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		 * If there is a delay extent at this offset,
 		 * we will skip this extent.
 		 */
-		ext4_es_find_delayed_extent(inode, last, &es);
+		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			last = es.es_lblk + es.es_len;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -566,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 					      &map, &holeoff);
 		if (!unwritten) {
 			last += ret;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 	}
@@ -580,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 	if (holeoff > isize)
 		holeoff = isize;
 
-	if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (holeoff > maxsize)
-		return -EINVAL;
-
-	if (holeoff != file->f_pos) {
-		file->f_pos = holeoff;
-		file->f_version = 0;
-	}
-
-	return holeoff;
+	return vfs_setpos(file, holeoff, maxsize);
 }
 
 /*
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a408def..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
 	return ret;
 }
 
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode:	inode to sync
- * @datasync:	only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking.  This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly.  The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = sync_inode_metadata(inode, 1);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret, err;
+	int ret = 0, err;
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_ext4_sync_file_enter(file, datasync);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_sb->s_flags & MS_RDONLY)
-		goto out;
-
-	ret = ext4_flush_unwritten_io(inode);
-	if (ret < 0)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			ret = -EROFS;
 		goto out;
+	}
 
 	if (!journal) {
-		ret = __sync_inode(inode, datasync);
+		ret = generic_file_fsync(file, start, end, datasync);
 		if (!ret && !hlist_empty(&inode->i_dentry))
 			ret = ext4_sync_parent(inode);
 		goto out;
 	}
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
 	/*
 	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		if (!ret)
 			ret = err;
 	}
- out:
-	mutex_unlock(&inode->i_mutex);
+out:
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d67b54..f03598c6ffd3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -747,7 +747,8 @@ repeat_in_this_group:
 	if (!handle) {
 		BUG_ON(nblocks <= 0);
 		handle = __ext4_journal_start_sb(dir->i_sb, line_no,
-						 handle_type, nblocks);
+						 handle_type, nblocks,
+						 0);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
 			ext4_std_error(sb, err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d351e24f..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -624,7 +624,7 @@ cleanup:
 		partial--;
 	}
 out:
-	trace_ext4_ind_map_blocks_exit(inode, map, err);
+	trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
 	return err;
 }
 
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode)) {
-		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
-			mutex_lock(&inode->i_mutex);
-			ext4_flush_unwritten_io(inode);
-			mutex_unlock(&inode->i_mutex);
-		}
 		/*
 		 * Nolock dioread optimization may be dynamically disabled
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
 {
-	int indirects;
-
-	/* if nrblocks are contiguous */
-	if (chunk) {
-		/*
-		 * With N contiguous data blocks, we need at most
-		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
-		 * 2 dindirect blocks, and 1 tindirect block
-		 */
-		return DIV_ROUND_UP(nrblocks,
-				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
-	}
 	/*
-	 * if nrblocks are not contiguous, worse case, each block touch
-	 * a indirect block, and each indirect block touch a double indirect
-	 * block, plus a triple indirect block
+	 * With N contiguous data blocks, we need at most
+	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+	 * 2 dindirect blocks, and 1 tindirect block
 	 */
-	indirects = nrblocks * 2 + 1;
-	return indirects;
+	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 
 /*
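With the dead discontiguous branch gone, the formula reads directly off the comment: N/EXT4_ADDR_PER_BLOCK + 1 indirect blocks, 2 dindirect, 1 tindirect. A worked example with 4 KiB blocks (1024 block pointers per indirect block):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int addr_per_block = 1024;	/* 4 KiB block / 4-byte pointers */
	int nrblocks = 2048;

	/* 2048/1024 + 1 indirect + 2 dindirect + 1 tindirect = 6 */
	printf("%d\n", DIV_ROUND_UP(nrblocks, addr_per_block) + 4);
	return 0;
}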
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			     __le32 *last)
 {
 	__le32 *p;
-	int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+	int flags = EXT4_FREE_BLOCKS_VALIDATED;
 	int err;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA;
+		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+	else if (ext4_should_journal_data(inode))
+		flags |= EXT4_FREE_BLOCKS_FORGET;
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3e2bf873e8a8..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
 	entry = (struct ext4_xattr_entry *)
 		((void *)raw_inode + EXT4_I(inode)->i_inline_off);
 
-	free += le32_to_cpu(entry->e_value_size);
+	free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
 	goto out;
 	}
 
@@ -1404,16 +1404,15 @@ out:
  * offset as if '.' and '..' really take place.
  *
  */
-int ext4_read_inline_dir(struct file *filp,
-			 void *dirent, filldir_t filldir,
+int ext4_read_inline_dir(struct file *file,
+			 struct dir_context *ctx,
 			 int *has_inline_data)
 {
-	int error = 0;
 	unsigned int offset, parent_ino;
-	int i, stored;
+	int i;
 	struct ext4_dir_entry_2 *de;
 	struct super_block *sb;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	int ret, inline_size = 0;
 	struct ext4_iloc iloc;
 	void *dir_buf = NULL;
@@ -1444,9 +1443,8 @@ int ext4_read_inline_dir(struct file *filp,
 		goto out;
 
 	sb = inode->i_sb;
-	stored = 0;
 	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
-	offset = filp->f_pos;
+	offset = ctx->pos;
 
 	/*
 	 * dotdot_offset and dotdot_size is the real offset and
@@ -1460,104 +1458,74 @@ int ext4_read_inline_dir(struct file *filp,
 	extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
 	extra_size = extra_offset + inline_size;
 
-	while (!error && !stored && filp->f_pos < extra_size) {
-revalidate:
-		/*
-		 * If the version has changed since the last call to
-		 * readdir(2), then we might be pointing to an invalid
-		 * dirent right now.  Scan from the start of the inline
-		 * dir to make sure.
-		 */
-		if (filp->f_version != inode->i_version) {
-			for (i = 0; i < extra_size && i < offset;) {
-				/*
-				 * "." is with offset 0 and
-				 * ".." is dotdot_offset.
-				 */
-				if (!i) {
-					i = dotdot_offset;
-					continue;
-				} else if (i == dotdot_offset) {
-					i = dotdot_size;
-					continue;
-				}
-				/* for other entry, the real offset in
-				 * the buf has to be tuned accordingly.
-				 */
-				de = (struct ext4_dir_entry_2 *)
-					(dir_buf + i - extra_offset);
-				/* It's too expensive to do a full
-				 * dirent test each time round this
-				 * loop, but we do have to test at
-				 * least that it is non-zero.  A
-				 * failure will be detected in the
-				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len,
-					extra_size) < EXT4_DIR_REC_LEN(1))
-					break;
-				i += ext4_rec_len_from_disk(de->rec_len,
-							    extra_size);
-			}
-			offset = i;
-			filp->f_pos = offset;
-			filp->f_version = inode->i_version;
-		}
-
-		while (!error && filp->f_pos < extra_size) {
-			if (filp->f_pos == 0) {
-				error = filldir(dirent, ".", 1, 0, inode->i_ino,
-						DT_DIR);
-				if (error)
-					break;
-				stored++;
-				filp->f_pos = dotdot_offset;
-				continue;
-			}
-
-			if (filp->f_pos == dotdot_offset) {
-				error = filldir(dirent, "..", 2,
-						dotdot_offset,
-						parent_ino, DT_DIR);
-				if (error)
-					break;
-				stored++;
-
-				filp->f_pos = dotdot_size;
-				continue;
-			}
-
-			de = (struct ext4_dir_entry_2 *)
-				(dir_buf + filp->f_pos - extra_offset);
-			if (ext4_check_dir_entry(inode, filp, de,
-						 iloc.bh, dir_buf,
-						 extra_size, filp->f_pos)) {
-				ret = stored;
-				goto out;
-			}
-			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
-						de->name_len,
-						filp->f_pos,
-						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored++;
-			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
-							      extra_size);
-		}
+	/*
+	 * If the version has changed since the last call to
+	 * readdir(2), then we might be pointing to an invalid
+	 * dirent right now.  Scan from the start of the inline
+	 * dir to make sure.
+	 */
+	if (file->f_version != inode->i_version) {
+		for (i = 0; i < extra_size && i < offset;) {
+			/*
+			 * "." is with offset 0 and
+			 * ".." is dotdot_offset.
+			 */
+			if (!i) {
+				i = dotdot_offset;
+				continue;
+			} else if (i == dotdot_offset) {
+				i = dotdot_size;
+				continue;
+			}
+			/* for other entry, the real offset in
+			 * the buf has to be tuned accordingly.
+			 */
+			de = (struct ext4_dir_entry_2 *)
+				(dir_buf + i - extra_offset);
+			/* It's too expensive to do a full
+			 * dirent test each time round this
+			 * loop, but we do have to test at
+			 * least that it is non-zero.  A
+			 * failure will be detected in the
+			 * dirent test below. */
+			if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+			    < EXT4_DIR_REC_LEN(1))
+				break;
+			i += ext4_rec_len_from_disk(de->rec_len,
+						    extra_size);
+		}
+		offset = i;
+		ctx->pos = offset;
+		file->f_version = inode->i_version;
+	}
+
+	while (ctx->pos < extra_size) {
+		if (ctx->pos == 0) {
+			if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_offset;
+			continue;
+		}
+
+		if (ctx->pos == dotdot_offset) {
+			if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_size;
+			continue;
+		}
+
+		de = (struct ext4_dir_entry_2 *)
+			(dir_buf + ctx->pos - extra_offset);
+		if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+					 extra_size, ctx->pos))
+			goto out;
+		if (le32_to_cpu(de->inode)) {
+			if (!dir_emit(ctx, de->name, de->name_len,
+				      le32_to_cpu(de->inode),
+				      get_dtype(sb, de->file_type)))
+				goto out;
+		}
+		ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
 	}
 out:
 	kfree(dir_buf);
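The readdir path is converted from the filldir callback to the VFS dir_context API: dir_emit() returns false when the caller's buffer is full, so the loop simply stops, and ctx->pos is the single source of truth for where to resume - which is why the old error/stored/revalidate bookkeeping disappears. A minimal sketch of an iterate-style directory walk using this API (a toy example, not the ext4 function):

static int demo_iterate(struct file *file, struct dir_context *ctx)
{
	if (ctx->pos == 0) {
		if (!dir_emit(ctx, ".", 1, file_inode(file)->i_ino, DT_DIR))
			return 0;	/* buffer full: resume at pos 0 later */
		ctx->pos = 1;
	}
	if (ctx->pos == 1) {
		if (!dir_emit(ctx, "..", 2, 2 /* illustrative ino */, DT_DIR))
			return 0;
		ctx->pos = 2;
	}
	if (!dir_emit(ctx, "hello", 5, 1234 /* illustrative ino */, DT_REG))
		return 0;
	ctx->pos++;
	return 0;
}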
@@ -1842,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
 	if (error)
 		goto out;
 
-	physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
 	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
 	physical += offsetof(struct ext4_inode, i_block);
 	length = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0723774bdfb5..0188e65e1f58 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					   new_size);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)
 			filemap_write_and_wait(&inode->i_data);
 		}
 		truncate_inode_pages(&inode->i_data, 0);
-		ext4_ioend_shutdown(inode);
+
+		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
 	}
 
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
-	ext4_ioend_shutdown(inode);
 
+	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
 		goto no_delete;
 
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
 #define check_block_validity(inode, map)	\
 	__check_block_validity((inode), __func__, __LINE__, (map))
 
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
-				    unsigned int max_pages)
-{
-	struct address_space *mapping = inode->i_mapping;
-	pgoff_t	index;
-	struct pagevec pvec;
-	pgoff_t num = 0;
-	int i, nr_pages, done = 0;
-
-	if (max_pages == 0)
-		return 0;
-	pagevec_init(&pvec, 0);
-	while (!done) {
-		index = idx;
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      (pgoff_t)PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-			struct buffer_head *bh, *head;
-
-			lock_page(page);
-			if (unlikely(page->mapping != mapping) ||
-			    !PageDirty(page) ||
-			    PageWriteback(page) ||
-			    page->index != idx) {
-				done = 1;
-				unlock_page(page);
-				break;
-			}
-			if (page_has_buffers(page)) {
-				bh = head = page_buffers(page);
-				do {
-					if (!buffer_delay(bh) &&
-					    !buffer_unwritten(bh))
-						done = 1;
-					bh = bh->b_this_page;
-				} while (!done && (bh != head));
-			}
-			unlock_page(page);
-			if (done)
-				break;
-			idx++;
-			num++;
-			if (num >= max_pages) {
-				done = 1;
-				break;
-			}
-		}
-		pagevec_release(&pvec);
-	}
-	return num;
-}
-
 #ifdef ES_AGGRESSIVE_TEST
 static void ext4_map_blocks_es_recheck(handle_t *handle,
 					struct inode *inode,
@@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);
 
+	ext4_es_lru_add(inode);
+
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
 		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
@@ -1118,10 +1061,13 @@ static int ext4_write_end(struct file *file,
 		}
 	}
 
-	if (ext4_has_inline_data(inode))
-		copied = ext4_write_inline_data_end(inode, pos, len,
-						    copied, page);
-	else
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0)
+			goto errout;
+		copied = ret;
+	} else
 		copied = block_write_end(file, mapping, pos,
 					 len, copied, page, fsdata);
 
@@ -1157,8 +1103,6 @@ static int ext4_write_end(struct file *file,
 	if (i_size_changed)
 		ext4_mark_inode_dirty(handle, inode);
 
-	if (copied < 0)
-		ret = copied;
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
@@ -1415,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
-					     unsigned long offset)
+					     unsigned int offset,
+					     unsigned int length)
 {
 	int to_release = 0;
 	struct buffer_head *head, *bh;
 	unsigned int curr_off = 0;
 	struct inode *inode = page->mapping->host;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned int stop = offset + length;
 	int num_clusters;
 	ext4_fsblk_t lblk;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
 		unsigned int next_off = curr_off + bh->b_size;
 
+		if (next_off > stop)
+			break;
+
 		if ((offset <= curr_off) && (buffer_delay(bh))) {
 			to_release++;
 			clear_buffer_delay(bh);
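invalidatepage now takes a byte range, so only buffers that lie entirely below stop = offset + length are considered; the walk breaks as soon as a buffer would end past the range. The range test over fixed-size buffers, as a self-contained sketch:

#include <stdio.h>

int main(void)
{
	unsigned int bsize = 1024, page_size = 4096;
	unsigned int offset = 1024, length = 2048;	/* bytes [1024, 3072) */
	unsigned int stop = offset + length, curr = 0;

	while (curr < page_size) {
		unsigned int next = curr + bsize;

		if (next > stop)
			break;			/* buffer ends past the range */
		if (offset <= curr)
			printf("drop buffer at %u\n", curr);	/* 1024, 2048 */
		curr = next;
	}
	return 0;
}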
@@ -1460,145 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page,
 	 * Delayed allocation stuff
 	 */
 
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
-			      struct ext4_map_blocks *map)
-{
-	struct pagevec pvec;
-	unsigned long index, end;
-	int ret = 0, err, nr_pages, i;
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	loff_t size = i_size_read(inode);
-	unsigned int len, block_start;
-	struct buffer_head *bh, *page_bufs = NULL;
-	sector_t pblock = 0, cur_logical = 0;
-	struct ext4_io_submit io_submit;
+struct mpage_da_data {
+	struct inode *inode;
+	struct writeback_control *wbc;
 
-	BUG_ON(mpd->next_page <= mpd->first_page);
-	ext4_io_submit_init(&io_submit, mpd->wbc);
-	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
-	if (!io_submit.io_end)
-		return -ENOMEM;
+	pgoff_t first_page;	/* The first page to write */
+	pgoff_t next_page;	/* Current page to examine */
+	pgoff_t last_page;	/* Last page to examine */
 	/*
-	 * We need to start from the first_page to the next_page - 1
-	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->b_blocknr we would only be looking
-	 * at the currently mapped buffer_heads.
+	 * Extent to map - this can be after first_page because that can be
+	 * fully mapped. We somewhat abuse m_flags to store whether the extent
+	 * is delalloc or unwritten.
 	 */
-	index = mpd->first_page;
-	end = mpd->next_page - 1;
-
-	pagevec_init(&pvec, 0);
-	while (index <= end) {
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			int skip_page = 0;
-			struct page *page = pvec.pages[i];
-
-			index = page->index;
-			if (index > end)
-				break;
-
-			if (index == size >> PAGE_CACHE_SHIFT)
-				len = size & ~PAGE_CACHE_MASK;
-			else
-				len = PAGE_CACHE_SIZE;
-			if (map) {
-				cur_logical = index << (PAGE_CACHE_SHIFT -
-							inode->i_blkbits);
-				pblock = map->m_pblk + (cur_logical -
-							map->m_lblk);
-			}
-			index++;
-
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-
-			bh = page_bufs = page_buffers(page);
-			block_start = 0;
-			do {
-				if (map && (cur_logical >= map->m_lblk) &&
-				    (cur_logical <= (map->m_lblk +
-						     (map->m_len - 1)))) {
-					if (buffer_delay(bh)) {
-						clear_buffer_delay(bh);
-						bh->b_blocknr = pblock;
-					}
-					if (buffer_unwritten(bh) ||
-					    buffer_mapped(bh))
-						BUG_ON(bh->b_blocknr != pblock);
-					if (map->m_flags & EXT4_MAP_UNINIT)
-						set_buffer_uninit(bh);
-					clear_buffer_unwritten(bh);
-				}
-
-				/*
-				 * skip page if block allocation undone and
-				 * block is dirty
-				 */
-				if (ext4_bh_delay_or_unwritten(NULL, bh))
-					skip_page = 1;
-				bh = bh->b_this_page;
-				block_start += bh->b_size;
-				cur_logical++;
-				pblock++;
-			} while (bh != page_bufs);
-
-			if (skip_page) {
-				unlock_page(page);
-				continue;
-			}
-
-			clear_page_dirty_for_io(page);
-			err = ext4_bio_write_page(&io_submit, page, len,
-						  mpd->wbc);
-			if (!err)
-				mpd->pages_written++;
-			/*
-			 * In error case, we have to continue because
-			 * remaining pages are still locked
-			 */
-			if (ret == 0)
-				ret = err;
-		}
-		pagevec_release(&pvec);
-	}
-	ext4_io_submit(&io_submit);
-	/* Drop io_end reference we got from init */
-	ext4_put_io_end_defer(io_submit.io_end);
-	return ret;
-}
+	struct ext4_map_blocks map;
+	struct ext4_io_submit io_submit;	/* IO submission data */
+};
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+				       bool invalidate)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
1591 struct pagevec pvec; 1435 struct pagevec pvec;
1592 struct inode *inode = mpd->inode; 1436 struct inode *inode = mpd->inode;
1593 struct address_space *mapping = inode->i_mapping; 1437 struct address_space *mapping = inode->i_mapping;
1594 ext4_lblk_t start, last; 1438
1439 /* This is necessary when next_page == 0. */
1440 if (mpd->first_page >= mpd->next_page)
1441 return;
1595 1442
1596 index = mpd->first_page; 1443 index = mpd->first_page;
1597 end = mpd->next_page - 1; 1444 end = mpd->next_page - 1;
1598 1445 if (invalidate) {
1599 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1446 ext4_lblk_t start, last;
1600 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1447 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1601 ext4_es_remove_extent(inode, start, last - start + 1); 1448 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1449 ext4_es_remove_extent(inode, start, last - start + 1);
1450 }
1602 1451
1603 pagevec_init(&pvec, 0); 1452 pagevec_init(&pvec, 0);
1604 while (index <= end) { 1453 while (index <= end) {
@@ -1611,14 +1460,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1611 break; 1460 break;
1612 BUG_ON(!PageLocked(page)); 1461 BUG_ON(!PageLocked(page));
1613 BUG_ON(PageWriteback(page)); 1462 BUG_ON(PageWriteback(page));
1614 block_invalidatepage(page, 0); 1463 if (invalidate) {
1615 ClearPageUptodate(page); 1464 block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
1465 ClearPageUptodate(page);
1466 }
1616 unlock_page(page); 1467 unlock_page(page);
1617 } 1468 }
1618 index = pvec.pages[nr_pages - 1]->index + 1; 1469 index = pvec.pages[nr_pages - 1]->index + 1;
1619 pagevec_release(&pvec); 1470 pagevec_release(&pvec);
1620 } 1471 }
1621 return;
1622} 1472}
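The invalidate branch converts the page range back to logical blocks by shifting with (PAGE_CACHE_SHIFT - inode->i_blkbits). A worked example of that conversion (userspace sketch; the 4K-page/1K-block geometry and page numbers are assumed sample values):

	#include <stdio.h>

	int main(void)
	{
		int page_shift = 12, blkbits = 10;  /* 4K pages, 1K blocks */
		unsigned long index = 5, end = 6;   /* first and last page */
		unsigned long start = index << (page_shift - blkbits); /* 20 */
		unsigned long last = end << (page_shift - blkbits);    /* 24 */

		/* mirrors ext4_es_remove_extent(inode, start, last - start + 1) */
		printf("remove %lu extent-status blocks starting at %lu\n",
		       last - start + 1, start);
		return 0;
	}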
1623 1473
1624static void ext4_print_free_blocks(struct inode *inode) 1474static void ext4_print_free_blocks(struct inode *inode)
@@ -1647,215 +1497,6 @@ static void ext4_print_free_blocks(struct inode *inode)
1647 return; 1497 return;
1648} 1498}
1649 1499
1650/*
1651 * mpage_da_map_and_submit - go through given space, map them
1652 * if necessary, and then submit them for I/O
1653 *
1654 * @mpd - bh describing space
1655 *
1656 * The function skips space we know is already mapped to disk blocks.
1657 *
1658 */
1659static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1660{
1661 int err, blks, get_blocks_flags;
1662 struct ext4_map_blocks map, *mapp = NULL;
1663 sector_t next = mpd->b_blocknr;
1664 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
1665 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
1666 handle_t *handle = NULL;
1667
1668 /*
1669 * If the blocks are mapped already, or we couldn't accumulate
1670 * any blocks, then proceed immediately to the submission stage.
1671 */
1672 if ((mpd->b_size == 0) ||
1673 ((mpd->b_state & (1 << BH_Mapped)) &&
1674 !(mpd->b_state & (1 << BH_Delay)) &&
1675 !(mpd->b_state & (1 << BH_Unwritten))))
1676 goto submit_io;
1677
1678 handle = ext4_journal_current_handle();
1679 BUG_ON(!handle);
1680
1681 /*
1682 * Call ext4_map_blocks() to allocate any delayed allocation
1683 * blocks, or to convert an uninitialized extent to be
1684 * initialized (in the case where we have written into
1685 * one or more preallocated blocks).
1686 *
1687 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
1688 * indicate that we are on the delayed allocation path. This
1689 * affects functions in many different parts of the allocation
1690 * call path. This flag exists primarily because we don't
1691 * want to change *many* call functions, so ext4_map_blocks()
1692 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
1693 * inode's allocation semaphore is taken.
1694 *
1695 * If the blocks in questions were delalloc blocks, set
1696 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
1697 * variables are updated after the blocks have been allocated.
1698 */
1699 map.m_lblk = next;
1700 map.m_len = max_blocks;
1701 /*
1702 * We're in delalloc path and it is possible that we're going to
1703 * need more metadata blocks than previously reserved. However
1704 * we must not fail because we're in writeback and there is
1705 * nothing we can do about it so it might result in data loss.
1706 * So use reserved blocks to allocate metadata if possible.
1707 */
1708 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
1709 EXT4_GET_BLOCKS_METADATA_NOFAIL;
1710 if (ext4_should_dioread_nolock(mpd->inode))
1711 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1712 if (mpd->b_state & (1 << BH_Delay))
1713 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1714
1715
1716 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1717 if (blks < 0) {
1718 struct super_block *sb = mpd->inode->i_sb;
1719
1720 err = blks;
1721 /*
1722 * If get block returns EAGAIN or ENOSPC and there
1723 * appears to be free blocks we will just let
1724 * mpage_da_submit_io() unlock all of the pages.
1725 */
1726 if (err == -EAGAIN)
1727 goto submit_io;
1728
1729 if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1730 mpd->retval = err;
1731 goto submit_io;
1732 }
1733
1734 /*
1735 * get block failure will cause us to loop in
1736 * writepages, because a_ops->writepage won't be able
1737 * to make progress. The page will be redirtied by
1738 * writepage and writepages will again try to write
1739 * the same.
1740 */
1741 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
1742 ext4_msg(sb, KERN_CRIT,
1743 "delayed block allocation failed for inode %lu "
1744 "at logical offset %llu with max blocks %zd "
1745 "with error %d", mpd->inode->i_ino,
1746 (unsigned long long) next,
1747 mpd->b_size >> mpd->inode->i_blkbits, err);
1748 ext4_msg(sb, KERN_CRIT,
1749 "This should not happen!! Data will be lost");
1750 if (err == -ENOSPC)
1751 ext4_print_free_blocks(mpd->inode);
1752 }
1753 /* invalidate all the pages */
1754 ext4_da_block_invalidatepages(mpd);
1755
1756 /* Mark this page range as having been completed */
1757 mpd->io_done = 1;
1758 return;
1759 }
1760 BUG_ON(blks == 0);
1761
1762 mapp = &map;
1763 if (map.m_flags & EXT4_MAP_NEW) {
1764 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
1765 int i;
1766
1767 for (i = 0; i < map.m_len; i++)
1768 unmap_underlying_metadata(bdev, map.m_pblk + i);
1769 }
1770
1771 /*
1772 * Update on-disk size along with block allocation.
1773 */
1774 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
1775 if (disksize > i_size_read(mpd->inode))
1776 disksize = i_size_read(mpd->inode);
1777 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
1778 ext4_update_i_disksize(mpd->inode, disksize);
1779 err = ext4_mark_inode_dirty(handle, mpd->inode);
1780 if (err)
1781 ext4_error(mpd->inode->i_sb,
1782 "Failed to mark inode %lu dirty",
1783 mpd->inode->i_ino);
1784 }
1785
1786submit_io:
1787 mpage_da_submit_io(mpd, mapp);
1788 mpd->io_done = 1;
1789}
1790
1791#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1792 (1 << BH_Delay) | (1 << BH_Unwritten))
1793
1794/*
1795 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1796 *
1797 * @mpd->lbh - extent of blocks
1798 * @logical - logical number of the block in the file
1799 * @b_state - b_state of the buffer head added
1800 *
1801 * the function is used to collect contig. blocks in same state
1802 */
1803static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
1804 unsigned long b_state)
1805{
1806 sector_t next;
1807 int blkbits = mpd->inode->i_blkbits;
1808 int nrblocks = mpd->b_size >> blkbits;
1809
1810 /*
1811 * XXX Don't go larger than mballoc is willing to allocate
1812 * This is a stopgap solution. We eventually need to fold
1813 * mpage_da_submit_io() into this function and then call
1814 * ext4_map_blocks() multiple times in a loop
1815 */
1816 if (nrblocks >= (8*1024*1024 >> blkbits))
1817 goto flush_it;
1818
1819 /* check if the reserved journal credits might overflow */
1820 if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
1821 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1822 /*
1823 * With non-extent format we are limited by the journal
1824 * credit available. Total credit needed to insert
1825 * nrblocks contiguous blocks is dependent on the
1826 * nrblocks. So limit nrblocks.
1827 */
1828 goto flush_it;
1829 }
1830 }
1831 /*
1832 * First block in the extent
1833 */
1834 if (mpd->b_size == 0) {
1835 mpd->b_blocknr = logical;
1836 mpd->b_size = 1 << blkbits;
1837 mpd->b_state = b_state & BH_FLAGS;
1838 return;
1839 }
1840
1841 next = mpd->b_blocknr + nrblocks;
1842 /*
1843 * Can we merge the block to our big extent?
1844 */
1845 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
1846 mpd->b_size += 1 << blkbits;
1847 return;
1848 }
1849
1850flush_it:
1851 /*
1852 * We couldn't merge the block to our extent, so we
1853 * need to flush current extent and start new one
1854 */
1855 mpage_da_map_and_submit(mpd);
1856 return;
1857}
1858
1859static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 1500static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1860{ 1501{
1861 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 1502 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1888,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1888 "logical block %lu\n", inode->i_ino, map->m_len, 1529 "logical block %lu\n", inode->i_ino, map->m_len,
1889 (unsigned long) map->m_lblk); 1530 (unsigned long) map->m_lblk);
1890 1531
1532 ext4_es_lru_add(inode);
1533
1891 /* Lookup extent status tree firstly */ 1534 /* Lookup extent status tree firstly */
1892 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1535 if (ext4_es_lookup_extent(inode, iblock, &es)) {
1893 1536
@@ -2161,7 +1804,7 @@ out:
2161 * lock so we have to do some magic. 1804 * lock so we have to do some magic.
2162 * 1805 *
2163 * This function can get called via... 1806 * This function can get called via...
2164 * - ext4_da_writepages after taking page lock (have journal handle) 1807 * - ext4_writepages after taking page lock (have journal handle)
2165 * - journal_submit_inode_data_buffers (no journal handle) 1808 * - journal_submit_inode_data_buffers (no journal handle)
2166 * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1809 * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
2167 * - grab_page_cache when doing write_begin (have journal handle) 1810 * - grab_page_cache when doing write_begin (have journal handle)
@@ -2243,6 +1886,7 @@ static int ext4_writepage(struct page *page,
2243 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); 1886 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
2244 if (!io_submit.io_end) { 1887 if (!io_submit.io_end) {
2245 redirty_page_for_writepage(wbc, page); 1888 redirty_page_for_writepage(wbc, page);
1889 unlock_page(page);
2246 return -ENOMEM; 1890 return -ENOMEM;
2247 } 1891 }
2248 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 1892 ret = ext4_bio_write_page(&io_submit, page, len, wbc);
@@ -2252,70 +1896,391 @@ static int ext4_writepage(struct page *page,
2252 return ret; 1896 return ret;
2253} 1897}
2254 1898
1899#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
1900
2255/* 1901/*
2256 * This is called via ext4_da_writepages() to 1902 * mballoc gives us at most this number of blocks...
2257 * calculate the total number of credits to reserve to fit 1903 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
2258 * a single extent allocation into a single transaction, 1904 * The rest of mballoc seems to handle chunks up to full group size.
2259 * ext4_da_writepages() will loop calling this before
2260 * the block allocation.
2261 */ 1905 */
1906#define MAX_WRITEPAGES_EXTENT_LEN 2048
2262 1907
2263static int ext4_da_writepages_trans_blocks(struct inode *inode) 1908/*
1909 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
1910 *
1911 * @mpd - extent of blocks
1912 * @lblk - logical number of the block in the file
1913 * @b_state - b_state of the buffer head added
1914 *
1915 * The function is used to collect contiguous blocks in the same state.
1916 */
1917static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1918 unsigned long b_state)
1919{
1920 struct ext4_map_blocks *map = &mpd->map;
1921
1922 /* Don't go larger than mballoc is willing to allocate */
1923 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1924 return 0;
1925
1926 /* First block in the extent? */
1927 if (map->m_len == 0) {
1928 map->m_lblk = lblk;
1929 map->m_len = 1;
1930 map->m_flags = b_state & BH_FLAGS;
1931 return 1;
1932 }
1933
1934 /* Can we merge the block to our big extent? */
1935 if (lblk == map->m_lblk + map->m_len &&
1936 (b_state & BH_FLAGS) == map->m_flags) {
1937 map->m_len++;
1938 return 1;
1939 }
1940 return 0;
1941}
1942
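The merge rule above is purely local: a block joins the accumulated extent only when it is logically contiguous and its delay/unwritten state matches. A standalone sketch of the same logic (the struct, function name, and sample block numbers are invented for illustration):

	#include <stdio.h>

	struct map { unsigned int lblk, len, flags; };

	static int try_merge(struct map *m, unsigned int lblk, unsigned int flags)
	{
		if (m->len == 0) {              /* first block opens the extent */
			m->lblk = lblk;
			m->len = 1;
			m->flags = flags;
			return 1;
		}
		/* merge only if contiguous and in the same state */
		if (lblk == m->lblk + m->len && flags == m->flags) {
			m->len++;
			return 1;
		}
		return 0;   /* caller must map and submit the extent first */
	}

	int main(void)
	{
		struct map m = { 0, 0, 0 };
		int a = try_merge(&m, 10, 1);  /* 1: opens extent at block 10 */
		int b = try_merge(&m, 11, 1);  /* 1: contiguous, same state */
		int c = try_merge(&m, 13, 1);  /* 0: gap at block 12 */

		printf("%d %d %d\n", a, b, c);
		return 0;
	}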
1943static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
1944 struct buffer_head *head,
1945 struct buffer_head *bh,
1946 ext4_lblk_t lblk)
1947{
1948 struct inode *inode = mpd->inode;
1949 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
1950 >> inode->i_blkbits;
1951
1952 do {
1953 BUG_ON(buffer_locked(bh));
1954
1955 if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1956 (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
1957 lblk >= blocks) {
1958 /* Found extent to map? */
1959 if (mpd->map.m_len)
1960 return false;
1961 if (lblk >= blocks)
1962 return true;
1963 continue;
1964 }
1965 if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
1966 return false;
1967 } while (lblk++, (bh = bh->b_this_page) != head);
1968 return true;
1969}
1970
1971static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
1972{
1973 int len;
1974 loff_t size = i_size_read(mpd->inode);
1975 int err;
1976
1977 BUG_ON(page->index != mpd->first_page);
1978 if (page->index == size >> PAGE_CACHE_SHIFT)
1979 len = size & ~PAGE_CACHE_MASK;
1980 else
1981 len = PAGE_CACHE_SIZE;
1982 clear_page_dirty_for_io(page);
1983 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
1984 if (!err)
1985 mpd->wbc->nr_to_write--;
1986 mpd->first_page++;
1987
1988 return err;
1989}
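Only the page containing i_size is trimmed; every other page is written in full. A worked example of the length computation in mpage_submit_page(), assuming 4096-byte pages and a 10000-byte file (sample values):

	#include <stdio.h>

	#define PAGE_CACHE_SHIFT 12                     /* assumed 4K pages */
	#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
	#define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

	int main(void)
	{
		long long size = 10000;     /* sample i_size */
		unsigned long index = 2;    /* page covering bytes 8192..12287 */
		unsigned long len;

		if (index == (unsigned long)(size >> PAGE_CACHE_SHIFT))
			len = size & ~PAGE_CACHE_MASK;  /* 1808 bytes inside i_size */
		else
			len = PAGE_CACHE_SIZE;
		printf("write %lu bytes of page %lu\n", len, index);
		return 0;
	}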
1990
1991/*
1992 * mpage_map_buffers - update buffers corresponding to changed extent and
1993 * submit fully mapped pages for IO
1994 *
1995 * @mpd - description of extent to map, on return next extent to map
1996 *
1997 * Scan buffers corresponding to changed extent (we expect corresponding pages
1998 * to be already locked) and update buffer state according to new extent state.
1999 * We map delalloc buffers to their physical location, clear unwritten bits,
2000 * and mark buffers as uninit when we perform writes to uninitialized extents
2001 * and do extent conversion after IO is finished. If the last page is not fully
2002 * mapped, we update @map to the next extent in the last page that needs
2003 * mapping. Otherwise we submit the page for IO.
2004 */
2005static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2006{
2007 struct pagevec pvec;
2008 int nr_pages, i;
2009 struct inode *inode = mpd->inode;
2010 struct buffer_head *head, *bh;
2011 int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
2012 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
2013 >> inode->i_blkbits;
2014 pgoff_t start, end;
2015 ext4_lblk_t lblk;
2016 sector_t pblock;
2017 int err;
2018
2019 start = mpd->map.m_lblk >> bpp_bits;
2020 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2021 lblk = start << bpp_bits;
2022 pblock = mpd->map.m_pblk;
2023
2024 pagevec_init(&pvec, 0);
2025 while (start <= end) {
2026 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
2027 PAGEVEC_SIZE);
2028 if (nr_pages == 0)
2029 break;
2030 for (i = 0; i < nr_pages; i++) {
2031 struct page *page = pvec.pages[i];
2032
2033 if (page->index > end)
2034 break;
2035 /* Up to 'end' pages must be contiguous */
2036 BUG_ON(page->index != start);
2037 bh = head = page_buffers(page);
2038 do {
2039 if (lblk < mpd->map.m_lblk)
2040 continue;
2041 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2042 /*
2043 * Buffer after end of mapped extent.
2044 * Find next buffer in the page to map.
2045 */
2046 mpd->map.m_len = 0;
2047 mpd->map.m_flags = 0;
2048 add_page_bufs_to_extent(mpd, head, bh,
2049 lblk);
2050 pagevec_release(&pvec);
2051 return 0;
2052 }
2053 if (buffer_delay(bh)) {
2054 clear_buffer_delay(bh);
2055 bh->b_blocknr = pblock++;
2056 }
2057 clear_buffer_unwritten(bh);
2058 } while (++lblk < blocks &&
2059 (bh = bh->b_this_page) != head);
2060
2061 /*
2062 * FIXME: This is going to break if dioread_nolock
2063 * supports blocksize < pagesize as we will try to
2064 * convert potentially unmapped parts of inode.
2065 */
2066 mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
2067 /* Page fully mapped - let IO run! */
2068 err = mpage_submit_page(mpd, page);
2069 if (err < 0) {
2070 pagevec_release(&pvec);
2071 return err;
2072 }
2073 start++;
2074 }
2075 pagevec_release(&pvec);
2076 }
2077 /* Extent fully mapped and matches with page boundary. We are done. */
2078 mpd->map.m_len = 0;
2079 mpd->map.m_flags = 0;
2080 return 0;
2081}
2082
2083static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2264{ 2084{
2265 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2085 struct inode *inode = mpd->inode;
2086 struct ext4_map_blocks *map = &mpd->map;
2087 int get_blocks_flags;
2088 int err;
2266 2089
2090 trace_ext4_da_write_pages_extent(inode, map);
2267 /* 2091 /*
2268 * With non-extent format the journal credit needed to 2092 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
2269 * insert nrblocks contiguous block is dependent on 2093 * to convert an uninitialized extent to be initialized (in the case
2270 * number of contiguous block. So we will limit 2094 * where we have written into one or more preallocated blocks). It is
2271 * number of contiguous block to a sane value 2095 * possible that we're going to need more metadata blocks than
2096 * previously reserved. However we must not fail because we're in
2097 * writeback and there is nothing we can do about it so it might result
2098 * in data loss. So use reserved blocks to allocate metadata if
2099 * possible.
2100 *
2101 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
2102 * in question are delalloc blocks. This affects functions in many
2103 * different parts of the allocation call path. This flag exists
2104 * primarily because we don't want to change *many* call functions, so
2105 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
2106 * once the inode's allocation semaphore is taken.
2272 */ 2107 */
2273 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && 2108 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2274 (max_blocks > EXT4_MAX_TRANS_DATA)) 2109 EXT4_GET_BLOCKS_METADATA_NOFAIL;
2275 max_blocks = EXT4_MAX_TRANS_DATA; 2110 if (ext4_should_dioread_nolock(inode))
2111 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2112 if (map->m_flags & (1 << BH_Delay))
2113 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2114
2115 err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2116 if (err < 0)
2117 return err;
2118 if (map->m_flags & EXT4_MAP_UNINIT) {
2119 if (!mpd->io_submit.io_end->handle &&
2120 ext4_handle_valid(handle)) {
2121 mpd->io_submit.io_end->handle = handle->h_rsv_handle;
2122 handle->h_rsv_handle = NULL;
2123 }
2124 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
2125 }
2276 2126
2277 return ext4_chunk_trans_blocks(inode, max_blocks); 2127 BUG_ON(map->m_len == 0);
2128 if (map->m_flags & EXT4_MAP_NEW) {
2129 struct block_device *bdev = inode->i_sb->s_bdev;
2130 int i;
2131
2132 for (i = 0; i < map->m_len; i++)
2133 unmap_underlying_metadata(bdev, map->m_pblk + i);
2134 }
2135 return 0;
2278} 2136}
2279 2137
2280/* 2138/*
2281 * write_cache_pages_da - walk the list of dirty pages of the given 2139 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
2282 * address space and accumulate pages that need writing, and call 2140 * mpd->len and submit pages underlying it for IO
2283 * mpage_da_map_and_submit to map a single contiguous memory region 2141 *
2284 * and then write them. 2142 * @handle - handle for journal operations
2143 * @mpd - extent to map
2144 *
2145 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
2146 * delayed, blocks are allocated, if it is unwritten, we may need to convert
2147 * them to initialized or split the described range from larger unwritten
2148 * extent. Note that we need not map all the described range since allocation
2149 * can return fewer blocks or the range is covered by more unwritten extents. We
2150 * cannot map more because we are limited by reserved transaction credits. On
2151 * the other hand we always make sure that the last touched page is fully
2152 * mapped so that it can be written out (and thus forward progress is
2153 * guaranteed). After mapping we submit all mapped pages for IO.
2285 */ 2154 */
2286static int write_cache_pages_da(handle_t *handle, 2155static int mpage_map_and_submit_extent(handle_t *handle,
2287 struct address_space *mapping, 2156 struct mpage_da_data *mpd,
2288 struct writeback_control *wbc, 2157 bool *give_up_on_write)
2289 struct mpage_da_data *mpd,
2290 pgoff_t *done_index)
2291{ 2158{
2292 struct buffer_head *bh, *head; 2159 struct inode *inode = mpd->inode;
2293 struct inode *inode = mapping->host; 2160 struct ext4_map_blocks *map = &mpd->map;
2294 struct pagevec pvec; 2161 int err;
2295 unsigned int nr_pages; 2162 loff_t disksize;
2296 sector_t logical;
2297 pgoff_t index, end;
2298 long nr_to_write = wbc->nr_to_write;
2299 int i, tag, ret = 0;
2300
2301 memset(mpd, 0, sizeof(struct mpage_da_data));
2302 mpd->wbc = wbc;
2303 mpd->inode = inode;
2304 pagevec_init(&pvec, 0);
2305 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2306 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2307 2163
2308 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2164 mpd->io_submit.io_end->offset =
2165 ((loff_t)map->m_lblk) << inode->i_blkbits;
2166 while (map->m_len) {
2167 err = mpage_map_one_extent(handle, mpd);
2168 if (err < 0) {
2169 struct super_block *sb = inode->i_sb;
2170
2171 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
2172 goto invalidate_dirty_pages;
2173 /*
2174 * Let the upper layers retry transient errors.
2175 * In the case of ENOSPC, if ext4_count_free_clusters()
2176 * is non-zero, a commit should free up blocks.
2177 */
2178 if ((err == -ENOMEM) ||
2179 (err == -ENOSPC && ext4_count_free_clusters(sb)))
2180 return err;
2181 ext4_msg(sb, KERN_CRIT,
2182 "Delayed block allocation failed for "
2183 "inode %lu at logical offset %llu with"
2184 " max blocks %u with error %d",
2185 inode->i_ino,
2186 (unsigned long long)map->m_lblk,
2187 (unsigned)map->m_len, -err);
2188 ext4_msg(sb, KERN_CRIT,
2189 "This should not happen!! Data will "
2190 "be lost\n");
2191 if (err == -ENOSPC)
2192 ext4_print_free_blocks(inode);
2193 invalidate_dirty_pages:
2194 *give_up_on_write = true;
2195 return err;
2196 }
2197 /*
2198 * Update buffer state, submit mapped pages, and get us new
2199 * extent to map
2200 */
2201 err = mpage_map_and_submit_buffers(mpd);
2202 if (err < 0)
2203 return err;
2204 }
2205
2206 /* Update on-disk size after IO is submitted */
2207 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
2208 if (disksize > i_size_read(inode))
2209 disksize = i_size_read(inode);
2210 if (disksize > EXT4_I(inode)->i_disksize) {
2211 int err2;
2212
2213 ext4_update_i_disksize(inode, disksize);
2214 err2 = ext4_mark_inode_dirty(handle, inode);
2215 if (err2)
2216 ext4_error(inode->i_sb,
2217 "Failed to mark inode %lu dirty",
2218 inode->i_ino);
2219 if (!err)
2220 err = err2;
2221 }
2222 return err;
2223}
2224
2225/*
2226 * Calculate the total number of credits to reserve for one writepages
2227 * iteration. This is called from ext4_writepages(). We map an extent of
2228 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2229 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2230 * bpp - 1 blocks in bpp different extents.
2231 */
2232static int ext4_da_writepages_trans_blocks(struct inode *inode)
2233{
2234 int bpp = ext4_journal_blocks_per_page(inode);
2235
2236 return ext4_meta_trans_blocks(inode,
2237 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
2238}
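A quick arithmetic check of the comment above (userspace sketch; the 4K-page/1K-block geometry is an assumed example): with bpp = 4, one iteration reserves credits for 2048 + 4 - 1 = 2051 blocks spread over at most 4 extents.

	#include <stdio.h>

	int main(void)
	{
		int max_extent_len = 2048;  /* MAX_WRITEPAGES_EXTENT_LEN */
		int bpp = 4096 >> 10;       /* blocks per page: 4K page, 1K block */

		printf("%d blocks in up to %d extents\n",
		       max_extent_len + bpp - 1, bpp);
		return 0;
	}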
2239
2240/*
2241 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2242 * and underlying extent to map
2243 *
2244 * @mpd - where to look for pages
2245 *
2246 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
2247 * IO immediately. When we find a page which isn't mapped we start accumulating
2248 * extent of buffers underlying these pages that needs mapping (formed by
2249 * either delayed or unwritten buffers). We also lock the pages containing
2250 * these buffers. The extent found is returned in @mpd structure (starting at
2251 * mpd->lblk with length mpd->len blocks).
2252 *
2253 * Note that this function can attach bios to one io_end structure which are
2254 * neither logically nor physically contiguous. Although it may seem an
2255 * unnecessary complication, it is actually inevitable in blocksize < pagesize
2256 * case as we need to track IO to all buffers underlying a page in one io_end.
2257 */
2258static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2259{
2260 struct address_space *mapping = mpd->inode->i_mapping;
2261 struct pagevec pvec;
2262 unsigned int nr_pages;
2263 pgoff_t index = mpd->first_page;
2264 pgoff_t end = mpd->last_page;
2265 int tag;
2266 int i, err = 0;
2267 int blkbits = mpd->inode->i_blkbits;
2268 ext4_lblk_t lblk;
2269 struct buffer_head *head;
2270
2271 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2309 tag = PAGECACHE_TAG_TOWRITE; 2272 tag = PAGECACHE_TAG_TOWRITE;
2310 else 2273 else
2311 tag = PAGECACHE_TAG_DIRTY; 2274 tag = PAGECACHE_TAG_DIRTY;
2312 2275
2313 *done_index = index; 2276 pagevec_init(&pvec, 0);
2277 mpd->map.m_len = 0;
2278 mpd->next_page = index;
2314 while (index <= end) { 2279 while (index <= end) {
2315 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2280 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2316 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2281 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2317 if (nr_pages == 0) 2282 if (nr_pages == 0)
2318 return 0; 2283 goto out;
2319 2284
2320 for (i = 0; i < nr_pages; i++) { 2285 for (i = 0; i < nr_pages; i++) {
2321 struct page *page = pvec.pages[i]; 2286 struct page *page = pvec.pages[i];
@@ -2330,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle,
2330 if (page->index > end) 2295 if (page->index > end)
2331 goto out; 2296 goto out;
2332 2297
2333 *done_index = page->index + 1; 2298 /* If we can't merge this page, we are done. */
2334 2299 if (mpd->map.m_len > 0 && mpd->next_page != page->index)
2335 /* 2300 goto out;
2336 * If we can't merge this page, and we have
2337 * accumulated an contiguous region, write it
2338 */
2339 if ((mpd->next_page != page->index) &&
2340 (mpd->next_page != mpd->first_page)) {
2341 mpage_da_map_and_submit(mpd);
2342 goto ret_extent_tail;
2343 }
2344 2301
2345 lock_page(page); 2302 lock_page(page);
2346
2347 /* 2303 /*
2348 * If the page is no longer dirty, or its 2304 * If the page is no longer dirty, or its mapping no
2349 * mapping no longer corresponds to inode we 2305 * longer corresponds to inode we are writing (which
2350 * are writing (which means it has been 2306 * means it has been truncated or invalidated), or the
2351 * truncated or invalidated), or the page is 2307 * page is already under writeback and we are not doing
2352 * already under writeback and we are not 2308 * a data integrity writeback, skip the page
2353 * doing a data integrity writeback, skip the page
2354 */ 2309 */
2355 if (!PageDirty(page) || 2310 if (!PageDirty(page) ||
2356 (PageWriteback(page) && 2311 (PageWriteback(page) &&
2357 (wbc->sync_mode == WB_SYNC_NONE)) || 2312 (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2358 unlikely(page->mapping != mapping)) { 2313 unlikely(page->mapping != mapping)) {
2359 unlock_page(page); 2314 unlock_page(page);
2360 continue; 2315 continue;
@@ -2363,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle,
2363 wait_on_page_writeback(page); 2318 wait_on_page_writeback(page);
2364 BUG_ON(PageWriteback(page)); 2319 BUG_ON(PageWriteback(page));
2365 2320
2366 /* 2321 if (mpd->map.m_len == 0)
2367 * If we have inline data and arrive here, it means that
2368 * we will soon create the block for the 1st page, so
2369 * we'd better clear the inline data here.
2370 */
2371 if (ext4_has_inline_data(inode)) {
2372 BUG_ON(ext4_test_inode_state(inode,
2373 EXT4_STATE_MAY_INLINE_DATA));
2374 ext4_destroy_inline_data(handle, inode);
2375 }
2376
2377 if (mpd->next_page != page->index)
2378 mpd->first_page = page->index; 2322 mpd->first_page = page->index;
2379 mpd->next_page = page->index + 1; 2323 mpd->next_page = page->index + 1;
2380 logical = (sector_t) page->index <<
2381 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2382
2383 /* Add all dirty buffers to mpd */ 2324 /* Add all dirty buffers to mpd */
2325 lblk = ((ext4_lblk_t)page->index) <<
2326 (PAGE_CACHE_SHIFT - blkbits);
2384 head = page_buffers(page); 2327 head = page_buffers(page);
2385 bh = head; 2328 if (!add_page_bufs_to_extent(mpd, head, head, lblk))
2386 do { 2329 goto out;
2387 BUG_ON(buffer_locked(bh)); 2330 /* So far everything mapped? Submit the page for IO. */
2388 /* 2331 if (mpd->map.m_len == 0) {
2389 * We need to try to allocate unmapped blocks 2332 err = mpage_submit_page(mpd, page);
2390 * in the same page. Otherwise we won't make 2333 if (err < 0)
2391 * progress with the page in ext4_writepage
2392 */
2393 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2394 mpage_add_bh_to_extent(mpd, logical,
2395 bh->b_state);
2396 if (mpd->io_done)
2397 goto ret_extent_tail;
2398 } else if (buffer_dirty(bh) &&
2399 buffer_mapped(bh)) {
2400 /*
2401 * mapped dirty buffer. We need to
2402 * update the b_state because we look
2403 * at b_state in mpage_da_map_blocks.
2404 * We don't update b_size because if we
2405 * find an unmapped buffer_head later
2406 * we need to use the b_state flag of
2407 * that buffer_head.
2408 */
2409 if (mpd->b_size == 0)
2410 mpd->b_state =
2411 bh->b_state & BH_FLAGS;
2412 }
2413 logical++;
2414 } while ((bh = bh->b_this_page) != head);
2415
2416 if (nr_to_write > 0) {
2417 nr_to_write--;
2418 if (nr_to_write == 0 &&
2419 wbc->sync_mode == WB_SYNC_NONE)
2420 /*
2421 * We stop writing back only if we are
2422 * not doing integrity sync. In case of
2423 * integrity sync we have to keep going
2424 * because someone may be concurrently
2425 * dirtying pages, and we might have
2426 * synced a lot of newly appeared dirty
2427 * pages, but have not synced all of the
2428 * old dirty pages.
2429 */
2430 goto out; 2334 goto out;
2431 } 2335 }
2336
2337 /*
2338 * Accumulated enough dirty pages? This doesn't apply
2339 * to WB_SYNC_ALL mode. For integrity sync we have to
2340 * keep going because someone may be concurrently
2341 * dirtying pages, and we might have synced a lot of
2342 * newly appeared dirty pages, but have not synced all
2343 * of the old dirty pages.
2344 */
2345 if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
2346 mpd->next_page - mpd->first_page >=
2347 mpd->wbc->nr_to_write)
2348 goto out;
2432 } 2349 }
2433 pagevec_release(&pvec); 2350 pagevec_release(&pvec);
2434 cond_resched(); 2351 cond_resched();
2435 } 2352 }
2436 return 0; 2353 return 0;
2437ret_extent_tail:
2438 ret = MPAGE_DA_EXTENT_TAIL;
2439out: 2354out:
2440 pagevec_release(&pvec); 2355 pagevec_release(&pvec);
2441 cond_resched(); 2356 return err;
2442 return ret;
2443} 2357}
2444 2358
2359static int __writepage(struct page *page, struct writeback_control *wbc,
2360 void *data)
2361{
2362 struct address_space *mapping = data;
2363 int ret = ext4_writepage(page, wbc);
2364 mapping_set_error(mapping, ret);
2365 return ret;
2366}
2445 2367
2446static int ext4_da_writepages(struct address_space *mapping, 2368static int ext4_writepages(struct address_space *mapping,
2447 struct writeback_control *wbc) 2369 struct writeback_control *wbc)
2448{ 2370{
2449 pgoff_t index; 2371 pgoff_t writeback_index = 0;
2372 long nr_to_write = wbc->nr_to_write;
2450 int range_whole = 0; 2373 int range_whole = 0;
2374 int cycled = 1;
2451 handle_t *handle = NULL; 2375 handle_t *handle = NULL;
2452 struct mpage_da_data mpd; 2376 struct mpage_da_data mpd;
2453 struct inode *inode = mapping->host; 2377 struct inode *inode = mapping->host;
2454 int pages_written = 0; 2378 int needed_blocks, rsv_blocks = 0, ret = 0;
2455 unsigned int max_pages;
2456 int range_cyclic, cycled = 1, io_done = 0;
2457 int needed_blocks, ret = 0;
2458 long desired_nr_to_write, nr_to_writebump = 0;
2459 loff_t range_start = wbc->range_start;
2460 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2379 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2461 pgoff_t done_index = 0; 2380 bool done;
2462 pgoff_t end;
2463 struct blk_plug plug; 2381 struct blk_plug plug;
2382 bool give_up_on_write = false;
2464 2383
2465 trace_ext4_da_writepages(inode, wbc); 2384 trace_ext4_writepages(inode, wbc);
2466 2385
2467 /* 2386 /*
2468 * No pages to write? This is mainly a kludge to avoid starting 2387 * No pages to write? This is mainly a kludge to avoid starting
@@ -2472,164 +2391,165 @@ static int ext4_da_writepages(struct address_space *mapping,
2472 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2391 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2473 return 0; 2392 return 0;
2474 2393
2394 if (ext4_should_journal_data(inode)) {
2395 struct blk_plug plug;
2396 int ret;
2397
2398 blk_start_plug(&plug);
2399 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
2400 blk_finish_plug(&plug);
2401 return ret;
2402 }
2403
2475 /* 2404 /*
2476 * If the filesystem has aborted, it is read-only, so return 2405 * If the filesystem has aborted, it is read-only, so return
2477 * right away instead of dumping stack traces later on that 2406 * right away instead of dumping stack traces later on that
2478 * will obscure the real source of the problem. We test 2407 * will obscure the real source of the problem. We test
2479 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2408 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2480 * the latter could be true if the filesystem is mounted 2409 * the latter could be true if the filesystem is mounted
2481 * read-only, and in that case, ext4_da_writepages should 2410 * read-only, and in that case, ext4_writepages should
2482 * *never* be called, so if that ever happens, we would want 2411 * *never* be called, so if that ever happens, we would want
2483 * the stack trace. 2412 * the stack trace.
2484 */ 2413 */
2485 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2414 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2486 return -EROFS; 2415 return -EROFS;
2487 2416
2417 if (ext4_should_dioread_nolock(inode)) {
2418 /*
2419 * We may need to convert up to one extent per block in
2420 * the page and we may dirty the inode.
2421 */
2422 rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
2423 }
2424
2425 /*
2426 * If we have inline data and arrive here, it means that
2427 * we will soon create the block for the 1st page, so
2428 * we'd better clear the inline data here.
2429 */
2430 if (ext4_has_inline_data(inode)) {
2431 /* Just inode will be modified... */
2432 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2433 if (IS_ERR(handle)) {
2434 ret = PTR_ERR(handle);
2435 goto out_writepages;
2436 }
2437 BUG_ON(ext4_test_inode_state(inode,
2438 EXT4_STATE_MAY_INLINE_DATA));
2439 ext4_destroy_inline_data(handle, inode);
2440 ext4_journal_stop(handle);
2441 }
2442
2488 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2443 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2489 range_whole = 1; 2444 range_whole = 1;
2490 2445
2491 range_cyclic = wbc->range_cyclic;
2492 if (wbc->range_cyclic) { 2446 if (wbc->range_cyclic) {
2493 index = mapping->writeback_index; 2447 writeback_index = mapping->writeback_index;
2494 if (index) 2448 if (writeback_index)
2495 cycled = 0; 2449 cycled = 0;
2496 wbc->range_start = index << PAGE_CACHE_SHIFT; 2450 mpd.first_page = writeback_index;
2497 wbc->range_end = LLONG_MAX; 2451 mpd.last_page = -1;
2498 wbc->range_cyclic = 0;
2499 end = -1;
2500 } else { 2452 } else {
2501 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2453 mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
2502 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2454 mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
2503 }
2504
2505 /*
2506 * This works around two forms of stupidity. The first is in
2507 * the writeback code, which caps the maximum number of pages
2508 * written to be 1024 pages. This is wrong on multiple
2509 * levels; different architectures have a different page size,
2510 * which changes the maximum amount of data which gets
2511 * written. Secondly, 4 megabytes is way too small. XFS
2512 * forces this value to be 16 megabytes by multiplying
2513 * nr_to_write parameter by four, and then relies on its
2514 * allocator to allocate larger extents to make them
2515 * contiguous. Unfortunately this brings us to the second
2516 * stupidity, which is that ext4's mballoc code only allocates
2517 * at most 2048 blocks. So we force contiguous writes up to
2518 * the number of dirty blocks in the inode, or
2519 * sbi->max_writeback_mb_bump whichever is smaller.
2520 */
2521 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2522 if (!range_cyclic && range_whole) {
2523 if (wbc->nr_to_write == LONG_MAX)
2524 desired_nr_to_write = wbc->nr_to_write;
2525 else
2526 desired_nr_to_write = wbc->nr_to_write * 8;
2527 } else
2528 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2529 max_pages);
2530 if (desired_nr_to_write > max_pages)
2531 desired_nr_to_write = max_pages;
2532
2533 if (wbc->nr_to_write < desired_nr_to_write) {
2534 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2535 wbc->nr_to_write = desired_nr_to_write;
2536 } 2455 }
2537 2456
2457 mpd.inode = inode;
2458 mpd.wbc = wbc;
2459 ext4_io_submit_init(&mpd.io_submit, wbc);
2538retry: 2460retry:
2539 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2461 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2540 tag_pages_for_writeback(mapping, index, end); 2462 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
2541 2463 done = false;
2542 blk_start_plug(&plug); 2464 blk_start_plug(&plug);
2543 while (!ret && wbc->nr_to_write > 0) { 2465 while (!done && mpd.first_page <= mpd.last_page) {
2466 /* For each extent of pages we use new io_end */
2467 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2468 if (!mpd.io_submit.io_end) {
2469 ret = -ENOMEM;
2470 break;
2471 }
2544 2472
2545 /* 2473 /*
2546 * we insert one extent at a time. So we need 2474 * We have two constraints: We find one extent to map and we
2547 * credit needed for single extent allocation. 2475 * must always write out whole page (makes a difference when
2548 * journalled mode is currently not supported 2476 * blocksize < pagesize) so that we don't block on IO when we
2549 * by delalloc 2477 * try to write out the rest of the page. Journalled mode is
2478 * not supported by delalloc.
2550 */ 2479 */
2551 BUG_ON(ext4_should_journal_data(inode)); 2480 BUG_ON(ext4_should_journal_data(inode));
2552 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2481 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2553 2482
2554 /* start a new transaction*/ 2483 /* start a new transaction */
2555 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2484 handle = ext4_journal_start_with_reserve(inode,
2556 needed_blocks); 2485 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
2557 if (IS_ERR(handle)) { 2486 if (IS_ERR(handle)) {
2558 ret = PTR_ERR(handle); 2487 ret = PTR_ERR(handle);
2559 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2488 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2560 "%ld pages, ino %lu; err %d", __func__, 2489 "%ld pages, ino %lu; err %d", __func__,
2561 wbc->nr_to_write, inode->i_ino, ret); 2490 wbc->nr_to_write, inode->i_ino, ret);
2562 blk_finish_plug(&plug); 2491 /* Release allocated io_end */
2563 goto out_writepages; 2492 ext4_put_io_end(mpd.io_submit.io_end);
2493 break;
2564 } 2494 }
2565 2495
2566 /* 2496 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
2567 * Now call write_cache_pages_da() to find the next 2497 ret = mpage_prepare_extent_to_map(&mpd);
2568 * contiguous region of logical blocks that need 2498 if (!ret) {
2569 * blocks to be allocated by ext4 and submit them. 2499 if (mpd.map.m_len)
2570 */ 2500 ret = mpage_map_and_submit_extent(handle, &mpd,
2571 ret = write_cache_pages_da(handle, mapping, 2501 &give_up_on_write);
2572 wbc, &mpd, &done_index); 2502 else {
2573 /* 2503 /*
2574 * If we have a contiguous extent of pages and we 2504 * We scanned the whole range (or exhausted
2575 * haven't done the I/O yet, map the blocks and submit 2505 * nr_to_write), submitted what was mapped and
2576 * them for I/O. 2506 * didn't find anything needing mapping. We are
2577 */ 2507 * done.
2578 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2508 */
2579 mpage_da_map_and_submit(&mpd); 2509 done = true;
2580 ret = MPAGE_DA_EXTENT_TAIL; 2510 }
2581 } 2511 }
2582 trace_ext4_da_write_pages(inode, &mpd);
2583 wbc->nr_to_write -= mpd.pages_written;
2584
2585 ext4_journal_stop(handle); 2512 ext4_journal_stop(handle);
2586 2513 /* Submit prepared bio */
2587 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2514 ext4_io_submit(&mpd.io_submit);
2588 /* commit the transaction which would 2515 /* Unlock pages we didn't use */
2516 mpage_release_unused_pages(&mpd, give_up_on_write);
2517 /* Drop our io_end reference we got from init */
2518 ext4_put_io_end(mpd.io_submit.io_end);
2519
2520 if (ret == -ENOSPC && sbi->s_journal) {
2521 /*
2522 * Commit the transaction which would
2589 * free blocks released in the transaction 2523 * free blocks released in the transaction
2590 * and try again 2524 * and try again
2591 */ 2525 */
2592 jbd2_journal_force_commit_nested(sbi->s_journal); 2526 jbd2_journal_force_commit_nested(sbi->s_journal);
2593 ret = 0; 2527 ret = 0;
2594 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2528 continue;
2595 /* 2529 }
2596 * Got one extent now try with rest of the pages. 2530 /* Fatal error - ENOMEM, EIO... */
2597 * If mpd.retval is set -EIO, journal is aborted. 2531 if (ret)
2598 * So we don't need to write any more.
2599 */
2600 pages_written += mpd.pages_written;
2601 ret = mpd.retval;
2602 io_done = 1;
2603 } else if (wbc->nr_to_write)
2604 /*
2605 * There is no more writeout needed
2606 * or we requested a nonblocking writeout
2607 * and we found the device congested
2608 */
2609 break; 2532 break;
2610 } 2533 }
2611 blk_finish_plug(&plug); 2534 blk_finish_plug(&plug);
2612 if (!io_done && !cycled) { 2535 if (!ret && !cycled) {
2613 cycled = 1; 2536 cycled = 1;
2614 index = 0; 2537 mpd.last_page = writeback_index - 1;
2615 wbc->range_start = index << PAGE_CACHE_SHIFT; 2538 mpd.first_page = 0;
2616 wbc->range_end = mapping->writeback_index - 1;
2617 goto retry; 2539 goto retry;
2618 } 2540 }
2619 2541
2620 /* Update index */ 2542 /* Update index */
2621 wbc->range_cyclic = range_cyclic;
2622 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2543 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2623 /* 2544 /*
2624 * set the writeback_index so that range_cyclic 2545 * Set the writeback_index so that range_cyclic
2625 * mode will write it back later 2546 * mode will write it back later
2626 */ 2547 */
2627 mapping->writeback_index = done_index; 2548 mapping->writeback_index = mpd.first_page;
2628 2549
2629out_writepages: 2550out_writepages:
2630 wbc->nr_to_write -= nr_to_writebump; 2551 trace_ext4_writepages_result(inode, wbc, ret,
2631 wbc->range_start = range_start; 2552 nr_to_write - wbc->nr_to_write);
2632 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2633 return ret; 2553 return ret;
2634} 2554}
2635 2555
@@ -2841,7 +2761,8 @@ static int ext4_da_write_end(struct file *file,
2841 return ret ? ret : copied; 2761 return ret ? ret : copied;
2842} 2762}
2843 2763
2844static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2764static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
2765 unsigned int length)
2845{ 2766{
2846 /* 2767 /*
2847 * Drop reserved blocks 2768 * Drop reserved blocks
@@ -2850,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2850 if (!page_has_buffers(page)) 2771 if (!page_has_buffers(page))
2851 goto out; 2772 goto out;
2852 2773
2853 ext4_da_page_release_reservation(page, offset); 2774 ext4_da_page_release_reservation(page, offset, length);
2854 2775
2855out: 2776out:
2856 ext4_invalidatepage(page, offset); 2777 ext4_invalidatepage(page, offset, length);
2857 2778
2858 return; 2779 return;
2859} 2780}
@@ -2876,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2876 * laptop_mode, not even desirable). However, to do otherwise 2797 * laptop_mode, not even desirable). However, to do otherwise
2877 * would require replicating code paths in: 2798 * would require replicating code paths in:
2878 * 2799 *
2879 * ext4_da_writepages() -> 2800 * ext4_writepages() ->
2880 * write_cache_pages() ---> (via passed in callback function) 2801 * write_cache_pages() ---> (via passed in callback function)
2881 * __mpage_da_writepage() --> 2802 * __mpage_da_writepage() -->
2882 * mpage_add_bh_to_extent() 2803 * mpage_add_bh_to_extent()
@@ -3001,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3001 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2922 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3002} 2923}
3003 2924
3004static void ext4_invalidatepage(struct page *page, unsigned long offset) 2925static void ext4_invalidatepage(struct page *page, unsigned int offset,
2926 unsigned int length)
3005{ 2927{
3006 trace_ext4_invalidatepage(page, offset); 2928 trace_ext4_invalidatepage(page, offset, length);
3007 2929
3008 /* No journalling happens on data buffers when this function is used */ 2930 /* No journalling happens on data buffers when this function is used */
3009 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 2931 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
3010 2932
3011 block_invalidatepage(page, offset); 2933 block_invalidatepage(page, offset, length);
3012} 2934}
3013 2935
3014static int __ext4_journalled_invalidatepage(struct page *page, 2936static int __ext4_journalled_invalidatepage(struct page *page,
3015 unsigned long offset) 2937 unsigned int offset,
2938 unsigned int length)
3016{ 2939{
3017 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2940 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3018 2941
3019 trace_ext4_journalled_invalidatepage(page, offset); 2942 trace_ext4_journalled_invalidatepage(page, offset, length);
3020 2943
3021 /* 2944 /*
3022 * If it's a full truncate we just forget about the pending dirtying 2945 * If it's a full truncate we just forget about the pending dirtying
3023 */ 2946 */
3024 if (offset == 0) 2947 if (offset == 0 && length == PAGE_CACHE_SIZE)
3025 ClearPageChecked(page); 2948 ClearPageChecked(page);
3026 2949
3027 return jbd2_journal_invalidatepage(journal, page, offset); 2950 return jbd2_journal_invalidatepage(journal, page, offset, length);
3028} 2951}
3029 2952
3030/* Wrapper for aops... */ 2953/* Wrapper for aops... */
3031static void ext4_journalled_invalidatepage(struct page *page, 2954static void ext4_journalled_invalidatepage(struct page *page,
3032 unsigned long offset) 2955 unsigned int offset,
2956 unsigned int length)
3033{ 2957{
3034 WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); 2958 WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
3035} 2959}
3036 2960
3037static int ext4_releasepage(struct page *page, gfp_t wait) 2961static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3141,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3141 3065
3142 BUG_ON(iocb->private == NULL); 3066 BUG_ON(iocb->private == NULL);
3143 3067
3068 /*
3069 * Make all waiters for direct IO properly wait also for extent
3070 * conversion. This also disallows race between truncate() and
3071 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3072 */
3073 if (rw == WRITE)
3074 atomic_inc(&inode->i_dio_count);
3075
3144 /* If we do a overwrite dio, i_mutex locking can be released */ 3076 /* If we do a overwrite dio, i_mutex locking can be released */
3145 overwrite = *((int *)iocb->private); 3077 overwrite = *((int *)iocb->private);
3146 3078
3147 if (overwrite) { 3079 if (overwrite) {
3148 atomic_inc(&inode->i_dio_count);
3149 down_read(&EXT4_I(inode)->i_data_sem); 3080 down_read(&EXT4_I(inode)->i_data_sem);
3150 mutex_unlock(&inode->i_mutex); 3081 mutex_unlock(&inode->i_mutex);
3151 } 3082 }
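The effect of moving the atomic_inc() before the mutex is dropped can be pictured with a plain counter (userspace sketch; the kernel uses an atomic and a wait queue, and truncate blocks until the count drains to zero):

	#include <stdio.h>

	static int i_dio_count;        /* stand-in for inode->i_dio_count */

	static void dio_start(void) { i_dio_count++; }  /* atomic_inc() above */
	static void dio_done(void)  { i_dio_count--; }  /* inode_dio_done() */

	int main(void)
	{
		dio_start();  /* taken under i_mutex, before it may be released */
		/* ... direct IO and any extent conversion run here; a racing
		 * truncate would wait for the count to return to zero ... */
		dio_done();   /* at the retake_lock: label for writes */
		printf("outstanding direct IO references: %d\n", i_dio_count);
		return 0;
	}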
@@ -3216,11 +3147,19 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3216 ext4_inode_aio_set(inode, NULL); 3147 ext4_inode_aio_set(inode, NULL);
3217 ext4_put_io_end(io_end); 3148 ext4_put_io_end(io_end);
3218 /* 3149 /*
3219 * In case of error or no write ext4_end_io_dio() was not 3150 * When no IO was submitted ext4_end_io_dio() was not
3220 * called so we have to put iocb's reference. 3151 * called so we have to put iocb's reference.
3221 */ 3152 */
3222 if (ret <= 0 && ret != -EIOCBQUEUED) { 3153 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3223 WARN_ON(iocb->private != io_end); 3154 WARN_ON(iocb->private != io_end);
3155 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3156 WARN_ON(io_end->iocb);
3157 /*
3158 * Generic code already did inode_dio_done() so we
3159 * have to clear EXT4_IO_END_DIRECT to not do it for
3160 * the second time.
3161 */
3162 io_end->flag = 0;
3224 ext4_put_io_end(io_end); 3163 ext4_put_io_end(io_end);
3225 iocb->private = NULL; 3164 iocb->private = NULL;
3226 } 3165 }
@@ -3232,7 +3171,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3232 * for non AIO case, since the IO is already 3171 * for non AIO case, since the IO is already
3233 * completed, we could do the conversion right here 3172 * completed, we could do the conversion right here
3234 */ 3173 */
3235 err = ext4_convert_unwritten_extents(inode, 3174 err = ext4_convert_unwritten_extents(NULL, inode,
3236 offset, ret); 3175 offset, ret);
3237 if (err < 0) 3176 if (err < 0)
3238 ret = err; 3177 ret = err;
@@ -3240,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3240 } 3179 }
3241 3180
3242retake_lock: 3181retake_lock:
3182 if (rw == WRITE)
3183 inode_dio_done(inode);
3243 /* take i_mutex locking again if we do an overwrite dio */ 3184 /* take i_mutex locking again if we do an overwrite dio */
3244 if (overwrite) { 3185 if (overwrite) {
3245 inode_dio_done(inode);
3246 up_read(&EXT4_I(inode)->i_data_sem); 3186 up_read(&EXT4_I(inode)->i_data_sem);
3247 mutex_lock(&inode->i_mutex); 3187 mutex_lock(&inode->i_mutex);
3248 } 3188 }
@@ -3301,6 +3241,7 @@ static const struct address_space_operations ext4_aops = {
3301 .readpage = ext4_readpage, 3241 .readpage = ext4_readpage,
3302 .readpages = ext4_readpages, 3242 .readpages = ext4_readpages,
3303 .writepage = ext4_writepage, 3243 .writepage = ext4_writepage,
3244 .writepages = ext4_writepages,
3304 .write_begin = ext4_write_begin, 3245 .write_begin = ext4_write_begin,
3305 .write_end = ext4_write_end, 3246 .write_end = ext4_write_end,
3306 .bmap = ext4_bmap, 3247 .bmap = ext4_bmap,
@@ -3316,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3316 .readpage = ext4_readpage, 3257 .readpage = ext4_readpage,
3317 .readpages = ext4_readpages, 3258 .readpages = ext4_readpages,
3318 .writepage = ext4_writepage, 3259 .writepage = ext4_writepage,
3260 .writepages = ext4_writepages,
3319 .write_begin = ext4_write_begin, 3261 .write_begin = ext4_write_begin,
3320 .write_end = ext4_journalled_write_end, 3262 .write_end = ext4_journalled_write_end,
3321 .set_page_dirty = ext4_journalled_set_page_dirty, 3263 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3331,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = {
3331 .readpage = ext4_readpage, 3273 .readpage = ext4_readpage,
3332 .readpages = ext4_readpages, 3274 .readpages = ext4_readpages,
3333 .writepage = ext4_writepage, 3275 .writepage = ext4_writepage,
3334 .writepages = ext4_da_writepages, 3276 .writepages = ext4_writepages,
3335 .write_begin = ext4_da_write_begin, 3277 .write_begin = ext4_da_write_begin,
3336 .write_end = ext4_da_write_end, 3278 .write_end = ext4_da_write_end,
3337 .bmap = ext4_bmap, 3279 .bmap = ext4_bmap,
@@ -3364,89 +3306,56 @@ void ext4_set_aops(struct inode *inode)
3364 inode->i_mapping->a_ops = &ext4_aops; 3306 inode->i_mapping->a_ops = &ext4_aops;
3365} 3307}
3366 3308
3367
3368/* 3309/*
3369 * ext4_discard_partial_page_buffers() 3310 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3370 * Wrapper function for ext4_discard_partial_page_buffers_no_lock. 3311 * up to the end of the block which corresponds to `from'.
3371 * This function finds and locks the page containing the offset 3312 * This is required during truncate. We need to physically zero the tail end
3372 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. 3313 * of that block so it doesn't yield old data if the file is later grown.
3373 * Calling functions that already have the page locked should call
3374 * ext4_discard_partial_page_buffers_no_lock directly.
3375 */ 3314 */
3376int ext4_discard_partial_page_buffers(handle_t *handle, 3315int ext4_block_truncate_page(handle_t *handle,
3377 struct address_space *mapping, loff_t from, 3316 struct address_space *mapping, loff_t from)
3378 loff_t length, int flags)
3379{ 3317{
3318 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3319 unsigned length;
3320 unsigned blocksize;
3380 struct inode *inode = mapping->host; 3321 struct inode *inode = mapping->host;
3381 struct page *page;
3382 int err = 0;
3383
3384 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3385 mapping_gfp_mask(mapping) & ~__GFP_FS);
3386 if (!page)
3387 return -ENOMEM;
3388 3322
3389 err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, 3323 blocksize = inode->i_sb->s_blocksize;
3390 from, length, flags); 3324 length = blocksize - (offset & (blocksize - 1));
3391 3325
3392 unlock_page(page); 3326 return ext4_block_zero_page_range(handle, mapping, from, length);
3393 page_cache_release(page);
3394 return err;
3395} 3327}
3396 3328
3397/* 3329/*
3398 * ext4_discard_partial_page_buffers_no_lock() 3330 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3399 * Zeros a page range of length 'length' starting from offset 'from'. 3331 * starting from file offset 'from'. The range to be zero'd must
3400 * Buffer heads that correspond to the block aligned regions of the 3332 * be contained with in one block. If the specified range exceeds
3401 * zeroed range will be unmapped. Unblock aligned regions 3333 * the end of the block it will be shortened to end of the block
3402 * will have the corresponding buffer head mapped if needed so that 3334 * that cooresponds to 'from'
3403 * that region of the page can be updated with the partial zero out.
3404 *
3405 * This function assumes that the page has already been locked. The
3406 * The range to be discarded must be contained with in the given page.
3407 * If the specified range exceeds the end of the page it will be shortened
3408 * to the end of the page that corresponds to 'from'. This function is
3409 * appropriate for updating a page and it buffer heads to be unmapped and
3410 * zeroed for blocks that have been either released, or are going to be
3411 * released.
3412 *
3413 * handle: The journal handle
3414 * inode: The files inode
3415 * page: A locked page that contains the offset "from"
3416 * from: The starting byte offset (from the beginning of the file)
3417 * to begin discarding
3418 * len: The length of bytes to discard
3419 * flags: Optional flags that may be used:
3420 *
3421 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3422 * Only zero the regions of the page whose buffer heads
3423 * have already been unmapped. This flag is appropriate
3424 * for updating the contents of a page whose blocks may
3425 * have already been released, and we only want to zero
3426 * out the regions that correspond to those released blocks.
3427 *
3428 * Returns zero on success or negative on failure.
3429 */ 3335 */
3430static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 3336int ext4_block_zero_page_range(handle_t *handle,
3431 struct inode *inode, struct page *page, loff_t from, 3337 struct address_space *mapping, loff_t from, loff_t length)
3432 loff_t length, int flags)
3433{ 3338{
3434 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3339 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3435 unsigned int offset = from & (PAGE_CACHE_SIZE-1); 3340 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3436 unsigned int blocksize, max, pos; 3341 unsigned blocksize, max, pos;
3437 ext4_lblk_t iblock; 3342 ext4_lblk_t iblock;
3343 struct inode *inode = mapping->host;
3438 struct buffer_head *bh; 3344 struct buffer_head *bh;
3345 struct page *page;
3439 int err = 0; 3346 int err = 0;
3440 3347
3441 blocksize = inode->i_sb->s_blocksize; 3348 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3442 max = PAGE_CACHE_SIZE - offset; 3349 mapping_gfp_mask(mapping) & ~__GFP_FS);
3350 if (!page)
3351 return -ENOMEM;
3443 3352
3444 if (index != page->index) 3353 blocksize = inode->i_sb->s_blocksize;
3445 return -EINVAL; 3354 max = blocksize - (offset & (blocksize - 1));
3446 3355
3447 /* 3356 /*
3448 * correct length if it does not fall between 3357 * correct length if it does not fall between
3449 * 'from' and the end of the page 3358 * 'from' and the end of the block
3450 */ 3359 */
3451 if (length > max || length < 0) 3360 if (length > max || length < 0)
3452 length = max; 3361 length = max;
@@ -3464,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3464 iblock++; 3373 iblock++;
3465 pos += blocksize; 3374 pos += blocksize;
3466 } 3375 }
3467 3376 if (buffer_freed(bh)) {
3468 pos = offset; 3377 BUFFER_TRACE(bh, "freed: skip");
3469 while (pos < offset + length) { 3378 goto unlock;
3470 unsigned int end_of_block, range_to_discard; 3379 }
3471 3380 if (!buffer_mapped(bh)) {
3472 err = 0; 3381 BUFFER_TRACE(bh, "unmapped");
3473 3382 ext4_get_block(inode, iblock, bh, 0);
3474 /* The length of space left to zero and unmap */ 3383 /* unmapped? It's a hole - nothing to do */
3475 range_to_discard = offset + length - pos;
3476
3477 /* The length of space until the end of the block */
3478 end_of_block = blocksize - (pos & (blocksize-1));
3479
3480 /*
3481 * Do not unmap or zero past end of block
3482 * for this buffer head
3483 */
3484 if (range_to_discard > end_of_block)
3485 range_to_discard = end_of_block;
3486
3487
3488 /*
3489 * Skip this buffer head if we are only zeroing unampped
3490 * regions of the page
3491 */
3492 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3493 buffer_mapped(bh))
3494 goto next;
3495
3496 /* If the range is block aligned, unmap */
3497 if (range_to_discard == blocksize) {
3498 clear_buffer_dirty(bh);
3499 bh->b_bdev = NULL;
3500 clear_buffer_mapped(bh);
3501 clear_buffer_req(bh);
3502 clear_buffer_new(bh);
3503 clear_buffer_delay(bh);
3504 clear_buffer_unwritten(bh);
3505 clear_buffer_uptodate(bh);
3506 zero_user(page, pos, range_to_discard);
3507 BUFFER_TRACE(bh, "Buffer discarded");
3508 goto next;
3509 }
3510
3511 /*
3512 * If this block is not completely contained in the range
3513 * to be discarded, then it is not going to be released. Because
3514 * we need to keep this block, we need to make sure this part
3515 * of the page is uptodate before we modify it by writeing
3516 * partial zeros on it.
3517 */
3518 if (!buffer_mapped(bh)) { 3384 if (!buffer_mapped(bh)) {
3519 /* 3385 BUFFER_TRACE(bh, "still unmapped");
3520 * Buffer head must be mapped before we can read 3386 goto unlock;
3521 * from the block
3522 */
3523 BUFFER_TRACE(bh, "unmapped");
3524 ext4_get_block(inode, iblock, bh, 0);
3525 /* unmapped? It's a hole - nothing to do */
3526 if (!buffer_mapped(bh)) {
3527 BUFFER_TRACE(bh, "still unmapped");
3528 goto next;
3529 }
3530 } 3387 }
3388 }
3531 3389
3532 /* Ok, it's mapped. Make sure it's up-to-date */ 3390 /* Ok, it's mapped. Make sure it's up-to-date */
3533 if (PageUptodate(page)) 3391 if (PageUptodate(page))
3534 set_buffer_uptodate(bh); 3392 set_buffer_uptodate(bh);
3535 3393
3536 if (!buffer_uptodate(bh)) { 3394 if (!buffer_uptodate(bh)) {
3537 err = -EIO; 3395 err = -EIO;
3538 ll_rw_block(READ, 1, &bh); 3396 ll_rw_block(READ, 1, &bh);
3539 wait_on_buffer(bh); 3397 wait_on_buffer(bh);
3540 /* Uhhuh. Read error. Complain and punt.*/ 3398 /* Uhhuh. Read error. Complain and punt. */
3541 if (!buffer_uptodate(bh)) 3399 if (!buffer_uptodate(bh))
3542 goto next; 3400 goto unlock;
3543 } 3401 }
3402 if (ext4_should_journal_data(inode)) {
3403 BUFFER_TRACE(bh, "get write access");
3404 err = ext4_journal_get_write_access(handle, bh);
3405 if (err)
3406 goto unlock;
3407 }
3408 zero_user(page, offset, length);
3409 BUFFER_TRACE(bh, "zeroed end of block");
3544 3410
3545 if (ext4_should_journal_data(inode)) { 3411 if (ext4_should_journal_data(inode)) {
3546 BUFFER_TRACE(bh, "get write access"); 3412 err = ext4_handle_dirty_metadata(handle, inode, bh);
3547 err = ext4_journal_get_write_access(handle, bh); 3413 } else {
3548 if (err) 3414 err = 0;
3549 goto next; 3415 mark_buffer_dirty(bh);
3550 } 3416 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
3417 err = ext4_jbd2_file_inode(handle, inode);
3418 }
3551 3419
3552 zero_user(page, pos, range_to_discard); 3420unlock:
3421 unlock_page(page);
3422 page_cache_release(page);
3423 return err;
3424}
3553 3425
3554 err = 0; 3426int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3555 if (ext4_should_journal_data(inode)) { 3427 loff_t lstart, loff_t length)
3556 err = ext4_handle_dirty_metadata(handle, inode, bh); 3428{
3557 } else 3429 struct super_block *sb = inode->i_sb;
3558 mark_buffer_dirty(bh); 3430 struct address_space *mapping = inode->i_mapping;
3431 unsigned partial_start, partial_end;
3432 ext4_fsblk_t start, end;
3433 loff_t byte_end = (lstart + length - 1);
3434 int err = 0;
3559 3435
3560 BUFFER_TRACE(bh, "Partial buffer zeroed"); 3436 partial_start = lstart & (sb->s_blocksize - 1);
3561next: 3437 partial_end = byte_end & (sb->s_blocksize - 1);
3562 bh = bh->b_this_page;
3563 iblock++;
3564 pos += range_to_discard;
3565 }
3566 3438
3439 start = lstart >> sb->s_blocksize_bits;
3440 end = byte_end >> sb->s_blocksize_bits;
3441
3442 /* Handle partial zero within the single block */
3443 if (start == end &&
3444 (partial_start || (partial_end != sb->s_blocksize - 1))) {
3445 err = ext4_block_zero_page_range(handle, mapping,
3446 lstart, length);
3447 return err;
3448 }
3449 /* Handle partial zero out on the start of the range */
3450 if (partial_start) {
3451 err = ext4_block_zero_page_range(handle, mapping,
3452 lstart, sb->s_blocksize);
3453 if (err)
3454 return err;
3455 }
3456 /* Handle partial zero out on the end of the range */
3457 if (partial_end != sb->s_blocksize - 1)
3458 err = ext4_block_zero_page_range(handle, mapping,
3459 byte_end - partial_end,
3460 partial_end + 1);
3567 return err; 3461 return err;
3568} 3462}
3569 3463
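The new ext4_zero_partial_blocks() above reduces an arbitrary byte range to at most two partial-block zeroing calls. A minimal user-space sketch of the same split (an illustration, not part of the patch; helper names are local and a power-of-two block size is assumed) makes the boundary cases easy to check:

#include <stdio.h>

/*
 * Mirror of the range split done by ext4_zero_partial_blocks(),
 * printing the byte ranges that would be handed to
 * ext4_block_zero_page_range().
 */
static void split_partial(unsigned long long lstart,
                          unsigned long long length, unsigned bsz)
{
        unsigned long long byte_end = lstart + length - 1;
        unsigned partial_start = lstart & (bsz - 1);
        unsigned partial_end = byte_end & (bsz - 1);

        if (lstart / bsz == byte_end / bsz &&
            (partial_start || partial_end != bsz - 1)) {
                /* range lives entirely within a single block */
                printf("zero [%llu, %llu]\n", lstart, byte_end);
                return;
        }
        if (partial_start)      /* head fragment, up to its block end */
                printf("zero head [%llu, %llu]\n",
                       lstart, lstart + (bsz - partial_start) - 1);
        if (partial_end != bsz - 1)     /* tail fragment */
                printf("zero tail [%llu, %llu]\n",
                       byte_end - partial_end, byte_end);
}

int main(void)
{
        split_partial(100, 200, 4096);  /* inside one block */
        split_partial(100, 8192, 4096); /* head and tail fragments */
        return 0;
}

A fully block-aligned range prints nothing, matching the kernel code: whole blocks need no zeroing at all.
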
@@ -3589,14 +3483,12 @@ int ext4_can_truncate(struct inode *inode)
3589 * Returns: 0 on success or negative on failure 3483 * Returns: 0 on success or negative on failure
3590 */ 3484 */
3591 3485
3592int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3486int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3593{ 3487{
3594 struct inode *inode = file_inode(file);
3595 struct super_block *sb = inode->i_sb; 3488 struct super_block *sb = inode->i_sb;
3596 ext4_lblk_t first_block, stop_block; 3489 ext4_lblk_t first_block, stop_block;
3597 struct address_space *mapping = inode->i_mapping; 3490 struct address_space *mapping = inode->i_mapping;
3598 loff_t first_page, last_page, page_len; 3491 loff_t first_block_offset, last_block_offset;
3599 loff_t first_page_offset, last_page_offset;
3600 handle_t *handle; 3492 handle_t *handle;
3601 unsigned int credits; 3493 unsigned int credits;
3602 int ret = 0; 3494 int ret = 0;
@@ -3647,23 +3539,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3647 offset; 3539 offset;
3648 } 3540 }
3649 3541
3650 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 3542 first_block_offset = round_up(offset, sb->s_blocksize);
3651 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 3543 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
3652 3544
3653 first_page_offset = first_page << PAGE_CACHE_SHIFT; 3545 /* Now release the pages and zero the block aligned part of pages */
3654 last_page_offset = last_page << PAGE_CACHE_SHIFT; 3546 if (last_block_offset > first_block_offset)
3655 3547 truncate_pagecache_range(inode, first_block_offset,
3656 /* Now release the pages */ 3548 last_block_offset);
3657 if (last_page_offset > first_page_offset) {
3658 truncate_pagecache_range(inode, first_page_offset,
3659 last_page_offset - 1);
3660 }
3661 3549
3662 /* Wait for all existing dio workers; newcomers will block on i_mutex */ 3550
3663 ext4_inode_block_unlocked_dio(inode); 3551 ext4_inode_block_unlocked_dio(inode);
3664 ret = ext4_flush_unwritten_io(inode);
3665 if (ret)
3666 goto out_dio;
3667 inode_dio_wait(inode); 3552 inode_dio_wait(inode);
3668 3553
3669 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3554 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3677,66 +3562,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3677 goto out_dio; 3562 goto out_dio;
3678 } 3563 }
3679 3564
3680 /* 3565 ret = ext4_zero_partial_blocks(handle, inode, offset,
3681 * Now we need to zero out the non-page-aligned data in the 3566 length);
3682 * pages at the start and tail of the hole, and unmap the 3567 if (ret)
3683 * buffer heads for the block aligned regions of the page that 3568 goto out_stop;
3684 * were completely zeroed.
3685 */
3686 if (first_page > last_page) {
3687 /*
3688 * If the file space being truncated is contained
3689 * within a page just zero out and unmap the middle of
3690 * that page
3691 */
3692 ret = ext4_discard_partial_page_buffers(handle,
3693 mapping, offset, length, 0);
3694
3695 if (ret)
3696 goto out_stop;
3697 } else {
3698 /*
3699 * zero out and unmap the partial page that contains
3700 * the start of the hole
3701 */
3702 page_len = first_page_offset - offset;
3703 if (page_len > 0) {
3704 ret = ext4_discard_partial_page_buffers(handle, mapping,
3705 offset, page_len, 0);
3706 if (ret)
3707 goto out_stop;
3708 }
3709
3710 /*
3711 * zero out and unmap the partial page that contains
3712 * the end of the hole
3713 */
3714 page_len = offset + length - last_page_offset;
3715 if (page_len > 0) {
3716 ret = ext4_discard_partial_page_buffers(handle, mapping,
3717 last_page_offset, page_len, 0);
3718 if (ret)
3719 goto out_stop;
3720 }
3721 }
3722
3723 /*
3724 * If i_size is contained in the last page, we need to
3725 * unmap and zero the partial page after i_size
3726 */
3727 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
3728 inode->i_size % PAGE_CACHE_SIZE != 0) {
3729 page_len = PAGE_CACHE_SIZE -
3730 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3731
3732 if (page_len > 0) {
3733 ret = ext4_discard_partial_page_buffers(handle,
3734 mapping, inode->i_size, page_len, 0);
3735
3736 if (ret)
3737 goto out_stop;
3738 }
3739 }
3740 3569
3741 first_block = (offset + sb->s_blocksize - 1) >> 3570 first_block = (offset + sb->s_blocksize - 1) >>
3742 EXT4_BLOCK_SIZE_BITS(sb); 3571 EXT4_BLOCK_SIZE_BITS(sb);
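ext4_punch_hole() now works at block rather than page granularity: round_up()/round_down() select the fully covered blocks for page-cache truncation and leave the unaligned edges to ext4_zero_partial_blocks(). A small sketch of that arithmetic (illustrative only; the round helpers are defined locally and assume a power-of-two block size):

#include <stdio.h>

#define ROUND_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))
#define ROUND_DOWN(x, a) ((x) & ~((a) - 1))

int main(void)
{
        unsigned long long offset = 1000, length = 10000, bsz = 4096;
        unsigned long long first = ROUND_UP(offset, bsz);
        unsigned long long last = ROUND_DOWN(offset + length, bsz) - 1;

        /* Whole blocks inside [first, last] are simply dropped from
         * the page cache; the block-unaligned edges must be zeroed. */
        if (last > first)
                printf("truncate_pagecache_range(%llu, %llu)\n",
                       first, last);
        printf("zero edges [%llu, %llu) and [%llu, %llu)\n",
               offset, first, last + 1, offset + length);
        return 0;
}

For offset 1000 and length 10000 this truncates blocks [4096, 8191] and zeroes [1000, 4096) and [8192, 11000), exactly the two partial ranges the previous insert walked through.
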
@@ -3812,7 +3641,6 @@ void ext4_truncate(struct inode *inode)
3812 unsigned int credits; 3641 unsigned int credits;
3813 handle_t *handle; 3642 handle_t *handle;
3814 struct address_space *mapping = inode->i_mapping; 3643 struct address_space *mapping = inode->i_mapping;
3815 loff_t page_len;
3816 3644
3817 /* 3645 /*
3818 * There is a possibility that we're either freeing the inode 3646 * There is a possibility that we're either freeing the inode
@@ -3839,12 +3667,6 @@ void ext4_truncate(struct inode *inode)
3839 return; 3667 return;
3840 } 3668 }
3841 3669
3842 /*
3843 * finish any pending end_io work so we won't run the risk of
3844 * converting any truncated blocks to initialized later
3845 */
3846 ext4_flush_unwritten_io(inode);
3847
3848 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3670 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3849 credits = ext4_writepage_trans_blocks(inode); 3671 credits = ext4_writepage_trans_blocks(inode);
3850 else 3672 else
@@ -3856,14 +3678,8 @@ void ext4_truncate(struct inode *inode)
3856 return; 3678 return;
3857 } 3679 }
3858 3680
3859 if (inode->i_size % PAGE_CACHE_SIZE != 0) { 3681 if (inode->i_size & (inode->i_sb->s_blocksize - 1))
3860 page_len = PAGE_CACHE_SIZE - 3682 ext4_block_truncate_page(handle, mapping, inode->i_size);
3861 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3862
3863 if (ext4_discard_partial_page_buffers(handle,
3864 mapping, inode->i_size, page_len, 0))
3865 goto out_stop;
3866 }
3867 3683
3868 /* 3684 /*
3869 * We add the inode to the orphan list, so that if this 3685 * We add the inode to the orphan list, so that if this
@@ -4632,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
4632 inode->i_size >> PAGE_CACHE_SHIFT); 4448 inode->i_size >> PAGE_CACHE_SHIFT);
4633 if (!page) 4449 if (!page)
4634 return; 4450 return;
4635 ret = __ext4_journalled_invalidatepage(page, offset); 4451 ret = __ext4_journalled_invalidatepage(page, offset,
4452 PAGE_CACHE_SIZE - offset);
4636 unlock_page(page); 4453 unlock_page(page);
4637 page_cache_release(page); 4454 page_cache_release(page);
4638 if (ret != -EBUSY) 4455 if (ret != -EBUSY)
@@ -4814,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4814 struct kstat *stat) 4631 struct kstat *stat)
4815{ 4632{
4816 struct inode *inode; 4633 struct inode *inode;
4817 unsigned long delalloc_blocks; 4634 unsigned long long delalloc_blocks;
4818 4635
4819 inode = dentry->d_inode; 4636 inode = dentry->d_inode;
4820 generic_fillattr(inode, stat); 4637 generic_fillattr(inode, stat);
@@ -4832,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4832 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), 4649 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
4833 EXT4_I(inode)->i_reserved_data_blocks); 4650 EXT4_I(inode)->i_reserved_data_blocks);
4834 4651
4835 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4652 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
4836 return 0; 4653 return 0;
4837} 4654}
4838 4655
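The ext4_getattr() change above reads like an overflow fix: with a 32-bit unsigned long, shifting the delalloc block count up by the block-size bits can wrap before the shift back down to 512-byte sectors, whereas widening first and shifting once by (bits - 9) cannot. A toy reproduction (illustrative; fixed-width types force the 32-bit behavior even on a 64-bit host):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t delalloc_blocks = 0x00400000; /* 4M blocks of 4K = 16 GiB */
        unsigned bits = 12;                    /* 4096-byte block size */

        /* Old expression as evaluated where unsigned long is 32 bits:
         * the intermediate shift wraps to zero. */
        uint32_t wrapped = delalloc_blocks << bits;
        uint32_t old = wrapped >> 9;

        /* New expression: widen first, shift once by (bits - 9). */
        uint64_t fixed = (uint64_t)delalloc_blocks << (bits - 9);

        printf("old=%u sectors, fixed=%llu sectors\n",
               old, (unsigned long long)fixed);
        return 0;
}
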
4839static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4656static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
4657 int pextents)
4840{ 4658{
4841 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4659 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4842 return ext4_ind_trans_blocks(inode, nrblocks, chunk); 4660 return ext4_ind_trans_blocks(inode, lblocks);
4843 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4661 return ext4_ext_index_trans_blocks(inode, pextents);
4844} 4662}
4845 4663
4846/* 4664/*
@@ -4854,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4854 * 4672 *
4855 * Also account for superblock, inode, quota and xattr blocks 4673 * Also account for superblock, inode, quota and xattr blocks
4856 */ 4674 */
4857static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4675static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
4676 int pextents)
4858{ 4677{
4859 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 4678 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4860 int gdpblocks; 4679 int gdpblocks;
@@ -4862,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4862 int ret = 0; 4681 int ret = 0;
4863 4682
4864 /* 4683 /*
4865 * How many index blocks need to touch to modify nrblocks? 4684 * How many index blocks do we need to touch to map @lblocks
4866 * The "Chunk" flag indicating whether the nrblocks is 4685 * logical blocks to @pextents physical extents?
4867 * physically contiguous on disk
4868 *
4869 * For Direct IO and fallocate, they calls get_block to allocate
4870 * one single extent at a time, so they could set the "Chunk" flag
4871 */ 4686 */
4872 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 4687 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
4873 4688
4874 ret = idxblocks; 4689 ret = idxblocks;
4875 4690
@@ -4877,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4877 * Now let's see how many group bitmaps and group descriptors need 4692 * Now let's see how many group bitmaps and group descriptors need
4878 * to account 4693 * to account
4879 */ 4694 */
4880 groups = idxblocks; 4695 groups = idxblocks + pextents;
4881 if (chunk)
4882 groups += 1;
4883 else
4884 groups += nrblocks;
4885
4886 gdpblocks = groups; 4696 gdpblocks = groups;
4887 if (groups > ngroups) 4697 if (groups > ngroups)
4888 groups = ngroups; 4698 groups = ngroups;
@@ -4913,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
4913 int bpp = ext4_journal_blocks_per_page(inode); 4723 int bpp = ext4_journal_blocks_per_page(inode);
4914 int ret; 4724 int ret;
4915 4725
4916 ret = ext4_meta_trans_blocks(inode, bpp, 0); 4726 ret = ext4_meta_trans_blocks(inode, bpp, bpp);
4917 4727
4918 /* Account for data blocks for journalled mode */ 4728 /* Account for data blocks for journalled mode */
4919 if (ext4_should_journal_data(inode)) 4729 if (ext4_should_journal_data(inode))
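For reference, a condensed user-space rendering of the reworked credit estimate (a sketch only: the function and limit names are local, the filesystem limits are made up, and the superblock, inode and quota credits the real ext4_meta_trans_blocks() adds on top are omitted):

#include <stdio.h>

/*
 * Index blocks for @lblocks logical blocks in @pextents extents,
 * plus one bitmap and one descriptor block per touched group, each
 * capped by the filesystem-wide counts.
 */
static int meta_trans_blocks(int idxblocks, int pextents,
                             int ngroups, int ngdpblocks)
{
        int groups = idxblocks + pextents;
        int gdpblocks = groups;
        int ret = idxblocks;

        if (groups > ngroups)
                groups = ngroups;
        if (gdpblocks > ngdpblocks)
                gdpblocks = ngdpblocks;

        ret += groups + gdpblocks; /* bitmaps + descriptor blocks */
        return ret;
}

int main(void)
{
        /* e.g. 3 index blocks for a 4-extent write on a small fs */
        printf("credits=%d\n", meta_trans_blocks(3, 4, 16, 1));
        return 0;
}
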
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b1ed9e07434b..a9ff5e5137ca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2105,7 +2105,12 @@ repeat:
2105 group = ac->ac_g_ex.fe_group; 2105 group = ac->ac_g_ex.fe_group;
2106 2106
2107 for (i = 0; i < ngroups; group++, i++) { 2107 for (i = 0; i < ngroups; group++, i++) {
2108 if (group == ngroups) 2108 cond_resched();
2109 /*
2110 * Artificially restricted ngroups for non-extent
2111 * files makes group > ngroups possible on first loop.
2112 */
2113 if (group >= ngroups)
2109 group = 0; 2114 group = 0;
2110 2115
2111 /* This now checks without needing the buddy page */ 2116 /* This now checks without needing the buddy page */
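The '>=' in the wrap check above matters because, as the new comment says, ngroups may be artificially restricted for non-extent files, so the scan can start beyond the limit. A toy loop showing why '==' would fail to wrap (illustrative only):

#include <stdio.h>

int main(void)
{
        int ngroups = 4;        /* artificially restricted for the scan */
        int group = 6;          /* goal group outside the restriction */
        int i;

        for (i = 0; i < ngroups; group++, i++) {
                /* '== ngroups' would never fire for group 6 and the
                 * scan would run off past the restricted limit */
                if (group >= ngroups)
                        group = 0;
                printf("scan group %d\n", group);
        }
        return 0;
}
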
@@ -4401,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4401repeat: 4406repeat:
4402 /* allocate space in core */ 4407 /* allocate space in core */
4403 *errp = ext4_mb_regular_allocator(ac); 4408 *errp = ext4_mb_regular_allocator(ac);
4404 if (*errp) { 4409 if (*errp)
4405 ext4_discard_allocated_blocks(ac); 4410 goto discard_and_exit;
4406 goto errout;
4407 }
4408 4411
4409 /* as we've just preallocated more space than 4412 /* as we've just preallocated more space than
4410 * user requested orinally, we store allocated 4413 * user requested originally, we store allocated
4411 * space in a special descriptor */ 4414 * space in a special descriptor */
4412 if (ac->ac_status == AC_STATUS_FOUND && 4415 if (ac->ac_status == AC_STATUS_FOUND &&
4413 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4416 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4414 ext4_mb_new_preallocation(ac); 4417 *errp = ext4_mb_new_preallocation(ac);
4418 if (*errp) {
4419 discard_and_exit:
4420 ext4_discard_allocated_blocks(ac);
4421 goto errout;
4422 }
4415 } 4423 }
4416 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4424 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4417 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 4425 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4608,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4608 BUG_ON(bh && (count > 1)); 4616 BUG_ON(bh && (count > 1));
4609 4617
4610 for (i = 0; i < count; i++) { 4618 for (i = 0; i < count; i++) {
4619 cond_resched();
4611 if (!bh) 4620 if (!bh)
4612 tbh = sb_find_get_block(inode->i_sb, 4621 tbh = sb_find_get_block(inode->i_sb,
4613 block + i); 4622 block + i);
4614 if (unlikely(!tbh)) 4623 if (!tbh)
4615 continue; 4624 continue;
4616 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4625 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4617 inode, tbh, block + i); 4626 inode, tbh, block + i);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf364022f..e86dddbd8296 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
912 struct page *pagep[2] = {NULL, NULL}; 912 struct page *pagep[2] = {NULL, NULL};
913 handle_t *handle; 913 handle_t *handle;
914 ext4_lblk_t orig_blk_offset; 914 ext4_lblk_t orig_blk_offset;
915 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
916 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 915 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
917 unsigned int w_flags = 0; 916 unsigned int w_flags = 0;
918 unsigned int tmp_data_size, data_size, replaced_size; 917 unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@ again:
940 orig_blk_offset = orig_page_offset * blocks_per_page + 939 orig_blk_offset = orig_page_offset * blocks_per_page +
941 data_offset_in_page; 940 data_offset_in_page;
942 941
943 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
944
945 /* Calculate data_size */ 942 /* Calculate data_size */
946 if ((orig_blk_offset + block_len_in_page - 1) == 943 if ((orig_blk_offset + block_len_in_page - 1) ==
947 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 944 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6653fc35ecb7..234b834d5a97 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
918 bh->b_data, bh->b_size, 918 bh->b_data, bh->b_size,
919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
920 + ((char *)de - bh->b_data))) { 920 + ((char *)de - bh->b_data))) {
921 /* On error, skip the f_pos to the next block. */ 921 /* silently ignore the rest of the block */
922 dir_file->f_pos = (dir_file->f_pos | 922 break;
923 (dir->i_sb->s_blocksize - 1)) + 1;
924 brelse(bh);
925 return count;
926 } 923 }
927 ext4fs_dirhash(de->name, de->name_len, hinfo); 924 ext4fs_dirhash(de->name, de->name_len, hinfo);
928 if ((hinfo->hash < start_hash) || 925 if ((hinfo->hash < start_hash) ||
@@ -2299,6 +2296,45 @@ retry:
2299 return err; 2296 return err;
2300} 2297}
2301 2298
2299static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2300{
2301 handle_t *handle;
2302 struct inode *inode;
2303 int err, retries = 0;
2304
2305 dquot_initialize(dir);
2306
2307retry:
2308 inode = ext4_new_inode_start_handle(dir, mode,
2309 NULL, 0, NULL,
2310 EXT4_HT_DIR,
2311 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2312 4 + EXT4_XATTR_TRANS_BLOCKS);
2313 handle = ext4_journal_current_handle();
2314 err = PTR_ERR(inode);
2315 if (!IS_ERR(inode)) {
2316 inode->i_op = &ext4_file_inode_operations;
2317 inode->i_fop = &ext4_file_operations;
2318 ext4_set_aops(inode);
2319 err = ext4_orphan_add(handle, inode);
2320 if (err)
2321 goto err_drop_inode;
2322 mark_inode_dirty(inode);
2323 d_tmpfile(dentry, inode);
2324 unlock_new_inode(inode);
2325 }
2326 if (handle)
2327 ext4_journal_stop(handle);
2328 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2329 goto retry;
2330 return err;
2331err_drop_inode:
2332 ext4_journal_stop(handle);
2333 unlock_new_inode(inode);
2334 iput(inode);
2335 return err;
2336}
2337
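ext4_tmpfile() above backs the O_TMPFILE open flag: the inode starts life unnamed on the orphan list and only becomes visible if linked in later. A user-space sketch of that life cycle (assumes a kernel and libc exposing O_TMPFILE; the /tmp paths are placeholders):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char path[64];
        /* unnamed inode; it sits on the orphan list until linked */
        int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

        if (fd < 0) {
                perror("open(O_TMPFILE)");
                return 1;
        }
        if (write(fd, "data\n", 5) != 5)
                perror("write");

        /* the i_nlink 0 -> 1 transition; this is the case the
         * ext4_orphan_del() added to ext4_link() below handles */
        snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
        if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
                   AT_SYMLINK_FOLLOW) < 0)
                perror("linkat");

        close(fd);
        return 0;
}
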
2302struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, 2338struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
2303 struct ext4_dir_entry_2 *de, 2339 struct ext4_dir_entry_2 *de,
2304 int blocksize, int csum_size, 2340 int blocksize, int csum_size,
@@ -2906,7 +2942,7 @@ static int ext4_link(struct dentry *old_dentry,
2906retry: 2942retry:
2907 handle = ext4_journal_start(dir, EXT4_HT_DIR, 2943 handle = ext4_journal_start(dir, EXT4_HT_DIR,
2908 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2944 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2909 EXT4_INDEX_EXTRA_TRANS_BLOCKS)); 2945 EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
2910 if (IS_ERR(handle)) 2946 if (IS_ERR(handle))
2911 return PTR_ERR(handle); 2947 return PTR_ERR(handle);
2912 2948
@@ -2920,6 +2956,11 @@ retry:
2920 err = ext4_add_entry(handle, dentry, inode); 2956 err = ext4_add_entry(handle, dentry, inode);
2921 if (!err) { 2957 if (!err) {
2922 ext4_mark_inode_dirty(handle, inode); 2958 ext4_mark_inode_dirty(handle, inode);
2959 /* this can happen only for tmpfile being
2960 * linked for the first time
2961 */
2962 if (inode->i_nlink == 1)
2963 ext4_orphan_del(handle, inode);
2923 d_instantiate(dentry, inode); 2964 d_instantiate(dentry, inode);
2924 } else { 2965 } else {
2925 drop_nlink(inode); 2966 drop_nlink(inode);
@@ -3172,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = {
3172 .mkdir = ext4_mkdir, 3213 .mkdir = ext4_mkdir,
3173 .rmdir = ext4_rmdir, 3214 .rmdir = ext4_rmdir,
3174 .mknod = ext4_mknod, 3215 .mknod = ext4_mknod,
3216 .tmpfile = ext4_tmpfile,
3175 .rename = ext4_rename, 3217 .rename = ext4_rename,
3176 .setattr = ext4_setattr, 3218 .setattr = ext4_setattr,
3177 .setxattr = generic_setxattr, 3219 .setxattr = generic_setxattr,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 19599bded62a..48786cdb5e6c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -46,29 +46,82 @@ void ext4_exit_pageio(void)
46} 46}
47 47
48/* 48/*
49 * This function is called by ext4_evict_inode() to make sure there is 49 * Print a buffer I/O error compatible with fs/buffer.c. This
50 * no more pending I/O completion work left to do. 50 * provides compatibility with dmesg scrapers that look for a specific
51 * buffer I/O error message. We really need a unified error reporting
52 * structure to userspace ala Digital Unix's uerf system, but it's
53 * probably not going to happen in my lifetime, due to LKML politics...
51 */ 54 */
52void ext4_ioend_shutdown(struct inode *inode) 55static void buffer_io_error(struct buffer_head *bh)
53{ 56{
54 wait_queue_head_t *wq = ext4_ioend_wq(inode); 57 char b[BDEVNAME_SIZE];
58 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
59 bdevname(bh->b_bdev, b),
60 (unsigned long long)bh->b_blocknr);
61}
55 62
56 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 63static void ext4_finish_bio(struct bio *bio)
57 /* 64{
58 * We need to make sure the work structure is finished being 65 int i;
59 * used before we let the inode get destroyed. 66 int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
60 */ 67
61 if (work_pending(&EXT4_I(inode)->i_unwritten_work)) 68 for (i = 0; i < bio->bi_vcnt; i++) {
62 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 69 struct bio_vec *bvec = &bio->bi_io_vec[i];
70 struct page *page = bvec->bv_page;
71 struct buffer_head *bh, *head;
72 unsigned bio_start = bvec->bv_offset;
73 unsigned bio_end = bio_start + bvec->bv_len;
74 unsigned under_io = 0;
75 unsigned long flags;
76
77 if (!page)
78 continue;
79
80 if (error) {
81 SetPageError(page);
82 set_bit(AS_EIO, &page->mapping->flags);
83 }
84 bh = head = page_buffers(page);
85 /*
86 * We check all buffers in the page under BH_Uptodate_Lock
87 * to avoid races with other end io clearing async_write flags
88 */
89 local_irq_save(flags);
90 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
91 do {
92 if (bh_offset(bh) < bio_start ||
93 bh_offset(bh) + bh->b_size > bio_end) {
94 if (buffer_async_write(bh))
95 under_io++;
96 continue;
97 }
98 clear_buffer_async_write(bh);
99 if (error)
100 buffer_io_error(bh);
101 } while ((bh = bh->b_this_page) != head);
102 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
103 local_irq_restore(flags);
104 if (!under_io)
105 end_page_writeback(page);
106 }
63} 107}
64 108
65static void ext4_release_io_end(ext4_io_end_t *io_end) 109static void ext4_release_io_end(ext4_io_end_t *io_end)
66{ 110{
111 struct bio *bio, *next_bio;
112
67 BUG_ON(!list_empty(&io_end->list)); 113 BUG_ON(!list_empty(&io_end->list));
68 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 114 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
115 WARN_ON(io_end->handle);
69 116
70 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) 117 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
71 wake_up_all(ext4_ioend_wq(io_end->inode)); 118 wake_up_all(ext4_ioend_wq(io_end->inode));
119
120 for (bio = io_end->bio; bio; bio = next_bio) {
121 next_bio = bio->bi_private;
122 ext4_finish_bio(bio);
123 bio_put(bio);
124 }
72 if (io_end->flag & EXT4_IO_END_DIRECT) 125 if (io_end->flag & EXT4_IO_END_DIRECT)
73 inode_dio_done(io_end->inode); 126 inode_dio_done(io_end->inode);
74 if (io_end->iocb) 127 if (io_end->iocb)
@@ -86,19 +139,28 @@ static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
86 wake_up_all(ext4_ioend_wq(inode)); 139 wake_up_all(ext4_ioend_wq(inode));
87} 140}
88 141
89/* check a range of space and convert unwritten extents to written. */ 142/*
143 * Check a range of space and convert unwritten extents to written. Note that
144 * we are protected from truncate touching the same part of the extent tree by the
145 * fact that truncate code waits for all DIO to finish (thus exclusion from
146 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
147 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
148 * completed (happens from ext4_free_ioend()).
149 */
90static int ext4_end_io(ext4_io_end_t *io) 150static int ext4_end_io(ext4_io_end_t *io)
91{ 151{
92 struct inode *inode = io->inode; 152 struct inode *inode = io->inode;
93 loff_t offset = io->offset; 153 loff_t offset = io->offset;
94 ssize_t size = io->size; 154 ssize_t size = io->size;
155 handle_t *handle = io->handle;
95 int ret = 0; 156 int ret = 0;
96 157
97 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 158 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
98 "list->prev 0x%p\n", 159 "list->prev 0x%p\n",
99 io, inode->i_ino, io->list.next, io->list.prev); 160 io, inode->i_ino, io->list.next, io->list.prev);
100 161
101 ret = ext4_convert_unwritten_extents(inode, offset, size); 162 io->handle = NULL; /* Following call will use up the handle */
163 ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
102 if (ret < 0) { 164 if (ret < 0) {
103 ext4_msg(inode->i_sb, KERN_EMERG, 165 ext4_msg(inode->i_sb, KERN_EMERG,
104 "failed to convert unwritten extents to written " 166 "failed to convert unwritten extents to written "
@@ -111,20 +173,17 @@ static int ext4_end_io(ext4_io_end_t *io)
111 return ret; 173 return ret;
112} 174}
113 175
114static void dump_completed_IO(struct inode *inode) 176static void dump_completed_IO(struct inode *inode, struct list_head *head)
115{ 177{
116#ifdef EXT4FS_DEBUG 178#ifdef EXT4FS_DEBUG
117 struct list_head *cur, *before, *after; 179 struct list_head *cur, *before, *after;
118 ext4_io_end_t *io, *io0, *io1; 180 ext4_io_end_t *io, *io0, *io1;
119 181
120 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { 182 if (list_empty(head))
121 ext4_debug("inode %lu completed_io list is empty\n",
122 inode->i_ino);
123 return; 183 return;
124 }
125 184
126 ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); 185 ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
127 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { 186 list_for_each_entry(io, head, list) {
128 cur = &io->list; 187 cur = &io->list;
129 before = cur->prev; 188 before = cur->prev;
130 io0 = container_of(before, ext4_io_end_t, list); 189 io0 = container_of(before, ext4_io_end_t, list);
@@ -145,16 +204,23 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
145 unsigned long flags; 204 unsigned long flags;
146 205
147 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 206 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
148 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
149
150 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 207 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
151 if (list_empty(&ei->i_completed_io_list)) 208 if (io_end->handle) {
152 queue_work(wq, &ei->i_unwritten_work); 209 wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
153 list_add_tail(&io_end->list, &ei->i_completed_io_list); 210 if (list_empty(&ei->i_rsv_conversion_list))
211 queue_work(wq, &ei->i_rsv_conversion_work);
212 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
213 } else {
214 wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
215 if (list_empty(&ei->i_unrsv_conversion_list))
216 queue_work(wq, &ei->i_unrsv_conversion_work);
217 list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
218 }
154 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 219 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
155} 220}
156 221
157static int ext4_do_flush_completed_IO(struct inode *inode) 222static int ext4_do_flush_completed_IO(struct inode *inode,
223 struct list_head *head)
158{ 224{
159 ext4_io_end_t *io; 225 ext4_io_end_t *io;
160 struct list_head unwritten; 226 struct list_head unwritten;
@@ -163,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
163 int err, ret = 0; 229 int err, ret = 0;
164 230
165 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 231 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
166 dump_completed_IO(inode); 232 dump_completed_IO(inode, head);
167 list_replace_init(&ei->i_completed_io_list, &unwritten); 233 list_replace_init(head, &unwritten);
168 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 234 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
169 235
170 while (!list_empty(&unwritten)) { 236 while (!list_empty(&unwritten)) {
@@ -180,23 +246,20 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
180} 246}
181 247
182/* 248/*
183 * work on completed aio dio IO, to convert unwritten extents to extents 249 * work on completed IO, to convert unwritten extents to written extents
184 */ 250 */
185void ext4_end_io_work(struct work_struct *work) 251void ext4_end_io_rsv_work(struct work_struct *work)
186{ 252{
187 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 253 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
188 i_unwritten_work); 254 i_rsv_conversion_work);
189 ext4_do_flush_completed_IO(&ei->vfs_inode); 255 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
190} 256}
191 257
192int ext4_flush_unwritten_io(struct inode *inode) 258void ext4_end_io_unrsv_work(struct work_struct *work)
193{ 259{
194 int ret; 260 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
195 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && 261 i_unrsv_conversion_work);
196 !(inode->i_state & I_FREEING)); 262 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
197 ret = ext4_do_flush_completed_IO(inode);
198 ext4_unwritten_wait(inode);
199 return ret;
200} 263}
201 264
202ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 265ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -228,8 +291,10 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
228 291
229 if (atomic_dec_and_test(&io_end->count)) { 292 if (atomic_dec_and_test(&io_end->count)) {
230 if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 293 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
231 err = ext4_convert_unwritten_extents(io_end->inode, 294 err = ext4_convert_unwritten_extents(io_end->handle,
232 io_end->offset, io_end->size); 295 io_end->inode, io_end->offset,
296 io_end->size);
297 io_end->handle = NULL;
233 ext4_clear_io_unwritten_flag(io_end); 298 ext4_clear_io_unwritten_flag(io_end);
234 } 299 }
235 ext4_release_io_end(io_end); 300 ext4_release_io_end(io_end);
@@ -243,79 +308,31 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
243 return io_end; 308 return io_end;
244} 309}
245 310
246/*
247 * Print an buffer I/O error compatible with the fs/buffer.c. This
248 * provides compatibility with dmesg scrapers that look for a specific
249 * buffer I/O error message. We really need a unified error reporting
250 * structure to userspace ala Digital Unix's uerf system, but it's
251 * probably not going to happen in my lifetime, due to LKML politics...
252 */
253static void buffer_io_error(struct buffer_head *bh)
254{
255 char b[BDEVNAME_SIZE];
256 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
257 bdevname(bh->b_bdev, b),
258 (unsigned long long)bh->b_blocknr);
259}
260
261static void ext4_end_bio(struct bio *bio, int error) 311static void ext4_end_bio(struct bio *bio, int error)
262{ 312{
263 ext4_io_end_t *io_end = bio->bi_private; 313 ext4_io_end_t *io_end = bio->bi_private;
264 struct inode *inode;
265 int i;
266 int blocksize;
267 sector_t bi_sector = bio->bi_sector; 314 sector_t bi_sector = bio->bi_sector;
268 315
269 BUG_ON(!io_end); 316 BUG_ON(!io_end);
270 inode = io_end->inode;
271 blocksize = 1 << inode->i_blkbits;
272 bio->bi_private = NULL;
273 bio->bi_end_io = NULL; 317 bio->bi_end_io = NULL;
274 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 318 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
275 error = 0; 319 error = 0;
276 for (i = 0; i < bio->bi_vcnt; i++) {
277 struct bio_vec *bvec = &bio->bi_io_vec[i];
278 struct page *page = bvec->bv_page;
279 struct buffer_head *bh, *head;
280 unsigned bio_start = bvec->bv_offset;
281 unsigned bio_end = bio_start + bvec->bv_len;
282 unsigned under_io = 0;
283 unsigned long flags;
284 320
285 if (!page) 321 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
286 continue;
287
288 if (error) {
289 SetPageError(page);
290 set_bit(AS_EIO, &page->mapping->flags);
291 }
292 bh = head = page_buffers(page);
293 /* 322 /*
294 * We check all buffers in the page under BH_Uptodate_Lock 323 * Link bio into list hanging from io_end. We have to do it
295 * to avoid races with other end io clearing async_write flags 324 * atomically as bio completions can be racing against each
325 * other.
296 */ 326 */
297 local_irq_save(flags); 327 bio->bi_private = xchg(&io_end->bio, bio);
298 bit_spin_lock(BH_Uptodate_Lock, &head->b_state); 328 } else {
299 do { 329 ext4_finish_bio(bio);
300 if (bh_offset(bh) < bio_start || 330 bio_put(bio);
301 bh_offset(bh) + blocksize > bio_end) {
302 if (buffer_async_write(bh))
303 under_io++;
304 continue;
305 }
306 clear_buffer_async_write(bh);
307 if (error)
308 buffer_io_error(bh);
309 } while ((bh = bh->b_this_page) != head);
310 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
311 local_irq_restore(flags);
312 if (!under_io)
313 end_page_writeback(page);
314 } 331 }
315 bio_put(bio);
316 332
317 if (error) { 333 if (error) {
318 io_end->flag |= EXT4_IO_END_ERROR; 334 struct inode *inode = io_end->inode;
335
319 ext4_warning(inode->i_sb, "I/O error writing to inode %lu " 336 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
320 "(offset %llu size %ld starting block %llu)", 337 "(offset %llu size %ld starting block %llu)",
321 inode->i_ino, 338 inode->i_ino,
@@ -324,7 +341,6 @@ static void ext4_end_bio(struct bio *bio, int error)
324 (unsigned long long) 341 (unsigned long long)
325 bi_sector >> (inode->i_blkbits - 9)); 342 bi_sector >> (inode->i_blkbits - 9));
326 } 343 }
327
328 ext4_put_io_end_defer(io_end); 344 ext4_put_io_end_defer(io_end);
329} 345}
330 346
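The xchg() in the ext4_end_bio() hunk above implements a lock-free LIFO push: each completing bio atomically swaps itself in as the list head and chains to whatever head it displaced, so racing completions cannot lose each other, and the list is only walked after the last io_end reference is dropped. A user-space analogue (a sketch using GCC/Clang __atomic builtins in place of the kernel's xchg(); node and helper names are local):

#include <stdio.h>

struct node {
        struct node *next;
        int id;
};

/*
 * Swap ourselves in as the new head and chain to the displaced one.
 * Safe against concurrent pushers; the consumer must only walk the
 * list once pushes are done (for io_end: after the last reference).
 */
static void push(struct node **head, struct node *n)
{
        n->next = __atomic_exchange_n(head, n, __ATOMIC_ACQ_REL);
}

int main(void)
{
        struct node *head = NULL;
        struct node a = { NULL, 1 }, b = { NULL, 2 };

        push(&head, &a);
        push(&head, &b);
        for (struct node *n = head; n; n = n->next)
                printf("node %d\n", n->id);
        return 0;
}
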
@@ -356,13 +372,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
356 struct bio *bio; 372 struct bio *bio;
357 373
358 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 374 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
375 if (!bio)
376 return -ENOMEM;
359 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 377 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
360 bio->bi_bdev = bh->b_bdev; 378 bio->bi_bdev = bh->b_bdev;
361 bio->bi_end_io = ext4_end_bio; 379 bio->bi_end_io = ext4_end_bio;
362 bio->bi_private = ext4_get_io_end(io->io_end); 380 bio->bi_private = ext4_get_io_end(io->io_end);
363 if (!io->io_end->size)
364 io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
365 + bh_offset(bh);
366 io->io_bio = bio; 381 io->io_bio = bio;
367 io->io_next_block = bh->b_blocknr; 382 io->io_next_block = bh->b_blocknr;
368 return 0; 383 return 0;
@@ -372,7 +387,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
372 struct inode *inode, 387 struct inode *inode,
373 struct buffer_head *bh) 388 struct buffer_head *bh)
374{ 389{
375 ext4_io_end_t *io_end;
376 int ret; 390 int ret;
377 391
378 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 392 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -387,10 +401,6 @@ submit_and_retry:
387 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 401 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
388 if (ret != bh->b_size) 402 if (ret != bh->b_size)
389 goto submit_and_retry; 403 goto submit_and_retry;
390 io_end = io->io_end;
391 if (test_clear_buffer_uninit(bh))
392 ext4_set_io_unwritten_flag(inode, io_end);
393 io_end->size += bh->b_size;
394 io->io_next_block++; 404 io->io_next_block++;
395 return 0; 405 return 0;
396} 406}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d01965..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
79 ext4_fsblk_t end = start + input->blocks_count; 79 ext4_fsblk_t end = start + input->blocks_count;
80 ext4_group_t group = input->group; 80 ext4_group_t group = input->group;
81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
82 unsigned overhead = ext4_group_overhead_blocks(sb, group); 82 unsigned overhead;
83 ext4_fsblk_t metaend = start + overhead; 83 ext4_fsblk_t metaend;
84 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
85 ext4_grpblk_t free_blocks_count, offset; 85 ext4_grpblk_t free_blocks_count, offset;
86 int err = -EINVAL; 86 int err = -EINVAL;
87 87
88 if (group != sbi->s_groups_count) {
89 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
90 input->group, sbi->s_groups_count);
91 return -EINVAL;
92 }
93
94 overhead = ext4_group_overhead_blocks(sb, group);
95 metaend = start + overhead;
88 input->free_blocks_count = free_blocks_count = 96 input->free_blocks_count = free_blocks_count =
89 input->blocks_count - 2 - overhead - sbi->s_itb_per_group; 97 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
90 98
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
96 free_blocks_count, input->reserved_blocks); 104 free_blocks_count, input->reserved_blocks);
97 105
98 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 106 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
99 if (group != sbi->s_groups_count) 107 if (offset != 0)
100 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
101 input->group, sbi->s_groups_count);
102 else if (offset != 0)
103 ext4_warning(sb, "Last group not full"); 108 ext4_warning(sb, "Last group not full");
104 else if (input->reserved_blocks > input->blocks_count / 5) 109 else if (input->reserved_blocks > input->blocks_count / 5)
105 ext4_warning(sb, "Reserved blocks too high (%u)", 110 ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
1551 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 1556 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
1552 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1557 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
1553 struct inode *inode = NULL; 1558 struct inode *inode = NULL;
1554 int gdb_off, gdb_num; 1559 int gdb_off;
1555 int err; 1560 int err;
1556 __u16 bg_flags = 0; 1561 __u16 bg_flags = 0;
1557 1562
1558 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
1559 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); 1563 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
1560 1564
1561 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 1565 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@ errout:
1656 err = err2; 1660 err = err2;
1657 1661
1658 if (!err) { 1662 if (!err) {
1659 ext4_fsblk_t first_block;
1660 first_block = ext4_group_first_block_no(sb, 0);
1661 if (test_opt(sb, DEBUG)) 1663 if (test_opt(sb, DEBUG))
1662 printk(KERN_DEBUG "EXT4-fs: extended group to %llu " 1664 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1663 "blocks\n", ext4_blocks_count(es)); 1665 "blocks\n", ext4_blocks_count(es));
1664 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, 1666 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
1665 (char *)es, sizeof(struct ext4_super_block), 0); 1667 (char *)es, sizeof(struct ext4_super_block), 0);
1666 } 1668 }
1667 return err; 1669 return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84db7c9a..85b3dd60169b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
69static void ext4_clear_journal_err(struct super_block *sb, 69static void ext4_clear_journal_err(struct super_block *sb,
70 struct ext4_super_block *es); 70 struct ext4_super_block *es);
71static int ext4_sync_fs(struct super_block *sb, int wait); 71static int ext4_sync_fs(struct super_block *sb, int wait);
72static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
72static int ext4_remount(struct super_block *sb, int *flags, char *data); 73static int ext4_remount(struct super_block *sb, int *flags, char *data);
73static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 74static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
74static int ext4_unfreeze(struct super_block *sb); 75static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
398 } 399 }
399 if (test_opt(sb, ERRORS_RO)) { 400 if (test_opt(sb, ERRORS_RO)) {
400 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 401 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
402 /*
403 * Make sure updated value of ->s_mount_flags will be visible
404 * before ->s_flags update
405 */
406 smp_wmb();
401 sb->s_flags |= MS_RDONLY; 407 sb->s_flags |= MS_RDONLY;
402 } 408 }
403 if (test_opt(sb, ERRORS_PANIC)) 409 if (test_opt(sb, ERRORS_PANIC))
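The smp_wmb() added here (and in the __ext4_abort() hunk below) orders the s_mount_flags store before the MS_RDONLY store, so a reader that observes the read-only flag also observes EXT4_MF_FS_ABORTED, provided it issues the pairing read barrier on its side. A C11-fence analogue of the pairing (illustrative only; the kernel uses smp_wmb()/smp_rmb() and the variable names stand in for the superblock fields):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int mount_flags; /* stands in for s_mount_flags */
static _Atomic int s_flags;     /* stands in for sb->s_flags */

static void writer(void)        /* the ext4_handle_error() side */
{
        atomic_store_explicit(&mount_flags, 1, memory_order_relaxed);
        /* smp_wmb(): the flag store must not be reordered after
         * the read-only flip */
        atomic_thread_fence(memory_order_release);
        atomic_store_explicit(&s_flags, 1, memory_order_relaxed);
}

static void reader(void)        /* any path testing MS_RDONLY */
{
        if (atomic_load_explicit(&s_flags, memory_order_relaxed)) {
                /* pairs with the writer's fence (smp_rmb() in
                 * kernel terms) before reading mount_flags */
                atomic_thread_fence(memory_order_acquire);
                printf("aborted=%d\n",
                       atomic_load_explicit(&mount_flags,
                                            memory_order_relaxed));
        }
}

int main(void)
{
        writer();
        reader();
        return 0;
}
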
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
422 ext4_handle_error(sb); 428 ext4_handle_error(sb);
423} 429}
424 430
425void ext4_error_inode(struct inode *inode, const char *function, 431void __ext4_error_inode(struct inode *inode, const char *function,
426 unsigned int line, ext4_fsblk_t block, 432 unsigned int line, ext4_fsblk_t block,
427 const char *fmt, ...) 433 const char *fmt, ...)
428{ 434{
429 va_list args; 435 va_list args;
430 struct va_format vaf; 436 struct va_format vaf;
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
451 ext4_handle_error(inode->i_sb); 457 ext4_handle_error(inode->i_sb);
452} 458}
453 459
454void ext4_error_file(struct file *file, const char *function, 460void __ext4_error_file(struct file *file, const char *function,
455 unsigned int line, ext4_fsblk_t block, 461 unsigned int line, ext4_fsblk_t block,
456 const char *fmt, ...) 462 const char *fmt, ...)
457{ 463{
458 va_list args; 464 va_list args;
459 struct va_format vaf; 465 struct va_format vaf;
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
570 576
571 if ((sb->s_flags & MS_RDONLY) == 0) { 577 if ((sb->s_flags & MS_RDONLY) == 0) {
572 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 578 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
573 sb->s_flags |= MS_RDONLY;
574 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; 579 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
580 /*
581 * Make sure updated value of ->s_mount_flags will be visible
582 * before ->s_flags update
583 */
584 smp_wmb();
585 sb->s_flags |= MS_RDONLY;
575 if (EXT4_SB(sb)->s_journal) 586 if (EXT4_SB(sb)->s_journal)
576 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); 587 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
577 save_error_info(sb, function, line); 588 save_error_info(sb, function, line);
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
580 panic("EXT4-fs panic from previous error\n"); 591 panic("EXT4-fs panic from previous error\n");
581} 592}
582 593
583void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 594void __ext4_msg(struct super_block *sb,
595 const char *prefix, const char *fmt, ...)
584{ 596{
585 struct va_format vaf; 597 struct va_format vaf;
586 va_list args; 598 va_list args;
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
750 ext4_unregister_li_request(sb); 762 ext4_unregister_li_request(sb);
751 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 763 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
752 764
753 flush_workqueue(sbi->dio_unwritten_wq); 765 flush_workqueue(sbi->unrsv_conversion_wq);
754 destroy_workqueue(sbi->dio_unwritten_wq); 766 flush_workqueue(sbi->rsv_conversion_wq);
767 destroy_workqueue(sbi->unrsv_conversion_wq);
768 destroy_workqueue(sbi->rsv_conversion_wq);
755 769
756 if (sbi->s_journal) { 770 if (sbi->s_journal) {
757 err = jbd2_journal_destroy(sbi->s_journal); 771 err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
760 ext4_abort(sb, "Couldn't clean up the journal"); 774 ext4_abort(sb, "Couldn't clean up the journal");
761 } 775 }
762 776
763 ext4_es_unregister_shrinker(sb); 777 ext4_es_unregister_shrinker(sbi);
764 del_timer(&sbi->s_err_report); 778 del_timer(&sbi->s_err_report);
765 ext4_release_system_zone(sb); 779 ext4_release_system_zone(sb);
766 ext4_mb_release(sb); 780 ext4_mb_release(sb);
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
849 rwlock_init(&ei->i_es_lock); 863 rwlock_init(&ei->i_es_lock);
850 INIT_LIST_HEAD(&ei->i_es_lru); 864 INIT_LIST_HEAD(&ei->i_es_lru);
851 ei->i_es_lru_nr = 0; 865 ei->i_es_lru_nr = 0;
866 ei->i_touch_when = 0;
852 ei->i_reserved_data_blocks = 0; 867 ei->i_reserved_data_blocks = 0;
853 ei->i_reserved_meta_blocks = 0; 868 ei->i_reserved_meta_blocks = 0;
854 ei->i_allocated_meta_blocks = 0; 869 ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
859 ei->i_reserved_quota = 0; 874 ei->i_reserved_quota = 0;
860#endif 875#endif
861 ei->jinode = NULL; 876 ei->jinode = NULL;
862 INIT_LIST_HEAD(&ei->i_completed_io_list); 877 INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
878 INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
863 spin_lock_init(&ei->i_completed_io_lock); 879 spin_lock_init(&ei->i_completed_io_lock);
864 ei->i_sync_tid = 0; 880 ei->i_sync_tid = 0;
865 ei->i_datasync_tid = 0; 881 ei->i_datasync_tid = 0;
866 atomic_set(&ei->i_ioend_count, 0); 882 atomic_set(&ei->i_ioend_count, 0);
867 atomic_set(&ei->i_unwritten, 0); 883 atomic_set(&ei->i_unwritten, 0);
868 INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); 884 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
885 INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
869 886
870 return &ei->vfs_inode; 887 return &ei->vfs_inode;
871} 888}
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
1093 .dirty_inode = ext4_dirty_inode, 1110 .dirty_inode = ext4_dirty_inode,
1094 .drop_inode = ext4_drop_inode, 1111 .drop_inode = ext4_drop_inode,
1095 .evict_inode = ext4_evict_inode, 1112 .evict_inode = ext4_evict_inode,
1113 .sync_fs = ext4_sync_fs_nojournal,
1096 .put_super = ext4_put_super, 1114 .put_super = ext4_put_super,
1097 .statfs = ext4_statfs, 1115 .statfs = ext4_statfs,
1098 .remount_fs = ext4_remount, 1116 .remount_fs = ext4_remount,
@@ -1908,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
 	ext4_group_t flex_group;
-	unsigned int groups_per_flex = 0;
 	int i, err;
 
 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		sbi->s_log_groups_per_flex = 0;
 		return 1;
 	}
-	groups_per_flex = 1U << sbi->s_log_groups_per_flex;
 
 	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
 	if (err)
@@ -2164,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 		dquot_initialize(inode);
 		if (inode->i_nlink) {
-			ext4_msg(sb, KERN_DEBUG,
-				"%s: truncating inode %lu to %lld bytes",
-				__func__, inode->i_ino, inode->i_size);
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: truncating inode %lu to %lld bytes",
+					__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			mutex_lock(&inode->i_mutex);
+			truncate_inode_pages(inode->i_mapping, inode->i_size);
 			ext4_truncate(inode);
 			mutex_unlock(&inode->i_mutex);
 			nr_truncates++;
 		} else {
-			ext4_msg(sb, KERN_DEBUG,
-				"%s: deleting unreferenced inode %lu",
-				__func__, inode->i_ino);
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: deleting unreferenced inode %lu",
+					__func__, inode->i_ino);
 			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
 			nr_orphans++;
@@ -2377,7 +2396,10 @@ struct ext4_attr {
 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
 	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 			 const char *, size_t);
-	int offset;
+	union {
+		int offset;
+		int deprecated_val;
+	} u;
 };
 
 static int parse_strtoull(const char *buf,
@@ -2446,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 static ssize_t sbi_ui_show(struct ext4_attr *a,
 			   struct ext4_sb_info *sbi, char *buf)
 {
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 
 	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
 }
@@ -2455,7 +2477,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
 			    struct ext4_sb_info *sbi,
 			    const char *buf, size_t count)
 {
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 	unsigned long t;
 	int ret;
 
@@ -2504,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
 	return count;
 }
 
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,					\
 	.store	= _store,					\
-	.offset = offsetof(struct ext4_sb_info, _elname),	\
+	.u = {							\
+		.offset = offsetof(struct ext4_sb_info, _elname),\
+	},							\
 }
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2550,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 #define EXT4_RW_ATTR_SBI_UI(name, elname)	\
 	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
 #define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val)	\
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = 0444 },	\
+	.show	= sbi_deprecated_show,				\
+	.u = {							\
+		.deprecated_val = _val,				\
+	},							\
+}
 
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
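
The union gives struct ext4_attr a single slot holding either a live byte offset into ext4_sb_info (used by sbi_ui_show/sbi_ui_store) or a canned value for a retired tunable (used by sbi_deprecated_show); which member is meaningful depends only on the show/store callbacks the macro wires up. A minimal userspace sketch of the same pattern, with illustrative names rather than ext4's:

    #include <stddef.h>
    #include <stdio.h>

    struct sb_info { unsigned int extent_max_zeroout_kb; };

    struct attr {
        const char *name;
        int (*show)(const struct attr *, const struct sb_info *, char *, size_t);
        union {
            size_t offset;       /* live field inside sb_info */
            int deprecated_val;  /* fixed reply for a retired knob */
        } u;
    };

    static int ui_show(const struct attr *a, const struct sb_info *s,
                       char *buf, size_t len)
    {
        const unsigned int *ui =
            (const unsigned int *)((const char *)s + a->u.offset);
        return snprintf(buf, len, "%u\n", *ui);
    }

    static int deprecated_show(const struct attr *a, const struct sb_info *s,
                               char *buf, size_t len)
    {
        (void)s;                 /* deprecated attrs ignore the superblock */
        return snprintf(buf, len, "%d\n", a->u.deprecated_val);
    }

    int main(void)
    {
        struct sb_info s = { .extent_max_zeroout_kb = 32 };
        struct attr live = { "extent_max_zeroout_kb", ui_show,
            { .offset = offsetof(struct sb_info, extent_max_zeroout_kb) } };
        struct attr dead = { "max_writeback_mb_bump", deprecated_show,
            { .deprecated_val = 128 } };
        char buf[32];

        live.show(&live, &s, buf, sizeof(buf));
        printf("%s = %s", live.name, buf);
        dead.show(&dead, &s, buf, sizeof(buf));
        printf("%s = %s", dead.name, buf);   /* always reports 128 */
        return 0;
    }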
@@ -2534,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 
@@ -3763,7 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_err_report.data = (unsigned long) sb;
 
 	/* Register extent status tree shrinker */
-	ext4_es_register_shrinker(sb);
+	ext4_es_register_shrinker(sbi);
 
 	err = percpu_counter_init(&sbi->s_freeclusters_counter,
 			ext4_count_free_clusters(sb));
@@ -3787,7 +3825,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
-	sbi->s_max_writeback_mb_bump = 128;
 	sbi->s_extent_max_zeroout_kb = 32;
 
 	/*
@@ -3915,12 +3952,20 @@ no_journal:
 	 * The maximum number of concurrent works can be high and
 	 * concurrency isn't really necessary. Limit it to 1.
 	 */
-	EXT4_SB(sb)->dio_unwritten_wq =
-		alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
-	if (!EXT4_SB(sb)->dio_unwritten_wq) {
-		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
-		ret = -ENOMEM;
-		goto failed_mount_wq;
+	EXT4_SB(sb)->rsv_conversion_wq =
+		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->rsv_conversion_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+		ret = -ENOMEM;
+		goto failed_mount4;
+	}
+
+	EXT4_SB(sb)->unrsv_conversion_wq =
+		alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->unrsv_conversion_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+		ret = -ENOMEM;
+		goto failed_mount4;
 	}
 
 	/*
@@ -4074,14 +4119,17 @@ failed_mount4a:
 	sb->s_root = NULL;
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
-	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+	if (EXT4_SB(sb)->rsv_conversion_wq)
+		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+	if (EXT4_SB(sb)->unrsv_conversion_wq)
+		destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
 failed_mount_wq:
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
 failed_mount3:
-	ext4_es_unregister_shrinker(sb);
+	ext4_es_unregister_shrinker(sbi);
 	del_timer(&sbi->s_err_report);
 	if (sbi->s_flex_groups)
 		ext4_kvfree(sbi->s_flex_groups);
@@ -4517,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	int ret = 0;
 	tid_t target;
+	bool needs_barrier = false;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	trace_ext4_sync_fs(sb, wait);
-	flush_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->rsv_conversion_wq);
+	flush_workqueue(sbi->unrsv_conversion_wq);
 	/*
 	 * Writeback quota in non-journalled quota case - journalled quota has
 	 * no dirty dquots
 	 */
 	dquot_writeback_dquots(sb, -1);
+	/*
+	 * Data writeback is possible w/o journal transaction, so the barrier
+	 * must be sent at the end of the function. But we can skip it if
+	 * transaction_commit will do it for us.
+	 */
+	target = jbd2_get_latest_transaction(sbi->s_journal);
+	if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+		needs_barrier = true;
+
 	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
 		if (wait)
-			jbd2_log_wait_commit(sbi->s_journal, target);
+			ret = jbd2_log_wait_commit(sbi->s_journal, target);
+	}
+	if (needs_barrier) {
+		int err;
+		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+		if (!ret)
+			ret = err;
 	}
+
+	return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+	int ret = 0;
+
+	trace_ext4_sync_fs(sb, wait);
+	flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+	flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
+	dquot_writeback_dquots(sb, -1);
+	if (wait && test_opt(sb, BARRIER))
+		ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
 	return ret;
 }
 
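Seen together, the journalled and no-journal paths reduce the barrier question to one predicate: flush only for a waiting sync on a barrier-enabled mount, and skip it when a journal commit will issue the flush anyway. A compact userspace model of that decision (illustrative names, not the jbd2 interface):

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Model of the decision above: an explicit disk-cache flush is needed
     * only for a waiting sync, on a barrier-enabled mount, and only when
     * no journal commit will send the flush on our behalf (the no-journal
     * path is the special case commit_sends_barrier == false).
     */
    static bool needs_explicit_flush(bool wait, bool barrier_enabled,
                                     bool commit_sends_barrier)
    {
        return wait && barrier_enabled && !commit_sends_barrier;
    }

    int main(void)
    {
        printf("%d\n", needs_explicit_flush(true, true, true));   /* 0: commit flushes */
        printf("%d\n", needs_explicit_flush(true, true, false));  /* 1: flush ourselves */
        printf("%d\n", needs_explicit_flush(false, true, false)); /* 0: async sync */
        return 0;
    }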
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index fd27e7e6326e..e06e0995e00f 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+config F2FS_FS_SECURITY
+	bool "F2FS Security Labels"
+	depends on F2FS_FS_XATTR
+	help
+	  Security labels provide an access control facility to support Linux
+	  Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+	  Linux. This option enables an extended attribute handler for file
+	  security labels in the f2fs filesystem, so that it requires enabling
+	  the extended attribute support in advance.
+
+	  If you are not using a security module, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 44abc2f286e0..b7826ec1b470 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		}
 	}
 
-	error = f2fs_setxattr(inode, name_index, "", value, size);
+	error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
 
 	kfree(value);
 	if (!error)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b1de01da1a40..66a6b85a51d8 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -357,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	unsigned long blk_size = sbi->blocksize;
 	struct f2fs_checkpoint *cp_block;
 	unsigned long long cur_version = 0, pre_version = 0;
-	unsigned int crc = 0;
 	size_t crc_offset;
+	__u32 crc = 0;
 
 	/* Read the 1st cp block in this CP pack */
 	cp_page_1 = get_meta_page(sbi, cp_addr);
@@ -369,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	if (crc_offset >= blk_size)
 		goto invalid_cp1;
 
-	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp1;
 
@@ -384,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	if (crc_offset >= blk_size)
 		goto invalid_cp2;
 
-	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp2;
 
@@ -450,13 +450,30 @@ fail_no_cp:
 	return -EINVAL;
 }
 
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct list_head *head = &sbi->dir_inode_list;
-	struct dir_inode_entry *new;
 	struct list_head *this;
 
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode == inode)
+			return -EEXIST;
+	}
+	list_add_tail(&new->list, head);
+#ifdef CONFIG_F2FS_STAT_FS
+	sbi->n_dirty_dirs++;
+#endif
+	return 0;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct dir_inode_entry *new;
+
 	if (!S_ISDIR(inode->i_mode))
 		return;
 retry:
@@ -469,23 +486,31 @@ retry:
 	INIT_LIST_HEAD(&new->list);
 
 	spin_lock(&sbi->dir_inode_lock);
-	list_for_each(this, head) {
-		struct dir_inode_entry *entry;
-		entry = list_entry(this, struct dir_inode_entry, list);
-		if (entry->inode == inode) {
-			kmem_cache_free(inode_entry_slab, new);
-			goto out;
-		}
-	}
-	list_add_tail(&new->list, head);
-	sbi->n_dirty_dirs++;
+	if (__add_dirty_inode(inode, new))
+		kmem_cache_free(inode_entry_slab, new);
 
-	BUG_ON(!S_ISDIR(inode->i_mode));
-out:
 	inc_page_count(sbi, F2FS_DIRTY_DENTS);
 	inode_inc_dirty_dents(inode);
 	SetPagePrivate(page);
+	spin_unlock(&sbi->dir_inode_lock);
+}
 
+void add_dirty_dir_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct dir_inode_entry *new;
+retry:
+	new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+	if (!new) {
+		cond_resched();
+		goto retry;
+	}
+	new->inode = inode;
+	INIT_LIST_HEAD(&new->list);
+
+	spin_lock(&sbi->dir_inode_lock);
+	if (__add_dirty_inode(inode, new))
+		kmem_cache_free(inode_entry_slab, new);
 	spin_unlock(&sbi->dir_inode_lock);
 }
 
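The refactor pulls duplicate detection into __add_dirty_inode() so both callers follow the same shape: allocate outside the spinlock, try to insert under it, and free the unused node if the inode was already listed. The same allocate-then-insert-under-lock pattern in userspace (hypothetical names, a pthread mutex standing in for dir_inode_lock):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
        int ino;
        struct entry *next;
    };

    static struct entry *head;
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Caller holds list_lock; returns -1 if ino is already present. */
    static int __add_dirty(struct entry *new)
    {
        struct entry *e;

        for (e = head; e; e = e->next)
            if (e->ino == new->ino)
                return -1;
        new->next = head;
        head = new;
        return 0;
    }

    static void add_dirty(int ino)
    {
        /* Allocate before taking the lock, as set_dirty_dir_page() does. */
        struct entry *new = malloc(sizeof(*new));

        if (!new)
            return;
        new->ino = ino;

        pthread_mutex_lock(&list_lock);
        if (__add_dirty(new))
            free(new);          /* duplicate: drop the unused node */
        pthread_mutex_unlock(&list_lock);
    }

    int main(void)
    {
        add_dirty(2);
        add_dirty(2);           /* second insert is a no-op */
        add_dirty(3);
        for (struct entry *e = head; e; e = e->next)
            printf("dirty dir %d\n", e->ino);
        return 0;
    }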
@@ -499,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode)
 		return;
 
 	spin_lock(&sbi->dir_inode_lock);
-	if (atomic_read(&F2FS_I(inode)->dirty_dents))
-		goto out;
+	if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+		spin_unlock(&sbi->dir_inode_lock);
+		return;
+	}
 
 	list_for_each(this, head) {
 		struct dir_inode_entry *entry;
@@ -508,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode)
 		if (entry->inode == inode) {
 			list_del(&entry->list);
 			kmem_cache_free(inode_entry_slab, entry);
+#ifdef CONFIG_F2FS_STAT_FS
 			sbi->n_dirty_dirs--;
+#endif
+			break;
+		}
+	}
+	spin_unlock(&sbi->dir_inode_lock);
+
+	/* Only from the recovery routine */
+	if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
+		clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+		iput(inode);
+	}
+}
+
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct list_head *head = &sbi->dir_inode_list;
+	struct list_head *this;
+	struct inode *inode = NULL;
+
+	spin_lock(&sbi->dir_inode_lock);
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode->i_ino == ino) {
+			inode = entry->inode;
 			break;
 		}
 	}
-out:
 	spin_unlock(&sbi->dir_inode_lock);
+	return inode;
 }
 
 void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
@@ -595,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	block_t start_blk;
 	struct page *cp_page;
 	unsigned int data_sum_blocks, orphan_blocks;
-	unsigned int crc32 = 0;
+	__u32 crc32 = 0;
 	void *kaddr;
 	int i;
 
@@ -664,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
 
 	crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
-	*(__le32 *)((unsigned char *)ckpt +
-				le32_to_cpu(ckpt->checksum_offset))
+	*((__le32 *)((unsigned char *)ckpt +
+				le32_to_cpu(ckpt->checksum_offset)))
 				= cpu_to_le32(crc32);
 
 	start_blk = __start_cp_addr(sbi);
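With the store side casting through __le32 and the load side wrapped in le32_to_cpu(), the checkpoint CRC is now byte-order stable: a checkpoint written on a little-endian host validates on a big-endian one. A self-contained sketch of that store/load discipline, using byte-wise helpers as stand-ins for cpu_to_le32()/le32_to_cpu() (the buffer layout and offset here are made up for illustration):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Byte-wise little-endian store/load: correct on hosts of either
     * endianness, which is exactly what the kernel helpers guarantee. */
    static void put_le32(uint8_t *p, uint32_t v)
    {
        p[0] = v; p[1] = v >> 8; p[2] = v >> 16; p[3] = v >> 24;
    }

    static uint32_t get_le32(const uint8_t *p)
    {
        return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
               (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
    }

    int main(void)
    {
        uint8_t block[64];              /* pretend checkpoint block */
        size_t crc_offset = 60;         /* pretend checksum_offset */
        uint32_t crc = 0xdeadbeef;      /* pretend f2fs_crc32() result */

        memset(block, 0, sizeof(block));
        put_le32(block + crc_offset, crc);          /* cpu_to_le32 on write */

        uint32_t read_back = get_le32(block + crc_offset); /* le32_to_cpu */
        printf("%s\n", read_back == crc ? "crc matches" : "crc mismatch");
        return 0;
    }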
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 91ff93b0b0f4..035f9a345cdf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -68,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 					struct buffer_head *bh_result)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
+#ifdef CONFIG_F2FS_STAT_FS
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+#endif
 	pgoff_t start_fofs, end_fofs;
 	block_t start_blkaddr;
 
@@ -78,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 		return 0;
 	}
 
+#ifdef CONFIG_F2FS_STAT_FS
 	sbi->total_hit_ext++;
+#endif
 	start_fofs = fi->ext.fofs;
 	end_fofs = fi->ext.fofs + fi->ext.len - 1;
 	start_blkaddr = fi->ext.blk_addr;
@@ -96,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 		else
 			bh_result->b_size = UINT_MAX;
 
+#ifdef CONFIG_F2FS_STAT_FS
 		sbi->read_hit_ext++;
+#endif
 		read_unlock(&fi->ext.ext_lock);
 		return 1;
 	}
@@ -199,7 +205,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 	if (dn.data_blkaddr == NEW_ADDR)
 		return ERR_PTR(-EINVAL);
 
-	page = grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
@@ -233,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 	struct page *page;
 	int err;
 
+repeat:
+	page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
-	if (err)
+	if (err) {
+		f2fs_put_page(page, 1);
 		return ERR_PTR(err);
+	}
 	f2fs_put_dnode(&dn);
 
-	if (dn.data_blkaddr == NULL_ADDR)
+	if (dn.data_blkaddr == NULL_ADDR) {
+		f2fs_put_page(page, 1);
 		return ERR_PTR(-ENOENT);
-repeat:
-	page = grab_cache_page(mapping, index);
-	if (!page)
-		return ERR_PTR(-ENOMEM);
+	}
 
 	if (PageUptodate(page))
 		return page;
@@ -274,9 +285,10 @@ repeat:
 *
 * Also, caller should grab and release a mutex by calling mutex_lock_op() and
 * mutex_unlock_op().
+ * Note that, npage is set only by make_empty_dir.
 */
-struct page *get_new_data_page(struct inode *inode, pgoff_t index,
-						bool new_i_size)
+struct page *get_new_data_page(struct inode *inode,
+		struct page *npage, pgoff_t index, bool new_i_size)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
@@ -284,18 +296,20 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
 	struct dnode_of_data dn;
 	int err;
 
-	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	set_new_dnode(&dn, inode, npage, npage, 0);
 	err = get_dnode_of_data(&dn, index, ALLOC_NODE);
 	if (err)
 		return ERR_PTR(err);
 
 	if (dn.data_blkaddr == NULL_ADDR) {
 		if (reserve_new_block(&dn)) {
-			f2fs_put_dnode(&dn);
+			if (!npage)
+				f2fs_put_dnode(&dn);
 			return ERR_PTR(-ENOSPC);
 		}
 	}
-	f2fs_put_dnode(&dn);
+	if (!npage)
+		f2fs_put_dnode(&dn);
 repeat:
 	page = grab_cache_page(mapping, index);
 	if (!page)
@@ -325,6 +339,8 @@ repeat:
 	if (new_i_size &&
 		i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
 		i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+		/* Only the directory inode sets new_i_size */
+		set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
 		mark_inode_dirty_sync(inode);
 	}
 	return page;
@@ -481,8 +497,9 @@ int do_write_data_page(struct page *page)
 	 * If current allocation needs SSR,
 	 * it had better in-place writes for updated data.
 	 */
-	if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
-				need_inplace_update(inode)) {
+	if (unlikely(old_blk_addr != NEW_ADDR &&
+			!is_cold_data(page) &&
+			need_inplace_update(inode))) {
 		rewrite_data_page(F2FS_SB(inode->i_sb), page,
 						old_blk_addr);
 	} else {
@@ -684,6 +701,27 @@ err:
 	return err;
 }
 
+static int f2fs_write_end(struct file *file,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+
+	SetPageUptodate(page);
+	set_page_dirty(page);
+
+	if (pos + copied > i_size_read(inode)) {
+		i_size_write(inode, pos + copied);
+		mark_inode_dirty(inode);
+		update_inode_page(inode);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+	return copied;
+}
+
 static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 		const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
@@ -698,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 							get_data_block_ro);
 }
 
-static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
+				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -740,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = {
 	.writepage	= f2fs_write_data_page,
 	.writepages	= f2fs_write_data_pages,
 	.write_begin	= f2fs_write_begin,
-	.write_end	= nobh_write_end,
+	.write_end	= f2fs_write_end,
 	.set_page_dirty	= f2fs_set_data_page_dirty,
 	.invalidatepage	= f2fs_invalidate_data_page,
 	.releasepage	= f2fs_release_data_page,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8d9943786c31..0d6c6aafb235 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -175,12 +175,12 @@ get_cache:
 
 static int stat_show(struct seq_file *s, void *v)
 {
-	struct f2fs_stat_info *si, *next;
+	struct f2fs_stat_info *si;
 	int i = 0;
 	int j;
 
 	mutex_lock(&f2fs_stat_mutex);
-	list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+	list_for_each_entry(si, &f2fs_stat_list, stat_list) {
 		char devname[BDEVNAME_SIZE];
 
 		update_general_status(si->sbi);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 1ac6b93036b7..9d1cd423450d 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -13,6 +13,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "acl.h"
+#include "xattr.h"
 
 static unsigned long dir_blocks(struct inode *inode)
 {
@@ -215,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 
 struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
 {
-	struct page *page = NULL;
-	struct f2fs_dir_entry *de = NULL;
-	struct f2fs_dentry_block *dentry_blk = NULL;
+	struct page *page;
+	struct f2fs_dir_entry *de;
+	struct f2fs_dentry_block *dentry_blk;
 
 	page = get_lock_data_page(dir, 0);
 	if (IS_ERR(page))
@@ -264,15 +265,10 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	f2fs_put_page(page, 1);
 }
 
-void init_dent_inode(const struct qstr *name, struct page *ipage)
+static void init_dent_inode(const struct qstr *name, struct page *ipage)
 {
 	struct f2fs_node *rn;
 
-	if (IS_ERR(ipage))
-		return;
-
-	wait_on_page_writeback(ipage);
-
 	/* copy name info. to this inode page */
 	rn = (struct f2fs_node *)page_address(ipage);
 	rn->i.i_namelen = cpu_to_le32(name->len);
@@ -280,14 +276,15 @@ void init_dent_inode(const struct qstr *name, struct page *ipage)
 	set_page_dirty(ipage);
 }
 
-static int make_empty_dir(struct inode *inode, struct inode *parent)
+static int make_empty_dir(struct inode *inode,
+		struct inode *parent, struct page *page)
 {
 	struct page *dentry_page;
 	struct f2fs_dentry_block *dentry_blk;
 	struct f2fs_dir_entry *de;
 	void *kaddr;
 
-	dentry_page = get_new_data_page(inode, 0, true);
+	dentry_page = get_new_data_page(inode, page, 0, true);
 	if (IS_ERR(dentry_page))
 		return PTR_ERR(dentry_page);
 
@@ -317,63 +314,76 @@ static int make_empty_dir(struct inode *inode, struct inode *parent)
 	return 0;
 }
 
-static int init_inode_metadata(struct inode *inode,
+static struct page *init_inode_metadata(struct inode *inode,
 		struct inode *dir, const struct qstr *name)
 {
+	struct page *page;
+	int err;
+
 	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
-		int err;
-		err = new_inode_page(inode, name);
-		if (err)
-			return err;
+		page = new_inode_page(inode, name);
+		if (IS_ERR(page))
+			return page;
 
 		if (S_ISDIR(inode->i_mode)) {
-			err = make_empty_dir(inode, dir);
-			if (err) {
-				remove_inode_page(inode);
-				return err;
-			}
+			err = make_empty_dir(inode, dir, page);
+			if (err)
+				goto error;
 		}
 
 		err = f2fs_init_acl(inode, dir);
-		if (err) {
-			remove_inode_page(inode);
-			return err;
-		}
+		if (err)
+			goto error;
+
+		err = f2fs_init_security(inode, dir, name, page);
+		if (err)
+			goto error;
+
+		wait_on_page_writeback(page);
 	} else {
-		struct page *ipage;
-		ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
-		if (IS_ERR(ipage))
-			return PTR_ERR(ipage);
-		set_cold_node(inode, ipage);
-		init_dent_inode(name, ipage);
-		f2fs_put_page(ipage, 1);
+		page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+		if (IS_ERR(page))
+			return page;
+
+		wait_on_page_writeback(page);
+		set_cold_node(inode, page);
 	}
+
+	init_dent_inode(name, page);
+
+	/*
+	 * This file should be checkpointed during fsync.
+	 * We lost i_pino from now on.
+	 */
 	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+		file_lost_pino(inode);
 		inc_nlink(inode);
-		update_inode_page(inode);
 	}
-	return 0;
+	return page;
+
+error:
+	f2fs_put_page(page, 1);
+	remove_inode_page(inode);
+	return ERR_PTR(err);
 }
 
 static void update_parent_metadata(struct inode *dir, struct inode *inode,
 					unsigned int current_depth)
 {
-	bool need_dir_update = false;
-
 	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
 		if (S_ISDIR(inode->i_mode)) {
 			inc_nlink(dir);
-			need_dir_update = true;
+			set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 		}
 		clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
 	}
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	if (F2FS_I(dir)->i_current_depth != current_depth) {
 		F2FS_I(dir)->i_current_depth = current_depth;
-		need_dir_update = true;
+		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	}
 
-	if (need_dir_update)
+	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
 		update_inode_page(dir);
 	else
 		mark_inode_dirty(dir);
@@ -423,6 +433,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
 	struct page *dentry_page = NULL;
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	int slots = GET_DENTRY_SLOTS(namelen);
+	struct page *page;
 	int err = 0;
 	int i;
 
@@ -448,7 +459,7 @@ start:
 	bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
 
 	for (block = bidx; block <= (bidx + nblock - 1); block++) {
-		dentry_page = get_new_data_page(dir, block, true);
+		dentry_page = get_new_data_page(dir, NULL, block, true);
 		if (IS_ERR(dentry_page))
 			return PTR_ERR(dentry_page);
 
@@ -465,12 +476,13 @@ start:
 	++level;
 	goto start;
 add_dentry:
-	err = init_inode_metadata(inode, dir, name);
-	if (err)
-		goto fail;
-
 	wait_on_page_writeback(dentry_page);
 
+	page = init_inode_metadata(inode, dir, name);
+	if (IS_ERR(page)) {
+		err = PTR_ERR(page);
+		goto fail;
+	}
 	de = &dentry_blk->dentry[bit_pos];
 	de->hash_code = dentry_hash;
 	de->name_len = cpu_to_le16(namelen);
@@ -481,11 +493,14 @@ add_dentry:
 		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
 	set_page_dirty(dentry_page);
 
-	update_parent_metadata(dir, inode, current_depth);
-
-	/* update parent inode number before releasing dentry page */
+	/* we don't need to mark_inode_dirty now */
 	F2FS_I(inode)->i_pino = dir->i_ino;
+	update_inode(inode, page);
+	f2fs_put_page(page, 1);
+
+	update_parent_metadata(dir, inode, current_depth);
 fail:
+	clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	kunmap(dentry_page);
 	f2fs_put_page(dentry_page, 1);
 	return err;
@@ -591,24 +606,19 @@ bool f2fs_empty_dir(struct inode *dir)
 	return true;
 }
 
-static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned long pos = file->f_pos;
 	struct inode *inode = file_inode(file);
 	unsigned long npages = dir_blocks(inode);
-	unsigned char *types = NULL;
 	unsigned int bit_pos = 0, start_bit_pos = 0;
-	int over = 0;
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	struct f2fs_dir_entry *de = NULL;
 	struct page *dentry_page = NULL;
-	unsigned int n = 0;
+	unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
 	unsigned char d_type = DT_UNKNOWN;
 	int slots;
 
-	types = f2fs_filetype_table;
-	bit_pos = (pos % NR_DENTRY_IN_BLOCK);
-	n = (pos / NR_DENTRY_IN_BLOCK);
+	bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
 
 	for ( ; n < npages; n++) {
 		dentry_page = get_lock_data_page(inode, n);
@@ -618,31 +628,28 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
 		start_bit_pos = bit_pos;
 		dentry_blk = kmap(dentry_page);
 		while (bit_pos < NR_DENTRY_IN_BLOCK) {
-			d_type = DT_UNKNOWN;
 			bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
 							NR_DENTRY_IN_BLOCK,
 							bit_pos);
 			if (bit_pos >= NR_DENTRY_IN_BLOCK)
 				break;
 
+			ctx->pos += bit_pos - start_bit_pos;
 			de = &dentry_blk->dentry[bit_pos];
-			if (types && de->file_type < F2FS_FT_MAX)
-				d_type = types[de->file_type];
-
-			over = filldir(dirent,
-					dentry_blk->filename[bit_pos],
-					le16_to_cpu(de->name_len),
-					(n * NR_DENTRY_IN_BLOCK) + bit_pos,
-					le32_to_cpu(de->ino), d_type);
-			if (over) {
-				file->f_pos += bit_pos - start_bit_pos;
+			if (de->file_type < F2FS_FT_MAX)
+				d_type = f2fs_filetype_table[de->file_type];
+			else
+				d_type = DT_UNKNOWN;
+			if (!dir_emit(ctx,
+					dentry_blk->filename[bit_pos],
+					le16_to_cpu(de->name_len),
+					le32_to_cpu(de->ino), d_type))
 				goto success;
-			}
 			slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
 			bit_pos += slots;
 		}
 		bit_pos = 0;
-		file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
+		ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
 		kunmap(dentry_page);
 		f2fs_put_page(dentry_page, 1);
 		dentry_page = NULL;
@@ -659,7 +666,7 @@ success:
 const struct file_operations f2fs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= f2fs_readdir,
+	.iterate	= f2fs_readdir,
 	.fsync		= f2fs_sync_file,
 	.unlocked_ioctl	= f2fs_ioctl,
 };
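The switch to ->iterate keeps f2fs's cursor encoding: ctx->pos packs the dentry-block index and the in-block slot as pos = n * NR_DENTRY_IN_BLOCK + bit_pos, which is why the function can resume a scan mid-block. A standalone check of that arithmetic (214 is f2fs's per-block dentry count for 4 KB blocks; any constant would do for the demonstration):

    #include <assert.h>
    #include <stdio.h>

    #define NR_DENTRY_IN_BLOCK 214

    int main(void)
    {
        unsigned long n = 3, bit_pos = 57;

        /* What f2fs_readdir() stores into ctx->pos */
        unsigned long pos = n * NR_DENTRY_IN_BLOCK + bit_pos;

        /* What the next ->iterate call recovers from ctx->pos */
        unsigned long n2 = pos / NR_DENTRY_IN_BLOCK;
        unsigned long bit2 = pos % NR_DENTRY_IN_BLOCK;

        assert(n2 == n && bit2 == bit_pos);
        printf("resume at block %lu, slot %lu\n", n2, bit2);
        return 0;
    }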
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 20aab02f2a42..467d42d65c48 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -37,21 +37,35 @@
 	typecheck(unsigned long long, b) &&			\
 	((long long)((a) - (b)) > 0))
 
-typedef u64 block_t;
+typedef u32 block_t;	/*
+			 * should not change u32, since it is the on-disk block
+			 * address format, __le32.
+			 */
 typedef u32 nid_t;
 
 struct f2fs_mount_info {
 	unsigned int	opt;
 };
 
-static inline __u32 f2fs_crc32(void *buff, size_t len)
+#define CRCPOLY_LE 0xedb88320
+
+static inline __u32 f2fs_crc32(void *buf, size_t len)
 {
-	return crc32_le(F2FS_SUPER_MAGIC, buff, len);
+	unsigned char *p = (unsigned char *)buf;
+	__u32 crc = F2FS_SUPER_MAGIC;
+	int i;
+
+	while (len--) {
+		crc ^= *p++;
+		for (i = 0; i < 8; i++)
+			crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
+	}
+	return crc;
 }
 
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
 {
-	return f2fs_crc32(buff, buff_size) == blk_crc;
+	return f2fs_crc32(buf, buf_size) == blk_crc;
 }
 
 /*
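The replacement above is the classic bit-at-a-time CRC-32 with the reflected polynomial 0xedb88320, seeded with F2FS_SUPER_MAGIC instead of the usual all-ones value, so f2fs no longer depends on the kernel's crc32 library. A userspace self-test, assuming only the polynomial and seed shown in the patch, that compares the bitwise loop against a table-driven equivalent:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define CRCPOLY_LE 0xedb88320u
    #define F2FS_SUPER_MAGIC 0xF2F52010u    /* the seed f2fs uses */

    /* Bit-at-a-time version, as in the patch above. */
    static uint32_t crc32_bitwise(uint32_t crc, const void *buf, size_t len)
    {
        const unsigned char *p = buf;
        int i;

        while (len--) {
            crc ^= *p++;
            for (i = 0; i < 8; i++)
                crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
        }
        return crc;
    }

    /* Table-driven version built from the same polynomial. */
    static uint32_t crc32_table(uint32_t crc, const void *buf, size_t len)
    {
        static uint32_t tab[256];
        const unsigned char *p = buf;

        if (!tab[1]) {
            for (uint32_t n = 0; n < 256; n++) {
                uint32_t c = n;
                for (int k = 0; k < 8; k++)
                    c = (c >> 1) ^ ((c & 1) ? CRCPOLY_LE : 0);
                tab[n] = c;
            }
        }
        while (len--)
            crc = (crc >> 8) ^ tab[(crc ^ *p++) & 0xff];
        return crc;
    }

    int main(void)
    {
        const char msg[] = "f2fs checkpoint block";
        uint32_t a = crc32_bitwise(F2FS_SUPER_MAGIC, msg, strlen(msg));
        uint32_t b = crc32_table(F2FS_SUPER_MAGIC, msg, strlen(msg));

        printf("bitwise=%08x table=%08x %s\n", a, b,
               a == b ? "(match)" : "(MISMATCH)");
        return 0;
    }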
@@ -148,7 +162,7 @@ struct extent_info {
 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
 */
 #define FADVISE_COLD_BIT	0x01
-#define FADVISE_CP_BIT		0x02
+#define FADVISE_LOST_PINO_BIT	0x02
 
 struct f2fs_inode_info {
 	struct inode vfs_inode;		/* serve a vfs inode */
@@ -369,7 +383,6 @@ struct f2fs_sb_info {
 	/* for directory inode management */
 	struct list_head dir_inode_list;	/* dir inode list */
 	spinlock_t dir_inode_lock;		/* for dir inode list lock */
-	unsigned int n_dirty_dirs;		/* # of dir inodes */
 
 	/* basic file system units */
 	unsigned int log_sectors_per_block;	/* log2 sectors per block */
@@ -406,12 +419,15 @@ struct f2fs_sb_info {
 	 * for stat information.
 	 * one is for the LFS mode, and the other is for the SSR mode.
 	 */
+#ifdef CONFIG_F2FS_STAT_FS
 	struct f2fs_stat_info *stat_info;	/* FS status information */
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
-	unsigned int last_victim[2];		/* last victim segment # */
 	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
 	int bg_gc;				/* background gc calls */
+	unsigned int n_dirty_dirs;		/* # of dir inodes */
+#endif
+	unsigned int last_victim[2];		/* last victim segment # */
 	spinlock_t stat_lock;			/* lock for stat operations */
 };
 
@@ -495,9 +511,17 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
 
 static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
 {
-	int i = 0;
-	for (; i < NR_GLOBAL_LOCKS; i++)
-		mutex_lock(&sbi->fs_lock[i]);
+	int i;
+
+	for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
+		/*
+		 * This is the only time we take multiple fs_lock[]
+		 * instances; the order is immaterial since we
+		 * always hold cp_mutex, which serializes multiple
+		 * such operations.
+		 */
+		mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
+	}
 }
 
 static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
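mutex_lock_nest_lock() is a lockdep annotation: it records that every fs_lock[] acquisition happens under cp_mutex, so the order in which the array elements are taken can never produce an AB-BA deadlock between two threads. The invariant itself, modeled with pthreads (names are illustrative; f2fs actually takes cp_mutex earlier in the checkpoint path, folded into lock_all() here for brevity):

    #include <pthread.h>
    #include <stdio.h>

    #define NR_LOCKS 8

    static pthread_mutex_t cp_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t fs_lock[NR_LOCKS];

    /*
     * Safe only because every caller holds cp_mutex first: two threads can
     * never interleave their fs_lock[] acquisitions, so the order within
     * the array cannot deadlock.
     */
    static void lock_all(void)
    {
        pthread_mutex_lock(&cp_mutex);
        for (int i = 0; i < NR_LOCKS; i++)
            pthread_mutex_lock(&fs_lock[i]);
    }

    static void unlock_all(void)
    {
        for (int i = NR_LOCKS - 1; i >= 0; i--)
            pthread_mutex_unlock(&fs_lock[i]);
        pthread_mutex_unlock(&cp_mutex);
    }

    int main(void)
    {
        for (int i = 0; i < NR_LOCKS; i++)
            pthread_mutex_init(&fs_lock[i], NULL);

        lock_all();
        puts("all fs locks held under cp_mutex");
        unlock_all();
        return 0;
    }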
@@ -843,9 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
 /* used for f2fs_inode_info->flags */
 enum {
 	FI_NEW_INODE,		/* indicate newly allocated inode */
+	FI_DIRTY_INODE,		/* indicate inode is dirty or not */
 	FI_INC_LINK,		/* need to increment i_nlink */
 	FI_ACL_MODE,		/* indicate acl mode */
 	FI_NO_ALLOC,		/* should not allocate any blocks */
+	FI_UPDATE_DIR,		/* should update inode block for consistency */
+	FI_DELAY_IPUT,		/* used for the recovery */
 };
 
 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -878,14 +905,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
 	return 0;
 }
 
+static inline int f2fs_readonly(struct super_block *sb)
+{
+	return sb->s_flags & MS_RDONLY;
+}
+
 /*
  * file.c
  */
 int f2fs_sync_file(struct file *, loff_t, loff_t, int);
 void truncate_data_blocks(struct dnode_of_data *);
 void f2fs_truncate(struct inode *);
+int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 int f2fs_setattr(struct dentry *, struct iattr *);
 int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+int truncate_data_blocks_range(struct dnode_of_data *, int);
 long f2fs_ioctl(struct file *, unsigned int, unsigned long);
 long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
 
@@ -913,7 +947,6 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
 ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
 void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
 				struct page *, struct inode *);
-void init_dent_inode(const struct qstr *, struct page *);
 int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
 void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
 int f2fs_make_empty(struct inode *, struct inode *);
@@ -948,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
 int truncate_inode_blocks(struct inode *, pgoff_t);
 int remove_inode_page(struct inode *);
-int new_inode_page(struct inode *, const struct qstr *);
-struct page *new_node_page(struct dnode_of_data *, unsigned int);
+struct page *new_inode_page(struct inode *, const struct qstr *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
 void ra_node_page(struct f2fs_sb_info *, nid_t);
 struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_node_page_ra(struct page *, int);
@@ -974,7 +1007,6 @@ void destroy_node_manager_caches(void);
 */
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
-void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
 void clear_prefree_segments(struct f2fs_sb_info *);
 int npages_for_summary_flush(struct f2fs_sb_info *);
 void allocate_new_segments(struct f2fs_sb_info *);
@@ -1011,7 +1043,9 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
 int recover_orphan_inodes(struct f2fs_sb_info *);
 int get_valid_checkpoint(struct f2fs_sb_info *);
 void set_dirty_dir_page(struct inode *, struct page *);
+void add_dirty_dir_inode(struct inode *);
 void remove_dirty_dir_inode(struct inode *);
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
 void sync_dirty_dir_inodes(struct f2fs_sb_info *);
 void write_checkpoint(struct f2fs_sb_info *, bool);
 void init_orphan_info(struct f2fs_sb_info *);
@@ -1025,7 +1059,7 @@ int reserve_new_block(struct dnode_of_data *);
 void update_extent_cache(block_t, struct dnode_of_data *);
 struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
-struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
 int do_write_data_page(struct page *);
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1cae864f8dfc..d2d2b7dbdcc1 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -63,9 +63,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	f2fs_put_dnode(&dn);
 	mutex_unlock_op(sbi, ilock);
 
+	file_update_time(vma->vm_file);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping ||
-			page_offset(page) >= i_size_read(inode) ||
+			page_offset(page) > i_size_read(inode) ||
 			!PageUptodate(page)) {
 		unlock_page(page);
 		err = -EFAULT;
@@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	 * check to see if the page is mapped already (no holes)
 	 */
 	if (PageMappedToDisk(page))
-		goto out;
-
-	/* fill the page */
-	wait_on_page_writeback(page);
+		goto mapped;
 
 	/* page is wholly or partially inside EOF */
 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
@@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	set_page_dirty(page);
 	SetPageUptodate(page);
 
-	file_update_time(vma->vm_file);
+mapped:
+	/* fill the page */
+	wait_on_page_writeback(page);
 out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(err);
@@ -102,6 +102,24 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.remap_pages	= generic_file_remap_pages,
 };
 
+static int get_parent_ino(struct inode *inode, nid_t *pino)
+{
+	struct dentry *dentry;
+
+	inode = igrab(inode);
+	dentry = d_find_any_alias(inode);
+	iput(inode);
+	if (!dentry)
+		return 0;
+
+	inode = igrab(dentry->d_parent->d_inode);
+	dput(dentry);
+
+	*pino = inode->i_ino;
+	iput(inode);
+	return 1;
+}
+
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
@@ -114,7 +132,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		.for_reclaim = 0,
 	};
 
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (f2fs_readonly(inode->i_sb))
 		return 0;
 
 	trace_f2fs_sync_file_enter(inode);
@@ -134,7 +152,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
 		need_cp = true;
-	else if (is_cp_file(inode))
+	else if (file_wrong_pino(inode))
 		need_cp = true;
 	else if (!space_for_roll_forward(sbi))
 		need_cp = true;
@@ -142,11 +160,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		need_cp = true;
 
 	if (need_cp) {
+		nid_t pino;
+
 		/* all the dirty node pages should be flushed for POR */
 		ret = f2fs_sync_fs(inode->i_sb, 1);
+		if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+					get_parent_ino(inode, &pino)) {
+			F2FS_I(inode)->i_pino = pino;
+			file_got_pino(inode);
+			mark_inode_dirty_sync(inode);
+			ret = f2fs_write_inode(inode, NULL);
+			if (ret)
+				goto out;
+		}
 	} else {
 		/* if there is no written node page, write its inode page */
 		while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
 				goto out;
@@ -168,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 {
 	int nr_free = 0, ofs = dn->ofs_in_node;
 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
@@ -185,10 +215,10 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
185 215
186 update_extent_cache(NULL_ADDR, dn); 216 update_extent_cache(NULL_ADDR, dn);
187 invalidate_blocks(sbi, blkaddr); 217 invalidate_blocks(sbi, blkaddr);
188 dec_valid_block_count(sbi, dn->inode, 1);
189 nr_free++; 218 nr_free++;
190 } 219 }
191 if (nr_free) { 220 if (nr_free) {
221 dec_valid_block_count(sbi, dn->inode, nr_free);
192 set_page_dirty(dn->node_page); 222 set_page_dirty(dn->node_page);
193 sync_inode_page(dn); 223 sync_inode_page(dn);
194 } 224 }
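
Moving dec_valid_block_count() out of the per-block loop turns count separate accounting updates into a single one covering nr_free blocks. The shape of that change, reduced to a runnable fragment with an atomic counter standing in for the superblock accounting:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define NULL_ADDR 0u

static atomic_uint valid_blocks = 1024;	/* per-fs block accounting */

static size_t truncate_range(unsigned *addrs, size_t count)
{
	size_t i, nr_free = 0;

	for (i = 0; i < count; i++) {
		if (addrs[i] == NULL_ADDR)
			continue;		/* hole: nothing allocated */
		addrs[i] = NULL_ADDR;		/* invalidate the pointer */
		nr_free++;
	}
	if (nr_free)				/* one accounting update */
		atomic_fetch_sub(&valid_blocks, nr_free);
	return nr_free;
}

int main(void)
{
	unsigned addrs[4] = { 5, NULL_ADDR, 9, 2 };

	printf("freed %zu, valid now %u\n", truncate_range(addrs, 4),
	       atomic_load(&valid_blocks));
	return 0;
}
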
@@ -291,7 +321,7 @@ void f2fs_truncate(struct inode *inode)
291 } 321 }
292} 322}
293 323
294static int f2fs_getattr(struct vfsmount *mnt, 324int f2fs_getattr(struct vfsmount *mnt,
295 struct dentry *dentry, struct kstat *stat) 325 struct dentry *dentry, struct kstat *stat)
296{ 326{
297 struct inode *inode = dentry->d_inode; 327 struct inode *inode = dentry->d_inode;
@@ -387,7 +417,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
387 f2fs_balance_fs(sbi); 417 f2fs_balance_fs(sbi);
388 418
389 ilock = mutex_lock_op(sbi); 419 ilock = mutex_lock_op(sbi);
390 page = get_new_data_page(inode, index, false); 420 page = get_new_data_page(inode, NULL, index, false);
391 mutex_unlock_op(sbi, ilock); 421 mutex_unlock_op(sbi, ilock);
392 422
393 if (!IS_ERR(page)) { 423 if (!IS_ERR(page)) {
@@ -575,10 +605,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
575 int ret; 605 int ret;
576 606
577 switch (cmd) { 607 switch (cmd) {
578 case FS_IOC_GETFLAGS: 608 case F2FS_IOC_GETFLAGS:
579 flags = fi->i_flags & FS_FL_USER_VISIBLE; 609 flags = fi->i_flags & FS_FL_USER_VISIBLE;
580 return put_user(flags, (int __user *) arg); 610 return put_user(flags, (int __user *) arg);
581 case FS_IOC_SETFLAGS: 611 case F2FS_IOC_SETFLAGS:
582 { 612 {
583 unsigned int oldflags; 613 unsigned int oldflags;
584 614
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 14961593e93c..35f9b1a196aa 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -76,7 +76,9 @@ static int gc_thread_func(void *data)
76 else 76 else
77 wait_ms = increase_sleep_time(wait_ms); 77 wait_ms = increase_sleep_time(wait_ms);
78 78
79#ifdef CONFIG_F2FS_STAT_FS
79 sbi->bg_gc++; 80 sbi->bg_gc++;
81#endif
80 82
81 /* if return value is not zero, no victim was selected */ 83 /* if return value is not zero, no victim was selected */
82 if (f2fs_gc(sbi)) 84 if (f2fs_gc(sbi))
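
sbi->bg_gc only exists when CONFIG_F2FS_STAT_FS is set, hence the new guard. A common way to keep such counters from sprinkling #ifdef across every call site is a macro that compiles away; a minimal sketch (the stat_inc name is illustrative, not from this patch):

#include <stdio.h>

#ifdef FS_STATS				/* CONFIG_F2FS_STAT_FS analogue */
#define stat_inc(c)	((c)++)
#else
#define stat_inc(c)	do { } while (0)	/* compiles to nothing */
#endif

static unsigned long bg_gc;

int main(void)
{
	stat_inc(bg_gc);		/* no #ifdef needed at call sites */
	printf("bg_gc=%lu\n", bg_gc);
	return 0;
}
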
@@ -89,23 +91,28 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
89{ 91{
90 struct f2fs_gc_kthread *gc_th; 92 struct f2fs_gc_kthread *gc_th;
91 dev_t dev = sbi->sb->s_bdev->bd_dev; 93 dev_t dev = sbi->sb->s_bdev->bd_dev;
94 int err = 0;
92 95
93 if (!test_opt(sbi, BG_GC)) 96 if (!test_opt(sbi, BG_GC))
94 return 0; 97 goto out;
95 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); 98 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
96 if (!gc_th) 99 if (!gc_th) {
97 return -ENOMEM; 100 err = -ENOMEM;
101 goto out;
102 }
98 103
99 sbi->gc_thread = gc_th; 104 sbi->gc_thread = gc_th;
100 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); 105 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
101 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, 106 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
102 "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); 107 "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
103 if (IS_ERR(gc_th->f2fs_gc_task)) { 108 if (IS_ERR(gc_th->f2fs_gc_task)) {
109 err = PTR_ERR(gc_th->f2fs_gc_task);
104 kfree(gc_th); 110 kfree(gc_th);
105 sbi->gc_thread = NULL; 111 sbi->gc_thread = NULL;
106 return -ENOMEM;
107 } 112 }
108 return 0; 113
114out:
115 return err;
109} 116}
110 117
111void stop_gc_thread(struct f2fs_sb_info *sbi) 118void stop_gc_thread(struct f2fs_sb_info *sbi)
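
start_gc_thread() is reshaped so every path leaves through one exit label and kthread_run()'s real error code is propagated instead of a blanket -ENOMEM. The same single-exit shape in a self-contained fragment, with allocation and thread launch reduced to stubs:

#include <errno.h>
#include <stdlib.h>

struct worker { int running; };

static int launch(struct worker *w)	/* stub for kthread_run() */
{
	w->running = 1;
	return 0;
}

static int start_worker(struct worker **out, int enabled)
{
	struct worker *w = NULL;
	int err = 0;

	if (!enabled)
		goto out;		/* feature off: success, no thread */

	w = malloc(sizeof(*w));
	if (!w) {
		err = -ENOMEM;
		goto out;
	}

	err = launch(w);		/* propagate the real error code */
	if (err) {
		free(w);
		w = NULL;
	}
out:
	*out = w;
	return err;
}

int main(void)
{
	struct worker *w;

	return start_worker(&w, 1) ? 1 : 0;
}
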
@@ -234,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
234{ 241{
235 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 242 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
236 struct victim_sel_policy p; 243 struct victim_sel_policy p;
237 unsigned int secno; 244 unsigned int secno, max_cost;
238 int nsearched = 0; 245 int nsearched = 0;
239 246
240 p.alloc_mode = alloc_mode; 247 p.alloc_mode = alloc_mode;
241 select_policy(sbi, gc_type, type, &p); 248 select_policy(sbi, gc_type, type, &p);
242 249
243 p.min_segno = NULL_SEGNO; 250 p.min_segno = NULL_SEGNO;
244 p.min_cost = get_max_cost(sbi, &p); 251 p.min_cost = max_cost = get_max_cost(sbi, &p);
245 252
246 mutex_lock(&dirty_i->seglist_lock); 253 mutex_lock(&dirty_i->seglist_lock);
247 254
@@ -280,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
280 p.min_cost = cost; 287 p.min_cost = cost;
281 } 288 }
282 289
283 if (cost == get_max_cost(sbi, &p)) 290 if (cost == max_cost)
284 continue; 291 continue;
285 292
286 if (nsearched++ >= MAX_VICTIM_SEARCH) { 293 if (nsearched++ >= MAX_VICTIM_SEARCH) {
@@ -288,8 +295,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
288 break; 295 break;
289 } 296 }
290 } 297 }
291got_it:
292 if (p.min_segno != NULL_SEGNO) { 298 if (p.min_segno != NULL_SEGNO) {
299got_it:
293 if (p.alloc_mode == LFS) { 300 if (p.alloc_mode == LFS) {
294 secno = GET_SECNO(sbi, p.min_segno); 301 secno = GET_SECNO(sbi, p.min_segno);
295 if (gc_type == FG_GC) 302 if (gc_type == FG_GC)
@@ -314,28 +321,21 @@ static const struct victim_selection default_v_ops = {
314 321
315static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) 322static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
316{ 323{
317 struct list_head *this;
318 struct inode_entry *ie; 324 struct inode_entry *ie;
319 325
320 list_for_each(this, ilist) { 326 list_for_each_entry(ie, ilist, list)
321 ie = list_entry(this, struct inode_entry, list);
322 if (ie->inode->i_ino == ino) 327 if (ie->inode->i_ino == ino)
323 return ie->inode; 328 return ie->inode;
324 }
325 return NULL; 329 return NULL;
326} 330}
327 331
328static void add_gc_inode(struct inode *inode, struct list_head *ilist) 332static void add_gc_inode(struct inode *inode, struct list_head *ilist)
329{ 333{
330 struct list_head *this; 334 struct inode_entry *new_ie;
331 struct inode_entry *new_ie, *ie;
332 335
333 list_for_each(this, ilist) { 336 if (inode == find_gc_inode(inode->i_ino, ilist)) {
334 ie = list_entry(this, struct inode_entry, list); 337 iput(inode);
335 if (ie->inode == inode) { 338 return;
336 iput(inode);
337 return;
338 }
339 } 339 }
340repeat: 340repeat:
341 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); 341 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
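
Two cleanups here: the open-coded list_entry() walks become list_for_each_entry(), and add_gc_inode() reuses find_gc_inode() instead of duplicating the scan. A userspace rendering of the idiom, with the type passed explicitly because plain C lacks the typeof() the kernel macro relies on:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* the in-kernel macro infers 'type' with typeof(); it is passed
 * explicitly here to stay within plain C */
#define list_for_each_entry(pos, head, type, member)			\
	for ((pos) = container_of((head)->next, type, member);		\
	     &(pos)->member != (head);					\
	     (pos) = container_of((pos)->member.next, type, member))

struct inode_entry { unsigned ino; struct list_head list; };

static struct inode_entry *find_entry(struct list_head *h, unsigned ino)
{
	struct inode_entry *ie;

	list_for_each_entry(ie, h, struct inode_entry, list)
		if (ie->ino == ino)
			return ie;
	return NULL;
}

int main(void)
{
	struct list_head head;
	struct inode_entry a = { .ino = 3 }, b = { .ino = 7 };

	list_init(&head);
	list_add_tail(&a.list, &head);
	list_add_tail(&b.list, &head);

	/* add_gc_inode()'s duplicate check reduces to one call: */
	printf("found 7: %s\n", find_entry(&head, 7) ? "yes" : "no");
	return 0;
}
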
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 91ac7f9d88ee..2b2d45d19e3e 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -109,12 +109,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
109 ret = do_read_inode(inode); 109 ret = do_read_inode(inode);
110 if (ret) 110 if (ret)
111 goto bad_inode; 111 goto bad_inode;
112
113 if (!sbi->por_doing && inode->i_nlink == 0) {
114 ret = -ENOENT;
115 goto bad_inode;
116 }
117
118make_now: 112make_now:
119 if (ino == F2FS_NODE_INO(sbi)) { 113 if (ino == F2FS_NODE_INO(sbi)) {
120 inode->i_mapping->a_ops = &f2fs_node_aops; 114 inode->i_mapping->a_ops = &f2fs_node_aops;
@@ -130,8 +124,7 @@ make_now:
130 inode->i_op = &f2fs_dir_inode_operations; 124 inode->i_op = &f2fs_dir_inode_operations;
131 inode->i_fop = &f2fs_dir_operations; 125 inode->i_fop = &f2fs_dir_operations;
132 inode->i_mapping->a_ops = &f2fs_dblock_aops; 126 inode->i_mapping->a_ops = &f2fs_dblock_aops;
133 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | 127 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
134 __GFP_ZERO);
135 } else if (S_ISLNK(inode->i_mode)) { 128 } else if (S_ISLNK(inode->i_mode)) {
136 inode->i_op = &f2fs_symlink_inode_operations; 129 inode->i_op = &f2fs_symlink_inode_operations;
137 inode->i_mapping->a_ops = &f2fs_dblock_aops; 130 inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -199,6 +192,7 @@ void update_inode(struct inode *inode, struct page *node_page)
199 192
200 set_cold_node(inode, node_page); 193 set_cold_node(inode, node_page);
201 set_page_dirty(node_page); 194 set_page_dirty(node_page);
195 clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
202} 196}
203 197
204int update_inode_page(struct inode *inode) 198int update_inode_page(struct inode *inode)
@@ -224,6 +218,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
224 inode->i_ino == F2FS_META_INO(sbi)) 218 inode->i_ino == F2FS_META_INO(sbi))
225 return 0; 219 return 0;
226 220
221 if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
222 return 0;
223
227 if (wbc) 224 if (wbc)
228 f2fs_balance_fs(sbi); 225 f2fs_balance_fs(sbi);
229 226
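
Together with the f2fs_dirty_inode() hook added in super.c further down, these hunks build a dirty-inode filter: __mark_inode_dirty() sets FI_DIRTY_INODE, update_inode() clears it once the change is staged, and f2fs_write_inode() becomes a no-op for clean inodes. The gate in isolation:

#include <stdbool.h>
#include <stdio.h>

struct sk_inode { bool dirty; int data; };

static void mark_dirty(struct sk_inode *i)	/* .dirty_inode hook */
{
	i->dirty = true;
}

static int write_inode(struct sk_inode *i)
{
	if (!i->dirty)
		return 0;	/* clean: skip the on-disk update */
	/* ... flush i->data into its node page here ... */
	i->dirty = false;	/* cleared once the update is staged */
	return 1;
}

int main(void)
{
	struct sk_inode ino = { 0 };

	mark_dirty(&ino);
	printf("first=%d second=%d\n", write_inode(&ino), write_inode(&ino));
	return 0;
}
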
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 47abc9722b17..64c07169df05 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -112,7 +112,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
112 int count = le32_to_cpu(sbi->raw_super->extension_count); 112 int count = le32_to_cpu(sbi->raw_super->extension_count);
113 for (i = 0; i < count; i++) { 113 for (i = 0; i < count; i++) {
114 if (is_multimedia_file(name, extlist[i])) { 114 if (is_multimedia_file(name, extlist[i])) {
115 set_cold_file(inode); 115 file_set_cold(inode);
116 break; 116 break;
117 } 117 }
118 } 118 }
@@ -149,8 +149,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
149 149
150 alloc_nid_done(sbi, ino); 150 alloc_nid_done(sbi, ino);
151 151
152 if (!sbi->por_doing) 152 d_instantiate(dentry, inode);
153 d_instantiate(dentry, inode);
154 unlock_new_inode(inode); 153 unlock_new_inode(inode);
155 return 0; 154 return 0;
156out: 155out:
@@ -173,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
173 f2fs_balance_fs(sbi); 172 f2fs_balance_fs(sbi);
174 173
175 inode->i_ctime = CURRENT_TIME; 174 inode->i_ctime = CURRENT_TIME;
176 atomic_inc(&inode->i_count); 175 ihold(inode);
177 176
178 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 177 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
179 ilock = mutex_lock_op(sbi); 178 ilock = mutex_lock_op(sbi);
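
Replacing the raw atomic_inc(&inode->i_count) with ihold() picks up the sanctioned VFS helper, which also warns if a reference is taken on an inode whose count already fell to zero. A userspace analogue of that guarded grab:

#include <assert.h>
#include <stdatomic.h>

struct obj { atomic_int refcount; };

/* analogue of the VFS ihold(): taking a new reference is only legal
 * while the caller already holds one */
static void obj_hold(struct obj *o)
{
	int now = atomic_fetch_add(&o->refcount, 1) + 1;

	assert(now >= 2);	/* the kernel uses WARN_ON() here */
}

int main(void)
{
	struct obj o = { .refcount = 1 };	/* caller's reference */

	obj_hold(&o);
	return 0;
}
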
@@ -182,17 +181,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
182 if (err) 181 if (err)
183 goto out; 182 goto out;
184 183
185 /*
186 * This file should be checkpointed during fsync.
187 * We lost i_pino from now on.
188 */
189 set_cp_file(inode);
190
191 d_instantiate(dentry, inode); 184 d_instantiate(dentry, inode);
192 return 0; 185 return 0;
193out: 186out:
194 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 187 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
195 make_bad_inode(inode);
196 iput(inode); 188 iput(inode);
197 return err; 189 return err;
198} 190}
@@ -498,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
498 .rmdir = f2fs_rmdir, 490 .rmdir = f2fs_rmdir,
499 .mknod = f2fs_mknod, 491 .mknod = f2fs_mknod,
500 .rename = f2fs_rename, 492 .rename = f2fs_rename,
493 .getattr = f2fs_getattr,
501 .setattr = f2fs_setattr, 494 .setattr = f2fs_setattr,
502 .get_acl = f2fs_get_acl, 495 .get_acl = f2fs_get_acl,
503#ifdef CONFIG_F2FS_FS_XATTR 496#ifdef CONFIG_F2FS_FS_XATTR
@@ -512,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
512 .readlink = generic_readlink, 505 .readlink = generic_readlink,
513 .follow_link = page_follow_link_light, 506 .follow_link = page_follow_link_light,
514 .put_link = page_put_link, 507 .put_link = page_put_link,
508 .getattr = f2fs_getattr,
515 .setattr = f2fs_setattr, 509 .setattr = f2fs_setattr,
516#ifdef CONFIG_F2FS_FS_XATTR 510#ifdef CONFIG_F2FS_FS_XATTR
517 .setxattr = generic_setxattr, 511 .setxattr = generic_setxattr,
@@ -522,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
522}; 516};
523 517
524const struct inode_operations f2fs_special_inode_operations = { 518const struct inode_operations f2fs_special_inode_operations = {
519 .getattr = f2fs_getattr,
525 .setattr = f2fs_setattr, 520 .setattr = f2fs_setattr,
526 .get_acl = f2fs_get_acl, 521 .get_acl = f2fs_get_acl,
527#ifdef CONFIG_F2FS_FS_XATTR 522#ifdef CONFIG_F2FS_FS_XATTR
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3df43b4efd89..b418aee09573 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -408,10 +408,13 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
408 level = get_node_path(index, offset, noffset); 408 level = get_node_path(index, offset, noffset);
409 409
410 nids[0] = dn->inode->i_ino; 410 nids[0] = dn->inode->i_ino;
411 npage[0] = get_node_page(sbi, nids[0]); 411 npage[0] = dn->inode_page;
412 if (IS_ERR(npage[0]))
413 return PTR_ERR(npage[0]);
414 412
413 if (!npage[0]) {
414 npage[0] = get_node_page(sbi, nids[0]);
415 if (IS_ERR(npage[0]))
416 return PTR_ERR(npage[0]);
417 }
415 parent = npage[0]; 418 parent = npage[0];
416 if (level != 0) 419 if (level != 0)
417 nids[1] = get_nid(parent, offset[0], true); 420 nids[1] = get_nid(parent, offset[0], true);
@@ -430,7 +433,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
430 } 433 }
431 434
432 dn->nid = nids[i]; 435 dn->nid = nids[i];
433 npage[i] = new_node_page(dn, noffset[i]); 436 npage[i] = new_node_page(dn, noffset[i], NULL);
434 if (IS_ERR(npage[i])) { 437 if (IS_ERR(npage[i])) {
435 alloc_nid_failed(sbi, nids[i]); 438 alloc_nid_failed(sbi, nids[i]);
436 err = PTR_ERR(npage[i]); 439 err = PTR_ERR(npage[i]);
@@ -803,22 +806,19 @@ int remove_inode_page(struct inode *inode)
803 return 0; 806 return 0;
804} 807}
805 808
806int new_inode_page(struct inode *inode, const struct qstr *name) 809struct page *new_inode_page(struct inode *inode, const struct qstr *name)
807{ 810{
808 struct page *page;
809 struct dnode_of_data dn; 811 struct dnode_of_data dn;
810 812
811 /* allocate inode page for new inode */ 813 /* allocate inode page for new inode */
812 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); 814 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
813 page = new_node_page(&dn, 0); 815
814 init_dent_inode(name, page); 816 /* caller should f2fs_put_page(page, 1); */
815 if (IS_ERR(page)) 817 return new_node_page(&dn, 0, NULL);
816 return PTR_ERR(page);
817 f2fs_put_page(page, 1);
818 return 0;
819} 818}
820 819
821struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) 820struct page *new_node_page(struct dnode_of_data *dn,
821 unsigned int ofs, struct page *ipage)
822{ 822{
823 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 823 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
824 struct address_space *mapping = sbi->node_inode->i_mapping; 824 struct address_space *mapping = sbi->node_inode->i_mapping;
@@ -851,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
851 set_cold_node(dn->inode, page); 851 set_cold_node(dn->inode, page);
852 852
853 dn->node_page = page; 853 dn->node_page = page;
854 sync_inode_page(dn); 854 if (ipage)
855 update_inode(dn->inode, ipage);
856 else
857 sync_inode_page(dn);
855 set_page_dirty(page); 858 set_page_dirty(page);
856 if (ofs == 0) 859 if (ofs == 0)
857 inc_valid_inode_count(sbi); 860 inc_valid_inode_count(sbi);
@@ -1205,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
1205 return 0; 1208 return 0;
1206} 1209}
1207 1210
1208static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) 1211static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1212 unsigned int length)
1209{ 1213{
1210 struct inode *inode = page->mapping->host; 1214 struct inode *inode = page->mapping->host;
1211 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1215 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1492,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1492 new_ni = old_ni; 1496 new_ni = old_ni;
1493 new_ni.ino = ino; 1497 new_ni.ino = ino;
1494 1498
1499 if (!inc_valid_node_count(sbi, NULL, 1))
1500 WARN_ON(1);
1495 set_node_addr(sbi, &new_ni, NEW_ADDR); 1501 set_node_addr(sbi, &new_ni, NEW_ADDR);
1496 inc_valid_inode_count(sbi); 1502 inc_valid_inode_count(sbi);
1497
1498 f2fs_put_page(ipage, 1); 1503 f2fs_put_page(ipage, 1);
1499 return 0; 1504 return 0;
1500} 1505}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 0a2d72f0024d..c65fb4f4230f 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -275,25 +275,27 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
275 * - Mark cold node blocks in their node footer 275 * - Mark cold node blocks in their node footer
276 * - Mark cold data pages in page cache 276 * - Mark cold data pages in page cache
277 */ 277 */
278static inline int is_cold_file(struct inode *inode) 278static inline int is_file(struct inode *inode, int type)
279{ 279{
280 return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; 280 return F2FS_I(inode)->i_advise & type;
281} 281}
282 282
283static inline void set_cold_file(struct inode *inode) 283static inline void set_file(struct inode *inode, int type)
284{ 284{
285 F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; 285 F2FS_I(inode)->i_advise |= type;
286} 286}
287 287
288static inline int is_cp_file(struct inode *inode) 288static inline void clear_file(struct inode *inode, int type)
289{ 289{
290 return F2FS_I(inode)->i_advise & FADVISE_CP_BIT; 290 F2FS_I(inode)->i_advise &= ~type;
291} 291}
292 292
293static inline void set_cp_file(struct inode *inode) 293#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
294{ 294#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
295 F2FS_I(inode)->i_advise |= FADVISE_CP_BIT; 295#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
296} 296#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
297#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
298#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
297 299
298static inline int is_cold_data(struct page *page) 300static inline int is_cold_data(struct page *page)
299{ 301{
@@ -310,29 +312,16 @@ static inline void clear_cold_data(struct page *page)
310 ClearPageChecked(page); 312 ClearPageChecked(page);
311} 313}
312 314
313static inline int is_cold_node(struct page *page) 315static inline int is_node(struct page *page, int type)
314{ 316{
315 void *kaddr = page_address(page); 317 void *kaddr = page_address(page);
316 struct f2fs_node *rn = (struct f2fs_node *)kaddr; 318 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
317 unsigned int flag = le32_to_cpu(rn->footer.flag); 319 return le32_to_cpu(rn->footer.flag) & (1 << type);
318 return flag & (0x1 << COLD_BIT_SHIFT);
319} 320}
320 321
321static inline unsigned char is_fsync_dnode(struct page *page) 322#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
322{ 323#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
323 void *kaddr = page_address(page); 324#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
324 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
325 unsigned int flag = le32_to_cpu(rn->footer.flag);
326 return flag & (0x1 << FSYNC_BIT_SHIFT);
327}
328
329static inline unsigned char is_dent_dnode(struct page *page)
330{
331 void *kaddr = page_address(page);
332 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
333 unsigned int flag = le32_to_cpu(rn->footer.flag);
334 return flag & (0x1 << DENT_BIT_SHIFT);
335}
336 325
337static inline void set_cold_node(struct inode *inode, struct page *page) 326static inline void set_cold_node(struct inode *inode, struct page *page)
338{ 327{
@@ -346,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
346 rn->footer.flag = cpu_to_le32(flag); 335 rn->footer.flag = cpu_to_le32(flag);
347} 336}
348 337
349static inline void set_fsync_mark(struct page *page, int mark) 338static inline void set_mark(struct page *page, int mark, int type)
350{ 339{
351 void *kaddr = page_address(page); 340 struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
352 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
353 unsigned int flag = le32_to_cpu(rn->footer.flag);
354 if (mark)
355 flag |= (0x1 << FSYNC_BIT_SHIFT);
356 else
357 flag &= ~(0x1 << FSYNC_BIT_SHIFT);
358 rn->footer.flag = cpu_to_le32(flag);
359}
360
361static inline void set_dentry_mark(struct page *page, int mark)
362{
363 void *kaddr = page_address(page);
364 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
365 unsigned int flag = le32_to_cpu(rn->footer.flag); 341 unsigned int flag = le32_to_cpu(rn->footer.flag);
366 if (mark) 342 if (mark)
367 flag |= (0x1 << DENT_BIT_SHIFT); 343 flag |= (0x1 << type);
368 else 344 else
369 flag &= ~(0x1 << DENT_BIT_SHIFT); 345 flag &= ~(0x1 << type);
370 rn->footer.flag = cpu_to_le32(flag); 346 rn->footer.flag = cpu_to_le32(flag);
371} 347}
348#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
349#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
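
node.h collapses six near-identical footer accessors into generic is_node()/set_mark() helpers plus one-line macros. The same consolidation in standalone form; the endian conversions (le32_to_cpu/cpu_to_le32) of the real code are elided and the bit positions are illustrative:

#include <stdio.h>

enum { COLD_BIT = 0, FSYNC_BIT = 1, DENT_BIT = 2 };	/* illustrative */

struct footer { unsigned flag; };

static int test_flag(const struct footer *f, int bit)
{
	return (f->flag >> bit) & 1;
}

static void set_mark(struct footer *f, int mark, int bit)
{
	if (mark)
		f->flag |= 1u << bit;
	else
		f->flag &= ~(1u << bit);
}

#define is_cold_node(f)		test_flag((f), COLD_BIT)
#define is_fsync_dnode(f)	test_flag((f), FSYNC_BIT)
#define is_dent_dnode(f)	test_flag((f), DENT_BIT)
#define set_fsync_mark(f, m)	set_mark((f), (m), FSYNC_BIT)
#define set_dentry_mark(f, m)	set_mark((f), (m), DENT_BIT)

int main(void)
{
	struct footer f = { 0 };

	set_fsync_mark(&f, 1);
	printf("fsync=%d dent=%d\n", is_fsync_dnode(&f), is_dent_dnode(&f));
	return 0;
}
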
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 60c8a5097058..d56d951c2253 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,36 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
40 40
41static int recover_dentry(struct page *ipage, struct inode *inode) 41static int recover_dentry(struct page *ipage, struct inode *inode)
42{ 42{
43 struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); 43 void *kaddr = page_address(ipage);
44 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
44 struct f2fs_inode *raw_inode = &(raw_node->i); 45 struct f2fs_inode *raw_inode = &(raw_node->i);
45 struct qstr name; 46 nid_t pino = le32_to_cpu(raw_inode->i_pino);
46 struct f2fs_dir_entry *de; 47 struct f2fs_dir_entry *de;
48 struct qstr name;
47 struct page *page; 49 struct page *page;
48 struct inode *dir; 50 struct inode *dir, *einode;
49 int err = 0; 51 int err = 0;
50 52
51 if (!is_dent_dnode(ipage)) 53 dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
52 goto out; 54 if (!dir) {
53 55 dir = f2fs_iget(inode->i_sb, pino);
54 dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); 56 if (IS_ERR(dir)) {
55 if (IS_ERR(dir)) { 57 err = PTR_ERR(dir);
56 err = PTR_ERR(dir); 58 goto out;
57 goto out; 59 }
60 set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
61 add_dirty_dir_inode(dir);
58 } 62 }
59 63
60 name.len = le32_to_cpu(raw_inode->i_namelen); 64 name.len = le32_to_cpu(raw_inode->i_namelen);
61 name.name = raw_inode->i_name; 65 name.name = raw_inode->i_name;
62 66retry:
63 de = f2fs_find_entry(dir, &name, &page); 67 de = f2fs_find_entry(dir, &name, &page);
64 if (de) { 68 if (de && inode->i_ino == le32_to_cpu(de->ino)) {
65 kunmap(page); 69 kunmap(page);
66 f2fs_put_page(page, 0); 70 f2fs_put_page(page, 0);
67 } else { 71 goto out;
68 err = __f2fs_add_link(dir, &name, inode); 72 }
73 if (de) {
74 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
75 if (IS_ERR(einode)) {
76 WARN_ON(1);
77 if (PTR_ERR(einode) == -ENOENT)
78 err = -EEXIST;
79 goto out;
80 }
81 f2fs_delete_entry(de, page, einode);
82 iput(einode);
83 goto retry;
69 } 84 }
70 iput(dir); 85 err = __f2fs_add_link(dir, &name, inode);
71out: 86out:
72 kunmap(ipage); 87 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
88 "ino = %x, name = %s, dir = %lx, err = %d",
89 ino_of_node(ipage), raw_inode->i_name,
90 IS_ERR(dir) ? 0 : dir->i_ino, err);
73 return err; 91 return err;
74} 92}
75 93
@@ -79,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page)
79 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; 97 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
80 struct f2fs_inode *raw_inode = &(raw_node->i); 98 struct f2fs_inode *raw_inode = &(raw_node->i);
81 99
100 if (!IS_INODE(node_page))
101 return 0;
102
82 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 103 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
83 i_size_write(inode, le64_to_cpu(raw_inode->i_size)); 104 i_size_write(inode, le64_to_cpu(raw_inode->i_size));
84 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 105 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -88,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page)
88 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); 109 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
89 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 110 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
90 111
91 return recover_dentry(node_page, inode); 112 if (is_dent_dnode(node_page))
113 return recover_dentry(node_page, inode);
114
115 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
116 ino_of_node(node_page), raw_inode->i_name);
117 return 0;
92} 118}
93 119
94static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) 120static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -119,14 +145,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
119 lock_page(page); 145 lock_page(page);
120 146
121 if (cp_ver != cpver_of_node(page)) 147 if (cp_ver != cpver_of_node(page))
122 goto unlock_out; 148 break;
123 149
124 if (!is_fsync_dnode(page)) 150 if (!is_fsync_dnode(page))
125 goto next; 151 goto next;
126 152
127 entry = get_fsync_inode(head, ino_of_node(page)); 153 entry = get_fsync_inode(head, ino_of_node(page));
128 if (entry) { 154 if (entry) {
129 entry->blkaddr = blkaddr;
130 if (IS_INODE(page) && is_dent_dnode(page)) 155 if (IS_INODE(page) && is_dent_dnode(page))
131 set_inode_flag(F2FS_I(entry->inode), 156 set_inode_flag(F2FS_I(entry->inode),
132 FI_INC_LINK); 157 FI_INC_LINK);
@@ -134,48 +159,40 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
134 if (IS_INODE(page) && is_dent_dnode(page)) { 159 if (IS_INODE(page) && is_dent_dnode(page)) {
135 err = recover_inode_page(sbi, page); 160 err = recover_inode_page(sbi, page);
136 if (err) 161 if (err)
137 goto unlock_out; 162 break;
138 } 163 }
139 164
140 /* add this fsync inode to the list */ 165 /* add this fsync inode to the list */
141 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); 166 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
142 if (!entry) { 167 if (!entry) {
143 err = -ENOMEM; 168 err = -ENOMEM;
144 goto unlock_out; 169 break;
145 } 170 }
146 171
147 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); 172 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
148 if (IS_ERR(entry->inode)) { 173 if (IS_ERR(entry->inode)) {
149 err = PTR_ERR(entry->inode); 174 err = PTR_ERR(entry->inode);
150 kmem_cache_free(fsync_entry_slab, entry); 175 kmem_cache_free(fsync_entry_slab, entry);
151 goto unlock_out; 176 break;
152 } 177 }
153
154 list_add_tail(&entry->list, head); 178 list_add_tail(&entry->list, head);
155 entry->blkaddr = blkaddr;
156 }
157 if (IS_INODE(page)) {
158 err = recover_inode(entry->inode, page);
159 if (err == -ENOENT) {
160 goto next;
161 } else if (err) {
162 err = -EINVAL;
163 goto unlock_out;
164 }
165 } 179 }
180 entry->blkaddr = blkaddr;
181
182 err = recover_inode(entry->inode, page);
183 if (err && err != -ENOENT)
184 break;
166next: 185next:
167 /* check next segment */ 186 /* check next segment */
168 blkaddr = next_blkaddr_of_node(page); 187 blkaddr = next_blkaddr_of_node(page);
169 } 188 }
170unlock_out:
171 unlock_page(page); 189 unlock_page(page);
172out: 190out:
173 __free_pages(page, 0); 191 __free_pages(page, 0);
174 return err; 192 return err;
175} 193}
176 194
177static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, 195static void destroy_fsync_dnodes(struct list_head *head)
178 struct list_head *head)
179{ 196{
180 struct fsync_inode_entry *entry, *tmp; 197 struct fsync_inode_entry *entry, *tmp;
181 198
@@ -186,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
186 } 203 }
187} 204}
188 205
189static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, 206static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
190 block_t blkaddr) 207 block_t blkaddr, struct dnode_of_data *dn)
191{ 208{
192 struct seg_entry *sentry; 209 struct seg_entry *sentry;
193 unsigned int segno = GET_SEGNO(sbi, blkaddr); 210 unsigned int segno = GET_SEGNO(sbi, blkaddr);
194 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & 211 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
195 (sbi->blocks_per_seg - 1); 212 (sbi->blocks_per_seg - 1);
196 struct f2fs_summary sum; 213 struct f2fs_summary sum;
197 nid_t ino; 214 nid_t ino, nid;
198 void *kaddr; 215 void *kaddr;
199 struct inode *inode; 216 struct inode *inode;
200 struct page *node_page; 217 struct page *node_page;
@@ -203,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
203 220
204 sentry = get_seg_entry(sbi, segno); 221 sentry = get_seg_entry(sbi, segno);
205 if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) 222 if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
206 return; 223 return 0;
207 224
208 /* Get the previous summary */ 225 /* Get the previous summary */
209 for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { 226 for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -222,20 +239,39 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
222 f2fs_put_page(sum_page, 1); 239 f2fs_put_page(sum_page, 1);
223 } 240 }
224 241
242 /* Use the locked dnode page and inode */
243 nid = le32_to_cpu(sum.nid);
244 if (dn->inode->i_ino == nid) {
245 struct dnode_of_data tdn = *dn;
246 tdn.nid = nid;
247 tdn.node_page = dn->inode_page;
248 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
249 truncate_data_blocks_range(&tdn, 1);
250 return 0;
251 } else if (dn->nid == nid) {
252 struct dnode_of_data tdn = *dn;
253 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
254 truncate_data_blocks_range(&tdn, 1);
255 return 0;
256 }
257
225 /* Get the node page */ 258 /* Get the node page */
226 node_page = get_node_page(sbi, le32_to_cpu(sum.nid)); 259 node_page = get_node_page(sbi, nid);
260 if (IS_ERR(node_page))
261 return PTR_ERR(node_page);
227 bidx = start_bidx_of_node(ofs_of_node(node_page)) + 262 bidx = start_bidx_of_node(ofs_of_node(node_page)) +
228 le16_to_cpu(sum.ofs_in_node); 263 le16_to_cpu(sum.ofs_in_node);
229 ino = ino_of_node(node_page); 264 ino = ino_of_node(node_page);
230 f2fs_put_page(node_page, 1); 265 f2fs_put_page(node_page, 1);
231 266
232 /* Deallocate previous index in the node page */ 267 /* Deallocate previous index in the node page */
233 inode = f2fs_iget(sbi->sb, ino); 268 inode = f2fs_iget(sbi->sb, ino);
234 if (IS_ERR(inode)) 269 if (IS_ERR(inode))
235 return; 270 return PTR_ERR(inode);
236 271
237 truncate_hole(inode, bidx, bidx + 1); 272 truncate_hole(inode, bidx, bidx + 1);
238 iput(inode); 273 iput(inode);
274 return 0;
239} 275}
240 276
241static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, 277static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
@@ -245,7 +281,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
245 struct dnode_of_data dn; 281 struct dnode_of_data dn;
246 struct f2fs_summary sum; 282 struct f2fs_summary sum;
247 struct node_info ni; 283 struct node_info ni;
248 int err = 0; 284 int err = 0, recovered = 0;
249 int ilock; 285 int ilock;
250 286
251 start = start_bidx_of_node(ofs_of_node(page)); 287 start = start_bidx_of_node(ofs_of_node(page));
@@ -283,13 +319,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
283 } 319 }
284 320
285 /* Check the previous node page having this index */ 321 /* Check the previous node page having this index */
286 check_index_in_prev_nodes(sbi, dest); 322 err = check_index_in_prev_nodes(sbi, dest, &dn);
323 if (err)
324 goto err;
287 325
288 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); 326 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
289 327
290 /* write dummy data page */ 328 /* write dummy data page */
291 recover_data_page(sbi, NULL, &sum, src, dest); 329 recover_data_page(sbi, NULL, &sum, src, dest);
292 update_extent_cache(dest, &dn); 330 update_extent_cache(dest, &dn);
331 recovered++;
293 } 332 }
294 dn.ofs_in_node++; 333 dn.ofs_in_node++;
295 } 334 }
@@ -305,9 +344,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
305 set_page_dirty(dn.node_page); 344 set_page_dirty(dn.node_page);
306 345
307 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); 346 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
347err:
308 f2fs_put_dnode(&dn); 348 f2fs_put_dnode(&dn);
309 mutex_unlock_op(sbi, ilock); 349 mutex_unlock_op(sbi, ilock);
310 return 0; 350
351 f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
352 "recovered_data = %d blocks, err = %d",
353 inode->i_ino, recovered, err);
354 return err;
311} 355}
312 356
313static int recover_data(struct f2fs_sb_info *sbi, 357static int recover_data(struct f2fs_sb_info *sbi,
@@ -340,7 +384,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
340 lock_page(page); 384 lock_page(page);
341 385
342 if (cp_ver != cpver_of_node(page)) 386 if (cp_ver != cpver_of_node(page))
343 goto unlock_out; 387 break;
344 388
345 entry = get_fsync_inode(head, ino_of_node(page)); 389 entry = get_fsync_inode(head, ino_of_node(page));
346 if (!entry) 390 if (!entry)
@@ -348,7 +392,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
348 392
349 err = do_recover_data(sbi, entry->inode, page, blkaddr); 393 err = do_recover_data(sbi, entry->inode, page, blkaddr);
350 if (err) 394 if (err)
351 goto out; 395 break;
352 396
353 if (entry->blkaddr == blkaddr) { 397 if (entry->blkaddr == blkaddr) {
354 iput(entry->inode); 398 iput(entry->inode);
@@ -359,7 +403,6 @@ next:
359 /* check next segment */ 403 /* check next segment */
360 blkaddr = next_blkaddr_of_node(page); 404 blkaddr = next_blkaddr_of_node(page);
361 } 405 }
362unlock_out:
363 unlock_page(page); 406 unlock_page(page);
364out: 407out:
365 __free_pages(page, 0); 408 __free_pages(page, 0);
@@ -382,6 +425,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
382 INIT_LIST_HEAD(&inode_list); 425 INIT_LIST_HEAD(&inode_list);
383 426
384 /* step #1: find fsynced inode numbers */ 427 /* step #1: find fsynced inode numbers */
428 sbi->por_doing = 1;
385 err = find_fsync_dnodes(sbi, &inode_list); 429 err = find_fsync_dnodes(sbi, &inode_list);
386 if (err) 430 if (err)
387 goto out; 431 goto out;
@@ -390,13 +434,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
390 goto out; 434 goto out;
391 435
392 /* step #2: recover data */ 436 /* step #2: recover data */
393 sbi->por_doing = 1;
394 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); 437 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
395 sbi->por_doing = 0;
396 BUG_ON(!list_empty(&inode_list)); 438 BUG_ON(!list_empty(&inode_list));
397out: 439out:
398 destroy_fsync_dnodes(sbi, &inode_list); 440 destroy_fsync_dnodes(&inode_list);
399 kmem_cache_destroy(fsync_entry_slab); 441 kmem_cache_destroy(fsync_entry_slab);
400 write_checkpoint(sbi, false); 442 sbi->por_doing = 0;
443 if (!err)
444 write_checkpoint(sbi, false);
401 return err; 445 return err;
402} 446}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d8e84e49a5c3..a86d125a9885 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -94,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
94 * Adding dirty entry into seglist is not critical operation. 94 * Adding dirty entry into seglist is not critical operation.
95 * If a given segment is one of current working segments, it won't be added. 95 * If a given segment is one of current working segments, it won't be added.
96 */ 96 */
97void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) 97static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
98{ 98{
99 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 99 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
100 unsigned short valid_blocks; 100 unsigned short valid_blocks;
@@ -126,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
126static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) 126static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
127{ 127{
128 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 128 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
129 unsigned int segno, offset = 0; 129 unsigned int segno = -1;
130 unsigned int total_segs = TOTAL_SEGS(sbi); 130 unsigned int total_segs = TOTAL_SEGS(sbi);
131 131
132 mutex_lock(&dirty_i->seglist_lock); 132 mutex_lock(&dirty_i->seglist_lock);
133 while (1) { 133 while (1) {
134 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, 134 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
135 offset); 135 segno + 1);
136 if (segno >= total_segs) 136 if (segno >= total_segs)
137 break; 137 break;
138 __set_test_and_free(sbi, segno); 138 __set_test_and_free(sbi, segno);
139 offset = segno + 1;
140 } 139 }
141 mutex_unlock(&dirty_i->seglist_lock); 140 mutex_unlock(&dirty_i->seglist_lock);
142} 141}
@@ -144,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
144void clear_prefree_segments(struct f2fs_sb_info *sbi) 143void clear_prefree_segments(struct f2fs_sb_info *sbi)
145{ 144{
146 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 145 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
147 unsigned int segno, offset = 0; 146 unsigned int segno = -1;
148 unsigned int total_segs = TOTAL_SEGS(sbi); 147 unsigned int total_segs = TOTAL_SEGS(sbi);
149 148
150 mutex_lock(&dirty_i->seglist_lock); 149 mutex_lock(&dirty_i->seglist_lock);
151 while (1) { 150 while (1) {
152 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, 151 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
153 offset); 152 segno + 1);
154 if (segno >= total_segs) 153 if (segno >= total_segs)
155 break; 154 break;
156 155
157 offset = segno + 1;
158 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) 156 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
159 dirty_i->nr_dirty[PRE]--; 157 dirty_i->nr_dirty[PRE]--;
160 158
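
Both prefree walkers drop their separate offset cursor: segno starts at (unsigned)-1, so segno + 1 wraps to 0 on the first find_next_bit() call and thereafter resumes just past the last hit. A runnable rendering with a naive bit scanner:

#include <stdio.h>

static unsigned find_next_bit(const unsigned long *map, unsigned size,
			      unsigned start)
{
	for (; start < size; start++)
		if ((map[start / (8 * sizeof(long))] >>
		     (start % (8 * sizeof(long)))) & 1UL)
			return start;
	return size;			/* not found */
}

int main(void)
{
	unsigned long map[1] = { 0x2a };	/* bits 1, 3, 5 set */
	unsigned total_segs = 6;
	unsigned segno = (unsigned)-1;		/* wraps to 0 below */

	while (1) {
		segno = find_next_bit(map, total_segs, segno + 1);
		if (segno >= total_segs)
			break;
		printf("prefree segment %u\n", segno);
	}
	return 0;
}
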
@@ -257,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
257 * This function should be resided under the curseg_mutex lock 255 * This function should be resided under the curseg_mutex lock
258 */ 256 */
259static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, 257static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
260 struct f2fs_summary *sum, unsigned short offset) 258 struct f2fs_summary *sum)
261{ 259{
262 struct curseg_info *curseg = CURSEG_I(sbi, type); 260 struct curseg_info *curseg = CURSEG_I(sbi, type);
263 void *addr = curseg->sum_blk; 261 void *addr = curseg->sum_blk;
264 addr += offset * sizeof(struct f2fs_summary); 262 addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
265 memcpy(addr, sum, sizeof(struct f2fs_summary)); 263 memcpy(addr, sum, sizeof(struct f2fs_summary));
266 return; 264 return;
267} 265}
@@ -311,64 +309,14 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
311 f2fs_put_page(page, 1); 309 f2fs_put_page(page, 1);
312} 310}
313 311
314static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type)
315{
316 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
317 unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
318 unsigned int segno;
319 unsigned int ofs = 0;
320
321 /*
322 * If there is not enough reserved sections,
323 * we should not reuse prefree segments.
324 */
325 if (has_not_enough_free_secs(sbi, 0))
326 return NULL_SEGNO;
327
328 /*
329 * NODE page should not reuse prefree segment,
330 * since those information is used for SPOR.
331 */
332 if (IS_NODESEG(type))
333 return NULL_SEGNO;
334next:
335 segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
336 ofs += sbi->segs_per_sec;
337
338 if (segno < TOTAL_SEGS(sbi)) {
339 int i;
340
341 /* skip intermediate segments in a section */
342 if (segno % sbi->segs_per_sec)
343 goto next;
344
345 /* skip if the section is currently used */
346 if (sec_usage_check(sbi, GET_SECNO(sbi, segno)))
347 goto next;
348
349 /* skip if whole section is not prefree */
350 for (i = 1; i < sbi->segs_per_sec; i++)
351 if (!test_bit(segno + i, prefree_segmap))
352 goto next;
353
354 /* skip if whole section was not free at the last checkpoint */
355 for (i = 0; i < sbi->segs_per_sec; i++)
356 if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
357 goto next;
358
359 return segno;
360 }
361 return NULL_SEGNO;
362}
363
364static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) 312static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
365{ 313{
366 struct curseg_info *curseg = CURSEG_I(sbi, type); 314 struct curseg_info *curseg = CURSEG_I(sbi, type);
367 unsigned int segno = curseg->segno; 315 unsigned int segno = curseg->segno + 1;
368 struct free_segmap_info *free_i = FREE_I(sbi); 316 struct free_segmap_info *free_i = FREE_I(sbi);
369 317
370 if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec) 318 if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
371 return !test_bit(segno + 1, free_i->free_segmap); 319 return !test_bit(segno, free_i->free_segmap);
372 return 0; 320 return 0;
373} 321}
374 322
@@ -495,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
495 int dir = ALLOC_LEFT; 443 int dir = ALLOC_LEFT;
496 444
497 write_sum_page(sbi, curseg->sum_blk, 445 write_sum_page(sbi, curseg->sum_blk,
498 GET_SUM_BLOCK(sbi, curseg->segno)); 446 GET_SUM_BLOCK(sbi, segno));
499 if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) 447 if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
500 dir = ALLOC_RIGHT; 448 dir = ALLOC_RIGHT;
501 449
@@ -599,11 +547,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
599 goto out; 547 goto out;
600 } 548 }
601 549
602 curseg->next_segno = check_prefree_segments(sbi, type); 550 if (type == CURSEG_WARM_NODE)
603
604 if (curseg->next_segno != NULL_SEGNO)
605 change_curseg(sbi, type, false);
606 else if (type == CURSEG_WARM_NODE)
607 new_curseg(sbi, type, false); 551 new_curseg(sbi, type, false);
608 else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) 552 else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
609 new_curseg(sbi, type, false); 553 new_curseg(sbi, type, false);
@@ -612,7 +556,10 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
612 else 556 else
613 new_curseg(sbi, type, false); 557 new_curseg(sbi, type, false);
614out: 558out:
559#ifdef CONFIG_F2FS_STAT_FS
615 sbi->segment_count[curseg->alloc_type]++; 560 sbi->segment_count[curseg->alloc_type]++;
561#endif
562 return;
616} 563}
617 564
618void allocate_new_segments(struct f2fs_sb_info *sbi) 565void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -795,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
795 742
796 if (S_ISDIR(inode->i_mode)) 743 if (S_ISDIR(inode->i_mode))
797 return CURSEG_HOT_DATA; 744 return CURSEG_HOT_DATA;
798 else if (is_cold_data(page) || is_cold_file(inode)) 745 else if (is_cold_data(page) || file_is_cold(inode))
799 return CURSEG_COLD_DATA; 746 return CURSEG_COLD_DATA;
800 else 747 else
801 return CURSEG_WARM_DATA; 748 return CURSEG_WARM_DATA;
@@ -844,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
844 * because, this function updates a summary entry in the 791 * because, this function updates a summary entry in the
845 * current summary block. 792 * current summary block.
846 */ 793 */
847 __add_sum_entry(sbi, type, sum, curseg->next_blkoff); 794 __add_sum_entry(sbi, type, sum);
848 795
849 mutex_lock(&sit_i->sentry_lock); 796 mutex_lock(&sit_i->sentry_lock);
850 __refresh_next_blkoff(sbi, curseg); 797 __refresh_next_blkoff(sbi, curseg);
798#ifdef CONFIG_F2FS_STAT_FS
851 sbi->block_count[curseg->alloc_type]++; 799 sbi->block_count[curseg->alloc_type]++;
800#endif
852 801
853 /* 802 /*
854 * SIT information should be updated before segment allocation, 803 * SIT information should be updated before segment allocation,
@@ -943,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi,
943 892
944 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 893 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
945 (sbi->blocks_per_seg - 1); 894 (sbi->blocks_per_seg - 1);
946 __add_sum_entry(sbi, type, sum, curseg->next_blkoff); 895 __add_sum_entry(sbi, type, sum);
947 896
948 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 897 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
949 898
@@ -980,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
980 } 929 }
981 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 930 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
982 (sbi->blocks_per_seg - 1); 931 (sbi->blocks_per_seg - 1);
983 __add_sum_entry(sbi, type, sum, curseg->next_blkoff); 932 __add_sum_entry(sbi, type, sum);
984 933
985 /* change the current log to the next block addr in advance */ 934 /* change the current log to the next block addr in advance */
986 if (next_segno != segno) { 935 if (next_segno != segno) {
@@ -1579,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1579{ 1528{
1580 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1529 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1581 struct free_segmap_info *free_i = FREE_I(sbi); 1530 struct free_segmap_info *free_i = FREE_I(sbi);
1582 unsigned int segno = 0, offset = 0; 1531 unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
1583 unsigned short valid_blocks; 1532 unsigned short valid_blocks;
1584 1533
1585 while (segno < TOTAL_SEGS(sbi)) { 1534 while (1) {
1586 /* find dirty segment based on free segmap */ 1535 /* find dirty segment based on free segmap */
1587 segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset); 1536 segno = find_next_inuse(free_i, total_segs, offset);
1588 if (segno >= TOTAL_SEGS(sbi)) 1537 if (segno >= total_segs)
1589 break; 1538 break;
1590 offset = segno + 1; 1539 offset = segno + 1;
1591 valid_blocks = get_valid_blocks(sbi, segno, 0); 1540 valid_blocks = get_valid_blocks(sbi, segno, 0);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8555f7df82c7..75c7dc363e92 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -34,7 +34,7 @@
34static struct kmem_cache *f2fs_inode_cachep; 34static struct kmem_cache *f2fs_inode_cachep;
35 35
36enum { 36enum {
37 Opt_gc_background_off, 37 Opt_gc_background,
38 Opt_disable_roll_forward, 38 Opt_disable_roll_forward,
39 Opt_discard, 39 Opt_discard,
40 Opt_noheap, 40 Opt_noheap,
@@ -46,7 +46,7 @@ enum {
46}; 46};
47 47
48static match_table_t f2fs_tokens = { 48static match_table_t f2fs_tokens = {
49 {Opt_gc_background_off, "background_gc_off"}, 49 {Opt_gc_background, "background_gc=%s"},
50 {Opt_disable_roll_forward, "disable_roll_forward"}, 50 {Opt_disable_roll_forward, "disable_roll_forward"},
51 {Opt_discard, "discard"}, 51 {Opt_discard, "discard"},
52 {Opt_noheap, "no_heap"}, 52 {Opt_noheap, "no_heap"},
@@ -76,6 +76,91 @@ static void init_once(void *foo)
76 inode_init_once(&fi->vfs_inode); 76 inode_init_once(&fi->vfs_inode);
77} 77}
78 78
79static int parse_options(struct super_block *sb, char *options)
80{
81 struct f2fs_sb_info *sbi = F2FS_SB(sb);
82 substring_t args[MAX_OPT_ARGS];
83 char *p, *name;
84 int arg = 0;
85
86 if (!options)
87 return 0;
88
89 while ((p = strsep(&options, ",")) != NULL) {
90 int token;
91 if (!*p)
92 continue;
93 /*
94 * Initialize args struct so we know whether arg was
95 * found; some options take optional arguments.
96 */
97 args[0].to = args[0].from = NULL;
98 token = match_token(p, f2fs_tokens, args);
99
100 switch (token) {
101 case Opt_gc_background:
102 name = match_strdup(&args[0]);
103
104 if (!name)
105 return -ENOMEM;
106 if (!strncmp(name, "on", 2))
107 set_opt(sbi, BG_GC);
108 else if (!strncmp(name, "off", 3))
109 clear_opt(sbi, BG_GC);
110 else {
111 kfree(name);
112 return -EINVAL;
113 }
114 kfree(name);
115 break;
116 case Opt_disable_roll_forward:
117 set_opt(sbi, DISABLE_ROLL_FORWARD);
118 break;
119 case Opt_discard:
120 set_opt(sbi, DISCARD);
121 break;
122 case Opt_noheap:
123 set_opt(sbi, NOHEAP);
124 break;
125#ifdef CONFIG_F2FS_FS_XATTR
126 case Opt_nouser_xattr:
127 clear_opt(sbi, XATTR_USER);
128 break;
129#else
130 case Opt_nouser_xattr:
131 f2fs_msg(sb, KERN_INFO,
132 "nouser_xattr options not supported");
133 break;
134#endif
135#ifdef CONFIG_F2FS_FS_POSIX_ACL
136 case Opt_noacl:
137 clear_opt(sbi, POSIX_ACL);
138 break;
139#else
140 case Opt_noacl:
141 f2fs_msg(sb, KERN_INFO, "noacl options not supported");
142 break;
143#endif
144 case Opt_active_logs:
145 if (args->from && match_int(args, &arg))
146 return -EINVAL;
147 if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
148 return -EINVAL;
149 sbi->active_logs = arg;
150 break;
151 case Opt_disable_ext_identify:
152 set_opt(sbi, DISABLE_EXT_IDENTIFY);
153 break;
154 default:
155 f2fs_msg(sb, KERN_ERR,
156 "Unrecognized mount option \"%s\" or missing value",
157 p);
158 return -EINVAL;
159 }
160 }
161 return 0;
162}
163
79static struct inode *f2fs_alloc_inode(struct super_block *sb) 164static struct inode *f2fs_alloc_inode(struct super_block *sb)
80{ 165{
81 struct f2fs_inode_info *fi; 166 struct f2fs_inode_info *fi;
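
parse_options() moves above its first user (it is now needed by remount as well) and the old background_gc_off token becomes background_gc=%s, parsed with match_token()/match_strdup(). Those helpers live in linux/parser.h; a userspace analogue of the same strsep() loop, matching prefixes by hand:

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct opts { bool bg_gc; bool discard; };

static int parse_options(char *options, struct opts *o)
{
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;		/* skip empty tokens */
		if (!strncmp(p, "background_gc=", 14)) {
			const char *val = p + 14;

			if (!strcmp(val, "on"))
				o->bg_gc = true;
			else if (!strcmp(val, "off"))
				o->bg_gc = false;
			else
				return -1;	/* -EINVAL in the kernel */
		} else if (!strcmp(p, "discard")) {
			o->discard = true;
		} else {
			fprintf(stderr, "unrecognized option \"%s\"\n", p);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	char line[] = "background_gc=on,discard";	/* strsep writes */
	struct opts o = { 0 };
	int err = parse_options(line, &o);

	printf("err=%d bg_gc=%d discard=%d\n", err, o.bg_gc, o.discard);
	return 0;
}
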
@@ -112,6 +197,17 @@ static int f2fs_drop_inode(struct inode *inode)
112 return generic_drop_inode(inode); 197 return generic_drop_inode(inode);
113} 198}
114 199
200/*
201 * f2fs_dirty_inode() is called from __mark_inode_dirty()
202 *
203 * We should call set_dirty_inode to write the dirty inode through write_inode.
204 */
205static void f2fs_dirty_inode(struct inode *inode, int flags)
206{
207 set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
208 return;
209}
210
115static void f2fs_i_callback(struct rcu_head *head) 211static void f2fs_i_callback(struct rcu_head *head)
116{ 212{
117 struct inode *inode = container_of(head, struct inode, i_rcu); 213 struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -170,7 +266,7 @@ static int f2fs_freeze(struct super_block *sb)
170{ 266{
171 int err; 267 int err;
172 268
173 if (sb->s_flags & MS_RDONLY) 269 if (f2fs_readonly(sb))
174 return 0; 270 return 0;
175 271
176 err = f2fs_sync_fs(sb, 1); 272 err = f2fs_sync_fs(sb, 1);
@@ -214,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
214{ 310{
215 struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); 311 struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
216 312
217 if (test_opt(sbi, BG_GC)) 313 if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
218 seq_puts(seq, ",background_gc_on"); 314 seq_printf(seq, ",background_gc=%s", "on");
219 else 315 else
220 seq_puts(seq, ",background_gc_off"); 316 seq_printf(seq, ",background_gc=%s", "off");
221 if (test_opt(sbi, DISABLE_ROLL_FORWARD)) 317 if (test_opt(sbi, DISABLE_ROLL_FORWARD))
222 seq_puts(seq, ",disable_roll_forward"); 318 seq_puts(seq, ",disable_roll_forward");
223 if (test_opt(sbi, DISCARD)) 319 if (test_opt(sbi, DISCARD))
@@ -244,11 +340,64 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
244 return 0; 340 return 0;
245} 341}
246 342
343static int f2fs_remount(struct super_block *sb, int *flags, char *data)
344{
345 struct f2fs_sb_info *sbi = F2FS_SB(sb);
346 struct f2fs_mount_info org_mount_opt;
347 int err, active_logs;
348
349 /*
350 * Save the old mount options in case we
351 * need to restore them.
352 */
353 org_mount_opt = sbi->mount_opt;
354 active_logs = sbi->active_logs;
355
356 /* parse mount options */
357 err = parse_options(sb, data);
358 if (err)
359 goto restore_opts;
360
361 /*
362 * Previous and new state of filesystem is RO,
363 * so no point in checking GC conditions.
364 */
365 if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
366 goto skip;
367
368 /*
369 * We stop the GC thread if FS is mounted as RO
370 * or if background_gc = off is passed in mount
371 * option. Also sync the filesystem.
372 */
373 if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
374 if (sbi->gc_thread) {
375 stop_gc_thread(sbi);
376 f2fs_sync_fs(sb, 1);
377 }
378 } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
379 err = start_gc_thread(sbi);
380 if (err)
381 goto restore_opts;
382 }
383skip:
384 /* Update the POSIXACL Flag */
385 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
386 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
387 return 0;
388
389restore_opts:
390 sbi->mount_opt = org_mount_opt;
391 sbi->active_logs = active_logs;
392 return err;
393}
394
247static struct super_operations f2fs_sops = { 395static struct super_operations f2fs_sops = {
248 .alloc_inode = f2fs_alloc_inode, 396 .alloc_inode = f2fs_alloc_inode,
249 .drop_inode = f2fs_drop_inode, 397 .drop_inode = f2fs_drop_inode,
250 .destroy_inode = f2fs_destroy_inode, 398 .destroy_inode = f2fs_destroy_inode,
251 .write_inode = f2fs_write_inode, 399 .write_inode = f2fs_write_inode,
400 .dirty_inode = f2fs_dirty_inode,
252 .show_options = f2fs_show_options, 401 .show_options = f2fs_show_options,
253 .evict_inode = f2fs_evict_inode, 402 .evict_inode = f2fs_evict_inode,
254 .put_super = f2fs_put_super, 403 .put_super = f2fs_put_super,
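
f2fs_remount() snapshots mount_opt and active_logs before reparsing, so a bad option string on remount leaves the live configuration untouched. The save/apply/restore shape in isolation:

#include <stdio.h>

struct config { int bg_gc; int active_logs; };

static int parse(struct config *c, const char *data)	/* stub */
{
	if (!data)
		return -1;	/* simulate a bad option string */
	c->bg_gc = 0;
	return 0;
}

static int remount(struct config *live, const char *data)
{
	struct config saved = *live;	/* snapshot current options */
	int err = parse(live, data);

	if (err)
		*live = saved;		/* roll back on any failure */
	return err;
}

int main(void)
{
	struct config c = { .bg_gc = 1, .active_logs = 6 };

	remount(&c, NULL);		/* fails; options must survive */
	printf("bg_gc=%d active_logs=%d\n", c.bg_gc, c.active_logs);
	return 0;
}
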
@@ -256,6 +405,7 @@ static struct super_operations f2fs_sops = {
256 .freeze_fs = f2fs_freeze, 405 .freeze_fs = f2fs_freeze,
257 .unfreeze_fs = f2fs_unfreeze, 406 .unfreeze_fs = f2fs_unfreeze,
258 .statfs = f2fs_statfs, 407 .statfs = f2fs_statfs,
408 .remount_fs = f2fs_remount,
259}; 409};
260 410
261static struct inode *f2fs_nfs_get_inode(struct super_block *sb, 411static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -303,79 +453,6 @@ static const struct export_operations f2fs_export_ops = {
303 .get_parent = f2fs_get_parent, 453 .get_parent = f2fs_get_parent,
304}; 454};
305 455
306static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
307 char *options)
308{
309 substring_t args[MAX_OPT_ARGS];
310 char *p;
311 int arg = 0;
312
313 if (!options)
314 return 0;
315
316 while ((p = strsep(&options, ",")) != NULL) {
317 int token;
318 if (!*p)
319 continue;
320 /*
321 * Initialize args struct so we know whether arg was
322 * found; some options take optional arguments.
323 */
324 args[0].to = args[0].from = NULL;
325 token = match_token(p, f2fs_tokens, args);
326
327 switch (token) {
328 case Opt_gc_background_off:
329 clear_opt(sbi, BG_GC);
330 break;
331 case Opt_disable_roll_forward:
332 set_opt(sbi, DISABLE_ROLL_FORWARD);
333 break;
334 case Opt_discard:
335 set_opt(sbi, DISCARD);
336 break;
337 case Opt_noheap:
338 set_opt(sbi, NOHEAP);
339 break;
340#ifdef CONFIG_F2FS_FS_XATTR
341 case Opt_nouser_xattr:
342 clear_opt(sbi, XATTR_USER);
343 break;
344#else
345 case Opt_nouser_xattr:
346 f2fs_msg(sb, KERN_INFO,
347 "nouser_xattr options not supported");
348 break;
349#endif
350#ifdef CONFIG_F2FS_FS_POSIX_ACL
351 case Opt_noacl:
352 clear_opt(sbi, POSIX_ACL);
353 break;
354#else
355 case Opt_noacl:
356 f2fs_msg(sb, KERN_INFO, "noacl options not supported");
357 break;
358#endif
359 case Opt_active_logs:
360 if (args->from && match_int(args, &arg))
361 return -EINVAL;
362 if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
363 return -EINVAL;
364 sbi->active_logs = arg;
365 break;
366 case Opt_disable_ext_identify:
367 set_opt(sbi, DISABLE_EXT_IDENTIFY);
368 break;
369 default:
370 f2fs_msg(sb, KERN_ERR,
371 "Unrecognized mount option \"%s\" or missing value",
372 p);
373 return -EINVAL;
374 }
375 }
376 return 0;
377}
378
379static loff_t max_file_size(unsigned bits) 456static loff_t max_file_size(unsigned bits)
380{ 457{
381 loff_t result = ADDRS_PER_INODE; 458 loff_t result = ADDRS_PER_INODE;
@@ -541,6 +618,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
541 if (err) 618 if (err)
542 goto free_sb_buf; 619 goto free_sb_buf;
543 } 620 }
621 sb->s_fs_info = sbi;
544 /* init some FS parameters */ 622 /* init some FS parameters */
545 sbi->active_logs = NR_CURSEG_TYPE; 623 sbi->active_logs = NR_CURSEG_TYPE;
546 624
@@ -553,7 +631,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
553 set_opt(sbi, POSIX_ACL); 631 set_opt(sbi, POSIX_ACL);
554#endif 632#endif
555 /* parse mount options */ 633 /* parse mount options */
556 err = parse_options(sb, sbi, (char *)data); 634 err = parse_options(sb, (char *)data);
557 if (err) 635 if (err)
558 goto free_sb_buf; 636 goto free_sb_buf;
559 637
@@ -565,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
565 sb->s_xattr = f2fs_xattr_handlers; 643 sb->s_xattr = f2fs_xattr_handlers;
566 sb->s_export_op = &f2fs_export_ops; 644 sb->s_export_op = &f2fs_export_ops;
567 sb->s_magic = F2FS_SUPER_MAGIC; 645 sb->s_magic = F2FS_SUPER_MAGIC;
568 sb->s_fs_info = sbi;
569 sb->s_time_gran = 1; 646 sb->s_time_gran = 1;
570 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 647 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
571 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); 648 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
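
Hoisting the sb->s_fs_info assignment to before option parsing is what lets parse_options() drop its sbi argument: once s_fs_info is set, the sb_info can be recovered from the superblock alone. The accessor in f2fs.h is essentially:

	static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
	{
		return sb->s_fs_info;
	}
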
@@ -674,10 +751,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
674 "Cannot recover all fsync data errno=%ld", err); 751 "Cannot recover all fsync data errno=%ld", err);
675 } 752 }
676 753
677 /* After POR, we can run background GC thread */
678 err = start_gc_thread(sbi);
679 if (err)
680 goto fail;
754 /*
755 * If filesystem is not mounted as read-only then
756 * do start the gc_thread.
757 */
758 if (!(sb->s_flags & MS_RDONLY)) {
759 /* After POR, we can run background GC thread.*/
760 err = start_gc_thread(sbi);
761 if (err)
762 goto fail;
763 }
681 764
682 err = f2fs_build_stats(sbi); 765 err = f2fs_build_stats(sbi);
683 if (err) 766 if (err)
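
With remount support, the background GC thread is now started only for read-write mounts; a read-only filesystem has no dirty segments for the cleaner to migrate and must not be written to. The remount path added by this patch (not shown in these hunks) has to mirror the check. A sketch of the shape, not the literal f2fs_remount() body, which also re-parses options and handles rollback:

	if (*flags & MS_RDONLY) {
		stop_gc_thread(sbi);		/* going read-only */
	} else if (sb->s_flags & MS_RDONLY) {
		err = start_gc_thread(sbi);	/* going read-write */
		if (err)
			return err;
	}
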
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 0b02dce31356..3ab07ecd86ca 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/f2fs_fs.h> 22#include <linux/f2fs_fs.h>
23#include <linux/security.h>
23#include "f2fs.h" 24#include "f2fs.h"
24#include "xattr.h" 25#include "xattr.h"
25 26
@@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
43 prefix = XATTR_TRUSTED_PREFIX; 44 prefix = XATTR_TRUSTED_PREFIX;
44 prefix_len = XATTR_TRUSTED_PREFIX_LEN; 45 prefix_len = XATTR_TRUSTED_PREFIX_LEN;
45 break; 46 break;
47 case F2FS_XATTR_INDEX_SECURITY:
48 prefix = XATTR_SECURITY_PREFIX;
49 prefix_len = XATTR_SECURITY_PREFIX_LEN;
50 break;
46 default: 51 default:
47 return -EINVAL; 52 return -EINVAL;
48 } 53 }
@@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
50 total_len = prefix_len + name_len + 1; 55 total_len = prefix_len + name_len + 1;
51 if (list && total_len <= list_size) { 56 if (list && total_len <= list_size) {
52 memcpy(list, prefix, prefix_len); 57 memcpy(list, prefix, prefix_len);
53 memcpy(list+prefix_len, name, name_len); 58 memcpy(list + prefix_len, name, name_len);
54 list[prefix_len + name_len] = '\0'; 59 list[prefix_len + name_len] = '\0';
55 } 60 }
56 return total_len; 61 return total_len;
@@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
70 if (!capable(CAP_SYS_ADMIN)) 75 if (!capable(CAP_SYS_ADMIN))
71 return -EPERM; 76 return -EPERM;
72 break; 77 break;
78 case F2FS_XATTR_INDEX_SECURITY:
79 break;
73 default: 80 default:
74 return -EINVAL; 81 return -EINVAL;
75 } 82 }
76 if (strcmp(name, "") == 0) 83 if (strcmp(name, "") == 0)
77 return -EINVAL; 84 return -EINVAL;
78 return f2fs_getxattr(dentry->d_inode, type, name,
79 buffer, size);
85 return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
80} 86}
81 87
82static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, 88static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
93 if (!capable(CAP_SYS_ADMIN)) 99 if (!capable(CAP_SYS_ADMIN))
94 return -EPERM; 100 return -EPERM;
95 break; 101 break;
102 case F2FS_XATTR_INDEX_SECURITY:
103 break;
96 default: 104 default:
97 return -EINVAL; 105 return -EINVAL;
98 } 106 }
99 if (strcmp(name, "") == 0) 107 if (strcmp(name, "") == 0)
100 return -EINVAL; 108 return -EINVAL;
101 109
102 return f2fs_setxattr(dentry->d_inode, type, name, value, size); 110 return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
103} 111}
104 112
105static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, 113static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
@@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
145 return 0; 153 return 0;
146} 154}
147 155
156#ifdef CONFIG_F2FS_FS_SECURITY
157static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
158 void *page)
159{
160 const struct xattr *xattr;
161 int err = 0;
162
163 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
164 err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
165 xattr->name, xattr->value,
166 xattr->value_len, (struct page *)page);
167 if (err < 0)
168 break;
169 }
170 return err;
171}
172
173int f2fs_init_security(struct inode *inode, struct inode *dir,
174 const struct qstr *qstr, struct page *ipage)
175{
176 return security_inode_init_security(inode, dir, qstr,
177 &f2fs_initxattrs, ipage);
178}
179#endif
180
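
f2fs_init_security() follows the standard security_inode_init_security() callback pattern: the active LSM computes the security xattrs for the new inode and calls back into f2fs_initxattrs() to persist them, with the still-locked inode page threaded through as the opaque context so the create path does not have to fetch it again. A typical call site would look roughly like this (a sketch; the actual f2fs call sites are in hunks not shown here):

	err = f2fs_init_security(inode, dir, &dentry->d_name, ipage);
	if (err)
		goto fail;	/* tear down the half-built inode */
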
148const struct xattr_handler f2fs_xattr_user_handler = { 181const struct xattr_handler f2fs_xattr_user_handler = {
149 .prefix = XATTR_USER_PREFIX, 182 .prefix = XATTR_USER_PREFIX,
150 .flags = F2FS_XATTR_INDEX_USER, 183 .flags = F2FS_XATTR_INDEX_USER,
@@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
169 .set = f2fs_xattr_advise_set, 202 .set = f2fs_xattr_advise_set,
170}; 203};
171 204
205const struct xattr_handler f2fs_xattr_security_handler = {
206 .prefix = XATTR_SECURITY_PREFIX,
207 .flags = F2FS_XATTR_INDEX_SECURITY,
208 .list = f2fs_xattr_generic_list,
209 .get = f2fs_xattr_generic_get,
210 .set = f2fs_xattr_generic_set,
211};
212
172static const struct xattr_handler *f2fs_xattr_handler_map[] = { 213static const struct xattr_handler *f2fs_xattr_handler_map[] = {
173 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, 214 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
174#ifdef CONFIG_F2FS_FS_POSIX_ACL 215#ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
176 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, 217 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
177#endif 218#endif
178 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, 219 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
220#ifdef CONFIG_F2FS_FS_SECURITY
221 [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
222#endif
179 [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, 223 [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
180}; 224};
181 225
@@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = {
186 &f2fs_xattr_acl_default_handler, 230 &f2fs_xattr_acl_default_handler,
187#endif 231#endif
188 &f2fs_xattr_trusted_handler, 232 &f2fs_xattr_trusted_handler,
233#ifdef CONFIG_F2FS_FS_SECURITY
234 &f2fs_xattr_security_handler,
235#endif
189 &f2fs_xattr_advise_handler, 236 &f2fs_xattr_advise_handler,
190 NULL, 237 NULL,
191}; 238};
@@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
218 return -ENODATA; 265 return -ENODATA;
219 266
220 page = get_node_page(sbi, fi->i_xattr_nid); 267 page = get_node_page(sbi, fi->i_xattr_nid);
268 if (IS_ERR(page))
269 return PTR_ERR(page);
221 base_addr = page_address(page); 270 base_addr = page_address(page);
222 271
223 list_for_each_xattr(entry, base_addr) { 272 list_for_each_xattr(entry, base_addr) {
@@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
268 return 0; 317 return 0;
269 318
270 page = get_node_page(sbi, fi->i_xattr_nid); 319 page = get_node_page(sbi, fi->i_xattr_nid);
320 if (IS_ERR(page))
321 return PTR_ERR(page);
271 base_addr = page_address(page); 322 base_addr = page_address(page);
272 323
273 list_for_each_xattr(entry, base_addr) { 324 list_for_each_xattr(entry, base_addr) {
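
Both lookup paths previously assumed get_node_page() succeeded and handed a potential error pointer straight to page_address(), which would oops on a failed node-page read. The added lines apply the usual error-pointer idiom:

	page = get_node_page(sbi, fi->i_xattr_nid);
	if (IS_ERR(page))
		return PTR_ERR(page);	/* propagate e.g. -EIO */
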
@@ -296,7 +347,7 @@ cleanup:
296} 347}
297 348
298int f2fs_setxattr(struct inode *inode, int name_index, const char *name, 349int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
299 const void *value, size_t value_len) 350 const void *value, size_t value_len, struct page *ipage)
300{ 351{
301 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 352 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
302 struct f2fs_inode_info *fi = F2FS_I(inode); 353 struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -335,7 +386,7 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
335 set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); 386 set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
336 mark_inode_dirty(inode); 387 mark_inode_dirty(inode);
337 388
338 page = new_node_page(&dn, XATTR_NODE_OFFSET); 389 page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
339 if (IS_ERR(page)) { 390 if (IS_ERR(page)) {
340 alloc_nid_failed(sbi, fi->i_xattr_nid); 391 alloc_nid_failed(sbi, fi->i_xattr_nid);
341 fi->i_xattr_nid = 0; 392 fi->i_xattr_nid = 0;
@@ -435,7 +486,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
435 inode->i_ctime = CURRENT_TIME; 486 inode->i_ctime = CURRENT_TIME;
436 clear_inode_flag(fi, FI_ACL_MODE); 487 clear_inode_flag(fi, FI_ACL_MODE);
437 } 488 }
438 update_inode_page(inode); 489 if (ipage)
490 update_inode(inode, ipage);
491 else
492 update_inode_page(inode);
439 mutex_unlock_op(sbi, ilock); 493 mutex_unlock_op(sbi, ilock);
440 494
441 return 0; 495 return 0;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 49c9558305e3..3c0817bef25d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
112extern const struct xattr_handler f2fs_xattr_acl_access_handler; 112extern const struct xattr_handler f2fs_xattr_acl_access_handler;
113extern const struct xattr_handler f2fs_xattr_acl_default_handler; 113extern const struct xattr_handler f2fs_xattr_acl_default_handler;
114extern const struct xattr_handler f2fs_xattr_advise_handler; 114extern const struct xattr_handler f2fs_xattr_advise_handler;
115extern const struct xattr_handler f2fs_xattr_security_handler;
115 116
116extern const struct xattr_handler *f2fs_xattr_handlers[]; 117extern const struct xattr_handler *f2fs_xattr_handlers[];
117 118
118extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
119 const void *value, size_t value_len);
120extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
121 void *buffer, size_t buffer_size);
122extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
123 size_t buffer_size);
119extern int f2fs_setxattr(struct inode *, int, const char *,
120 const void *, size_t, struct page *);
121extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
122extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
124
125#else 123#else
126 124
127#define f2fs_xattr_handlers NULL 125#define f2fs_xattr_handlers NULL
128static inline int f2fs_setxattr(struct inode *inode, int name_index, 126static inline int f2fs_setxattr(struct inode *inode, int name_index,
129 const char *name, const void *value, size_t value_len) 127 const char *name, const void *value, size_t value_len)
130{ 128{
131 return -EOPNOTSUPP; 129 return -EOPNOTSUPP;
132} 130}
@@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
142} 140}
143#endif 141#endif
144 142
143#ifdef CONFIG_F2FS_FS_SECURITY
144extern int f2fs_init_security(struct inode *, struct inode *,
145 const struct qstr *, struct page *);
146#else
147static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
148 const struct qstr *qstr, struct page *ipage)
149{
150 return 0;
151}
152#endif
145#endif /* __F2FS_XATTR_H__ */ 153#endif /* __F2FS_XATTR_H__ */
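
Note that the !CONFIG_F2FS_FS_SECURITY stub returns 0 rather than an error, so callers can invoke f2fs_init_security() unconditionally and the static inline compiles away when the option is off, mirroring the existing !CONFIG_F2FS_FS_XATTR stubs above it.
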
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 7a6f02caf286..3963ede84eb0 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -543,6 +543,7 @@ end_of_dir:
543EXPORT_SYMBOL_GPL(fat_search_long); 543EXPORT_SYMBOL_GPL(fat_search_long);
544 544
545struct fat_ioctl_filldir_callback { 545struct fat_ioctl_filldir_callback {
546 struct dir_context ctx;
546 void __user *dirent; 547 void __user *dirent;
547 int result; 548 int result;
548 /* for dir ioctl */ 549 /* for dir ioctl */
@@ -552,8 +553,9 @@ struct fat_ioctl_filldir_callback {
552 int short_len; 553 int short_len;
553}; 554};
554 555
555static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, 556static int __fat_readdir(struct inode *inode, struct file *file,
556 filldir_t filldir, int short_only, int both) 557 struct dir_context *ctx, int short_only,
558 struct fat_ioctl_filldir_callback *both)
557{ 559{
558 struct super_block *sb = inode->i_sb; 560 struct super_block *sb = inode->i_sb;
559 struct msdos_sb_info *sbi = MSDOS_SB(sb); 561 struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -564,27 +566,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
564 unsigned char bufname[FAT_MAX_SHORT_SIZE]; 566 unsigned char bufname[FAT_MAX_SHORT_SIZE];
565 int isvfat = sbi->options.isvfat; 567 int isvfat = sbi->options.isvfat;
566 const char *fill_name = NULL; 568 const char *fill_name = NULL;
567 unsigned long inum; 569 int fake_offset = 0;
568 unsigned long lpos, dummy, *furrfu = &lpos;
569 loff_t cpos; 570 loff_t cpos;
570 int short_len = 0, fill_len = 0; 571 int short_len = 0, fill_len = 0;
571 int ret = 0; 572 int ret = 0;
572 573
573 mutex_lock(&sbi->s_lock); 574 mutex_lock(&sbi->s_lock);
574 575
575 cpos = filp->f_pos; 576 cpos = ctx->pos;
576 /* Fake . and .. for the root directory. */ 577 /* Fake . and .. for the root directory. */
577 if (inode->i_ino == MSDOS_ROOT_INO) { 578 if (inode->i_ino == MSDOS_ROOT_INO) {
578 while (cpos < 2) {
579 if (filldir(dirent, "..", cpos+1, cpos,
580 MSDOS_ROOT_INO, DT_DIR) < 0)
581 goto out;
582 cpos++;
583 filp->f_pos++;
584 }
585 if (cpos == 2) {
586 dummy = 2;
587 furrfu = &dummy;
579 if (!dir_emit_dots(file, ctx))
580 goto out;
581 if (ctx->pos == 2) {
582 fake_offset = 1;
588 cpos = 0; 583 cpos = 0;
589 } 584 }
590 } 585 }
@@ -619,7 +614,7 @@ parse_record:
619 int status = fat_parse_long(inode, &cpos, &bh, &de, 614 int status = fat_parse_long(inode, &cpos, &bh, &de,
620 &unicode, &nr_slots); 615 &unicode, &nr_slots);
621 if (status < 0) { 616 if (status < 0) {
622 filp->f_pos = cpos; 617 ctx->pos = cpos;
623 ret = status; 618 ret = status;
624 goto out; 619 goto out;
625 } else if (status == PARSE_INVALID) 620 } else if (status == PARSE_INVALID)
@@ -639,6 +634,19 @@ parse_record:
639 /* !both && !short_only, so we don't need shortname. */ 634 /* !both && !short_only, so we don't need shortname. */
640 if (!both) 635 if (!both)
641 goto start_filldir; 636 goto start_filldir;
637
638 short_len = fat_parse_short(sb, de, bufname,
639 sbi->options.dotsOK);
640 if (short_len == 0)
641 goto record_end;
642 /* hack for fat_ioctl_filldir() */
643 both->longname = fill_name;
644 both->long_len = fill_len;
645 both->shortname = bufname;
646 both->short_len = short_len;
647 fill_name = NULL;
648 fill_len = 0;
649 goto start_filldir;
642 } 650 }
643 } 651 }
644 652
@@ -646,28 +654,21 @@ parse_record:
646 if (short_len == 0) 654 if (short_len == 0)
647 goto record_end; 655 goto record_end;
648 656
649 if (nr_slots) {
650 /* hack for fat_ioctl_filldir() */
651 struct fat_ioctl_filldir_callback *p = dirent;
652
653 p->longname = fill_name;
654 p->long_len = fill_len;
655 p->shortname = bufname;
656 p->short_len = short_len;
657 fill_name = NULL;
658 fill_len = 0;
659 } else {
660 fill_name = bufname;
661 fill_len = short_len;
662 }
657 fill_name = bufname;
658 fill_len = short_len;
663 659
664start_filldir: 660start_filldir:
665 lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
666 if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME))
667 inum = inode->i_ino;
668 else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
669 inum = parent_ino(filp->f_path.dentry);
661 if (!fake_offset)
662 ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
663
664 if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
665 if (!dir_emit_dot(file, ctx))
666 goto fill_failed;
667 } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
668 if (!dir_emit_dotdot(file, ctx))
669 goto fill_failed;
670 } else { 670 } else {
671 unsigned long inum;
671 loff_t i_pos = fat_make_i_pos(sb, bh, de); 672 loff_t i_pos = fat_make_i_pos(sb, bh, de);
672 struct inode *tmp = fat_iget(sb, i_pos); 673 struct inode *tmp = fat_iget(sb, i_pos);
673 if (tmp) { 674 if (tmp) {
@@ -675,18 +676,17 @@ start_filldir:
675 iput(tmp); 676 iput(tmp);
676 } else 677 } else
677 inum = iunique(sb, MSDOS_ROOT_INO); 678 inum = iunique(sb, MSDOS_ROOT_INO);
679 if (!dir_emit(ctx, fill_name, fill_len, inum,
680 (de->attr & ATTR_DIR) ? DT_DIR : DT_REG))
681 goto fill_failed;
678 } 682 }
679 683
680 if (filldir(dirent, fill_name, fill_len, *furrfu, inum,
681 (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0)
682 goto fill_failed;
683
684record_end: 684record_end:
685 furrfu = &lpos; 685 fake_offset = 0;
686 filp->f_pos = cpos; 686 ctx->pos = cpos;
687 goto get_new; 687 goto get_new;
688end_of_dir: 688end_of_dir:
689 filp->f_pos = cpos; 689 ctx->pos = cpos;
690fill_failed: 690fill_failed:
691 brelse(bh); 691 brelse(bh);
692 if (unicode) 692 if (unicode)
@@ -696,10 +696,9 @@ out:
696 return ret; 696 return ret;
697} 697}
698 698
699static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) 699static int fat_readdir(struct file *file, struct dir_context *ctx)
700{ 700{
701 struct inode *inode = file_inode(filp);
702 return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
701 return __fat_readdir(file_inode(file), file, ctx, 0, NULL);
703} 702}
704 703
705#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ 704#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
@@ -755,20 +754,25 @@ efault: \
755 754
756FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) 755FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
757 756
758static int fat_ioctl_readdir(struct inode *inode, struct file *filp, 757static int fat_ioctl_readdir(struct inode *inode, struct file *file,
759 void __user *dirent, filldir_t filldir, 758 void __user *dirent, filldir_t filldir,
760 int short_only, int both) 759 int short_only, int both)
761{ 760{
762 struct fat_ioctl_filldir_callback buf;
761 struct fat_ioctl_filldir_callback buf = {
762 .ctx.actor = filldir,
763 .dirent = dirent
764 };
763 int ret; 765 int ret;
764 766
765 buf.dirent = dirent; 767 buf.dirent = dirent;
766 buf.result = 0; 768 buf.result = 0;
767 mutex_lock(&inode->i_mutex); 769 mutex_lock(&inode->i_mutex);
770 buf.ctx.pos = file->f_pos;
768 ret = -ENOENT; 771 ret = -ENOENT;
769 if (!IS_DEADDIR(inode)) { 772 if (!IS_DEADDIR(inode)) {
770 ret = __fat_readdir(inode, filp, &buf, filldir, 773 ret = __fat_readdir(inode, file, &buf.ctx,
771 short_only, both); 774 short_only, both ? &buf : NULL);
775 file->f_pos = buf.ctx.pos;
772 } 776 }
773 mutex_unlock(&inode->i_mutex); 777 mutex_unlock(&inode->i_mutex);
774 if (ret >= 0) 778 if (ret >= 0)
@@ -854,7 +858,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
854const struct file_operations fat_dir_operations = { 858const struct file_operations fat_dir_operations = {
855 .llseek = generic_file_llseek, 859 .llseek = generic_file_llseek,
856 .read = generic_read_dir, 860 .read = generic_read_dir,
857 .readdir = fat_readdir, 861 .iterate = fat_readdir,
858 .unlocked_ioctl = fat_dir_ioctl, 862 .unlocked_ioctl = fat_dir_ioctl,
859#ifdef CONFIG_COMPAT 863#ifdef CONFIG_COMPAT
860 .compat_ioctl = fat_compat_dir_ioctl, 864 .compat_ioctl = fat_compat_dir_ioctl,
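
The fat changes above are part of the tree-wide switch from ->readdir(file, dirent, filldir) to ->iterate(file, ctx): the directory position moves from file->f_pos into struct dir_context, and dir_emit()/dir_emit_dots() replace raw filldir calls. As defined on the VFS side of this series (include/linux/fs.h), dir_emit() is approximately:

	static inline bool dir_emit(struct dir_context *ctx,
				    const char *name, int namelen,
				    u64 ino, unsigned type)
	{
		return ctx->actor(ctx, name, namelen, ctx->pos,
				  ino, type) == 0;
	}

A false return means the callback's buffer is full, so iteration stops with ctx->pos still naming the entry that did not fit.
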
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index dfce656ddb33..5d4513cb1b3c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1229,6 +1229,19 @@ static int fat_read_root(struct inode *inode)
1229 return 0; 1229 return 0;
1230} 1230}
1231 1231
1232static unsigned long calc_fat_clusters(struct super_block *sb)
1233{
1234 struct msdos_sb_info *sbi = MSDOS_SB(sb);
1235
1236 /* Divide first to avoid overflow */
1237 if (sbi->fat_bits != 12) {
1238 unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits;
1239 return ent_per_sec * sbi->fat_length;
1240 }
1241
1242 return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
1243}
1244
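
The helper exists because the old expression multiplied before dividing. A worked example: with 512-byte blocks and a FAT16 fat_length of 0x200000 sectors, fat_length * 512 * 8 = 2^33, which wraps a 32-bit unsigned long, while dividing first gives ent_per_sec = 512 * 8 / 16 = 256 and 256 * 0x200000 = 2^29, comfortably in range. FAT12 keeps the multiply-first form because 512 * 8 / 12 would truncate (341.33 entries per sector) and FAT12 tables are far too small to overflow anyway.
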
1232/* 1245/*
1233 * Read the super block of an MS-DOS FS. 1246 * Read the super block of an MS-DOS FS.
1234 */ 1247 */
@@ -1434,7 +1447,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1434 sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; 1447 sbi->dirty = b->fat16.state & FAT_STATE_DIRTY;
1435 1448
1436 /* check that FAT table does not overflow */ 1449 /* check that FAT table does not overflow */
1437 fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; 1450 fat_clusters = calc_fat_clusters(sb);
1438 total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); 1451 total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
1439 if (total_clusters > MAX_FAT(sb)) { 1452 if (total_clusters > MAX_FAT(sb)) {
1440 if (!silent) 1453 if (!silent)
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 081b759cff83..a783b0e1272a 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,8 +148,7 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
148 * that the existing dentry can be used. The msdos fs routines will 148 * that the existing dentry can be used. The msdos fs routines will
149 * return ENOENT or EINVAL as appropriate. 149 * return ENOENT or EINVAL as appropriate.
150 */ 150 */
151static int msdos_hash(const struct dentry *dentry, const struct inode *inode, 151static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
152 struct qstr *qstr)
153{ 152{
154 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 153 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
155 unsigned char msdos_name[MSDOS_NAME]; 154 unsigned char msdos_name[MSDOS_NAME];
@@ -165,8 +164,7 @@ static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
165 * Compare two msdos names. If either of the names are invalid, 164 * Compare two msdos names. If either of the names are invalid,
166 * we fall back to doing the standard name comparison. 165 * we fall back to doing the standard name comparison.
167 */ 166 */
168static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, 167static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry,
169 const struct dentry *dentry, const struct inode *inode,
170 unsigned int len, const char *str, const struct qstr *name) 168 unsigned int len, const char *str, const struct qstr *name)
171{ 169{
172 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; 170 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 2da952036a3d..6df8d3d885e5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -107,8 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
107 * that the existing dentry can be used. The vfat fs routines will 107 * that the existing dentry can be used. The vfat fs routines will
108 * return ENOENT or EINVAL as appropriate. 108 * return ENOENT or EINVAL as appropriate.
109 */ 109 */
110static int vfat_hash(const struct dentry *dentry, const struct inode *inode, 110static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
111 struct qstr *qstr)
112{ 111{
113 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); 112 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
114 return 0; 113 return 0;
@@ -120,8 +119,7 @@ static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
120 * that the existing dentry can be used. The vfat fs routines will 119 * that the existing dentry can be used. The vfat fs routines will
121 * return ENOENT or EINVAL as appropriate. 120 * return ENOENT or EINVAL as appropriate.
122 */ 121 */
123static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, 122static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
124 struct qstr *qstr)
125{ 123{
126 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; 124 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
127 const unsigned char *name; 125 const unsigned char *name;
@@ -142,8 +140,7 @@ static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
142/* 140/*
143 * Case insensitive compare of two vfat names. 141 * Case insensitive compare of two vfat names.
144 */ 142 */
145static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, 143static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
146 const struct dentry *dentry, const struct inode *inode,
147 unsigned int len, const char *str, const struct qstr *name) 144 unsigned int len, const char *str, const struct qstr *name)
148{ 145{
149 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; 146 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
@@ -162,8 +159,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
162/* 159/*
163 * Case sensitive compare of two vfat names. 160 * Case sensitive compare of two vfat names.
164 */ 161 */
165static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, 162static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry,
166 const struct dentry *dentry, const struct inode *inode,
167 unsigned int len, const char *str, const struct qstr *name) 163 unsigned int len, const char *str, const struct qstr *name)
168{ 164{
169 unsigned int alen, blen; 165 unsigned int alen, blen;
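
These signature changes come from the VFS side of the series: the inode arguments to ->d_hash() and ->d_compare() were dropped because the inode pointer was not stable under RCU-walk. The resulting dentry_operations prototypes (per include/linux/dcache.h after this change) are:

	int (*d_hash)(const struct dentry *, struct qstr *);
	int (*d_compare)(const struct dentry *, const struct dentry *,
			 unsigned int, const char *, const struct qstr *);
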
diff --git a/fs/file_table.c b/fs/file_table.c
index cd4d87a82951..08e719b884ca 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -227,7 +227,7 @@ static void __fput(struct file *file)
227{ 227{
228 struct dentry *dentry = file->f_path.dentry; 228 struct dentry *dentry = file->f_path.dentry;
229 struct vfsmount *mnt = file->f_path.mnt; 229 struct vfsmount *mnt = file->f_path.mnt;
230 struct inode *inode = dentry->d_inode; 230 struct inode *inode = file->f_inode;
231 231
232 might_sleep(); 232 might_sleep();
233 233
@@ -306,17 +306,18 @@ void fput(struct file *file)
306{ 306{
307 if (atomic_long_dec_and_test(&file->f_count)) { 307 if (atomic_long_dec_and_test(&file->f_count)) {
308 struct task_struct *task = current; 308 struct task_struct *task = current;
309 unsigned long flags;
310
309 file_sb_list_del(file); 311 file_sb_list_del(file);
310 if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) {
311 unsigned long flags;
312 spin_lock_irqsave(&delayed_fput_lock, flags);
313 list_add(&file->f_u.fu_list, &delayed_fput_list);
314 schedule_work(&delayed_fput_work);
315 spin_unlock_irqrestore(&delayed_fput_lock, flags);
316 return;
317 }
318 init_task_work(&file->f_u.fu_rcuhead, ____fput);
319 task_work_add(task, &file->f_u.fu_rcuhead, true);
312 if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
313 init_task_work(&file->f_u.fu_rcuhead, ____fput);
314 if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
315 return;
316 }
317 spin_lock_irqsave(&delayed_fput_lock, flags);
318 list_add(&file->f_u.fu_list, &delayed_fput_list);
319 schedule_work(&delayed_fput_work);
320 spin_unlock_irqrestore(&delayed_fput_lock, flags);
320 } 321 }
321} 322}
322 323
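
The restructured fput() inverts the old test so that the common case, process context in a non-kernel thread, is the straight-line path: ____fput() is queued with task_work_add() and runs when the task returns to userspace. task_work_add() fails (returns non-zero) only if the target task is already exiting; that case, like interrupt and kthread context, falls back to the global delayed_fput_list worker. The flags variable moves up a scope because the spin_lock_irqsave() fallback is no longer confined to the if-block.
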
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 664b07a53870..25d4099a4aea 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -49,7 +49,7 @@
49 49
50 50
51static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); 51static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int);
52static int vxfs_readdir(struct file *, void *, filldir_t); 52static int vxfs_readdir(struct file *, struct dir_context *);
53 53
54const struct inode_operations vxfs_dir_inode_ops = { 54const struct inode_operations vxfs_dir_inode_ops = {
55 .lookup = vxfs_lookup, 55 .lookup = vxfs_lookup,
@@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = {
58const struct file_operations vxfs_dir_operations = { 58const struct file_operations vxfs_dir_operations = {
59 .llseek = generic_file_llseek, 59 .llseek = generic_file_llseek,
60 .read = generic_read_dir, 60 .read = generic_read_dir,
61 .readdir = vxfs_readdir, 61 .iterate = vxfs_readdir,
62}; 62};
63 63
64 64
@@ -235,7 +235,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
235 * Zero. 235 * Zero.
236 */ 236 */
237static int 237static int
238vxfs_readdir(struct file *fp, void *retp, filldir_t filler) 238vxfs_readdir(struct file *fp, struct dir_context *ctx)
239{ 239{
240 struct inode *ip = file_inode(fp); 240 struct inode *ip = file_inode(fp);
241 struct super_block *sbp = ip->i_sb; 241 struct super_block *sbp = ip->i_sb;
@@ -243,20 +243,17 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
243 u_long page, npages, block, pblocks, nblocks, offset; 243 u_long page, npages, block, pblocks, nblocks, offset;
244 loff_t pos; 244 loff_t pos;
245 245
246 switch ((long)fp->f_pos) {
247 case 0:
248 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
249 goto out;
250 fp->f_pos++;
251 /* fallthrough */
252 case 1:
253 if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0)
254 goto out;
255 fp->f_pos++;
256 /* fallthrough */
257 }
258
259 pos = fp->f_pos - 2;
246 if (ctx->pos == 0) {
247 if (!dir_emit_dot(fp, ctx))
248 return 0;
249 ctx->pos = 1;
250 }
251 if (ctx->pos == 1) {
252 if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
253 return 0;
254 ctx->pos = 2;
255 }
256 pos = ctx->pos - 2;
260 257
261 if (pos > VXFS_DIRROUND(ip->i_size)) 258 if (pos > VXFS_DIRROUND(ip->i_size))
262 return 0; 259 return 0;
@@ -270,16 +267,16 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
270 block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; 267 block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
271 268
272 for (; page < npages; page++, block = 0) { 269 for (; page < npages; page++, block = 0) {
273 caddr_t kaddr; 270 char *kaddr;
274 struct page *pp; 271 struct page *pp;
275 272
276 pp = vxfs_get_page(ip->i_mapping, page); 273 pp = vxfs_get_page(ip->i_mapping, page);
277 if (IS_ERR(pp)) 274 if (IS_ERR(pp))
278 continue; 275 continue;
279 kaddr = (caddr_t)page_address(pp); 276 kaddr = (char *)page_address(pp);
280 277
281 for (; block <= nblocks && block <= pblocks; block++) { 278 for (; block <= nblocks && block <= pblocks; block++) {
282 caddr_t baddr, limit; 279 char *baddr, *limit;
283 struct vxfs_dirblk *dbp; 280 struct vxfs_dirblk *dbp;
284 struct vxfs_direct *de; 281 struct vxfs_direct *de;
285 282
@@ -292,21 +289,18 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
292 (kaddr + offset) : 289 (kaddr + offset) :
293 (baddr + VXFS_DIRBLKOV(dbp))); 290 (baddr + VXFS_DIRBLKOV(dbp)));
294 291
295 for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { 292 for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
296 int over;
297
298 if (!de->d_reclen) 293 if (!de->d_reclen)
299 break; 294 break;
300 if (!de->d_ino) 295 if (!de->d_ino)
301 continue; 296 continue;
302 297
303 offset = (caddr_t)de - kaddr; 298 offset = (char *)de - kaddr;
304 over = filler(retp, de->d_name, de->d_namelen,
305 ((page << PAGE_CACHE_SHIFT) | offset) + 2,
306 de->d_ino, DT_UNKNOWN);
307 if (over) {
299 ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
300 if (!dir_emit(ctx, de->d_name, de->d_namelen,
301 de->d_ino, DT_UNKNOWN)) {
308 vxfs_put_page(pp); 302 vxfs_put_page(pp);
309 goto done; 303 return 0;
310 } 304 }
311 } 305 }
312 offset = 0; 306 offset = 0;
@@ -314,9 +308,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
314 vxfs_put_page(pp); 308 vxfs_put_page(pp);
315 offset = 0; 309 offset = 0;
316 } 310 }
317 311 ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
318done:
319 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
320out:
321 return 0; 312 return 0;
322} 313}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3be57189efd5..a85ac4e33436 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,6 +45,7 @@ struct wb_writeback_work {
45 unsigned int for_kupdate:1; 45 unsigned int for_kupdate:1;
46 unsigned int range_cyclic:1; 46 unsigned int range_cyclic:1;
47 unsigned int for_background:1; 47 unsigned int for_background:1;
48 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
48 enum wb_reason reason; /* why was writeback initiated? */ 49 enum wb_reason reason; /* why was writeback initiated? */
49 50
50 struct list_head list; /* pending work list */ 51 struct list_head list; /* pending work list */
@@ -443,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
443 /* 444 /*
444 * Make sure to wait on the data before writing out the metadata. 445 * Make sure to wait on the data before writing out the metadata.
445 * This is important for filesystems that modify metadata on data 446 * This is important for filesystems that modify metadata on data
446 * I/O completion. 447 * I/O completion. We don't do it for sync(2) writeback because it has a
448 * separate, external IO completion path and ->sync_fs for guaranteeing
449 * inode metadata is written back correctly.
447 */ 450 */
448 if (wbc->sync_mode == WB_SYNC_ALL) { 451 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
449 int err = filemap_fdatawait(mapping); 452 int err = filemap_fdatawait(mapping);
450 if (ret == 0) 453 if (ret == 0)
451 ret = err; 454 ret = err;
@@ -578,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb,
578 .tagged_writepages = work->tagged_writepages, 581 .tagged_writepages = work->tagged_writepages,
579 .for_kupdate = work->for_kupdate, 582 .for_kupdate = work->for_kupdate,
580 .for_background = work->for_background, 583 .for_background = work->for_background,
584 .for_sync = work->for_sync,
581 .range_cyclic = work->range_cyclic, 585 .range_cyclic = work->range_cyclic,
582 .range_start = 0, 586 .range_start = 0,
583 .range_end = LLONG_MAX, 587 .range_end = LLONG_MAX,
@@ -1362,6 +1366,7 @@ void sync_inodes_sb(struct super_block *sb)
1362 .range_cyclic = 0, 1366 .range_cyclic = 0,
1363 .done = &done, 1367 .done = &done,
1364 .reason = WB_REASON_SYNC, 1368 .reason = WB_REASON_SYNC,
1369 .for_sync = 1,
1365 }; 1370 };
1366 1371
1367 /* Nothing to do? */ 1372 /* Nothing to do? */
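
The new for_sync flag marks WB_SYNC_ALL writeback initiated by sync(2) through sync_inodes_sb(). In that path the per-inode filemap_fdatawait() inside __writeback_single_inode() is pure overhead: sync_inodes_sb() already waits on every inode's pages afterwards (wait_sb_inodes()), and the sync(2) path then calls ->sync_fs() for the metadata, so the inline wait would walk each inode's page tree twice without adding any guarantee.
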
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index b52aed1dca97..f7cff367db7f 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object(
115 struct fscache_object, cookie_link); 115 struct fscache_object, cookie_link);
116 116
117 cache = object->cache; 117 cache = object->cache;
118 if (object->state >= FSCACHE_OBJECT_DYING || 118 if (fscache_object_is_dying(object) ||
119 test_bit(FSCACHE_IOERROR, &cache->flags)) 119 test_bit(FSCACHE_IOERROR, &cache->flags))
120 cache = NULL; 120 cache = NULL;
121 121
@@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache,
224 BUG_ON(!ifsdef); 224 BUG_ON(!ifsdef);
225 225
226 cache->flags = 0; 226 cache->flags = 0;
227 ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
228 ifsdef->state = FSCACHE_OBJECT_ACTIVE;
227 ifsdef->event_mask =
228 ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
229 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
230 __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
229 231
230 if (!tagname) 232 if (!tagname)
231 tagname = cache->identifier; 233 tagname = cache->identifier;
@@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache,
330{ 332{
331 struct fscache_object *object; 333 struct fscache_object *object;
332 334
333 spin_lock(&cache->object_list_lock);
334
335 while (!list_empty(&cache->object_list)) { 335 while (!list_empty(&cache->object_list)) {
336 object = list_entry(cache->object_list.next,
337 struct fscache_object, cache_link);
338 list_move_tail(&object->cache_link, dying_objects);
339
340 _debug("withdraw %p", object->cookie);
341
342 spin_lock(&object->lock);
343 spin_unlock(&cache->object_list_lock);
344 fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
345 spin_unlock(&object->lock);
336 spin_lock(&cache->object_list_lock);
337
338 if (!list_empty(&cache->object_list)) {
339 object = list_entry(cache->object_list.next,
340 struct fscache_object, cache_link);
341 list_move_tail(&object->cache_link, dying_objects);
342
343 _debug("withdraw %p", object->cookie);
344
345 /* This must be done under object_list_lock to prevent
346 * a race with fscache_drop_object().
347 */
348 fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
349 }
346 350
351 spin_unlock(&cache->object_list_lock);
347 cond_resched(); 352 cond_resched();
348 spin_lock(&cache->object_list_lock);
349 } 353 }
350
351 spin_unlock(&cache->object_list_lock);
352} 354}
353 355
354/** 356/**
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index e2cba1f60c21..0e91a3c9fdb2 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie(
95 atomic_set(&cookie->usage, 1); 95 atomic_set(&cookie->usage, 1);
96 atomic_set(&cookie->n_children, 0); 96 atomic_set(&cookie->n_children, 0);
97 97
98 /* We keep the active count elevated until relinquishment to prevent an
99 * attempt to wake up every time the object operations queue quiesces.
100 */
101 atomic_set(&cookie->n_active, 1);
102
98 atomic_inc(&parent->usage); 103 atomic_inc(&parent->usage);
99 atomic_inc(&parent->n_children); 104 atomic_inc(&parent->n_children);
100 105
@@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
177 182
178 cookie->flags = 183 cookie->flags =
179 (1 << FSCACHE_COOKIE_LOOKING_UP) | 184 (1 << FSCACHE_COOKIE_LOOKING_UP) |
180 (1 << FSCACHE_COOKIE_CREATING) |
181 (1 << FSCACHE_COOKIE_NO_DATA_YET); 185 (1 << FSCACHE_COOKIE_NO_DATA_YET);
182 186
183 /* ask the cache to allocate objects for this cookie and its parent 187 /* ask the cache to allocate objects for this cookie and its parent
@@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
205 209
206 /* initiate the process of looking up all the objects in the chain 210 /* initiate the process of looking up all the objects in the chain
207 * (done by fscache_initialise_object()) */ 211 * (done by fscache_initialise_object()) */
208 fscache_enqueue_object(object); 212 fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
209 213
210 spin_unlock(&cookie->lock); 214 spin_unlock(&cookie->lock);
211 215
@@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
285 289
286object_already_extant: 290object_already_extant:
287 ret = -ENOBUFS; 291 ret = -ENOBUFS;
288 if (object->state >= FSCACHE_OBJECT_DYING) { 292 if (fscache_object_is_dead(object)) {
289 spin_unlock(&cookie->lock); 293 spin_unlock(&cookie->lock);
290 goto error; 294 goto error;
291 } 295 }
@@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
321 ret = -EEXIST; 325 ret = -EEXIST;
322 hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { 326 hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
323 if (p->cache == object->cache) { 327 if (p->cache == object->cache) {
324 if (p->state >= FSCACHE_OBJECT_DYING) 328 if (fscache_object_is_dying(p))
325 ret = -ENOBUFS; 329 ret = -ENOBUFS;
326 goto cant_attach_object; 330 goto cant_attach_object;
327 } 331 }
@@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
332 hlist_for_each_entry(p, &cookie->parent->backing_objects, 336 hlist_for_each_entry(p, &cookie->parent->backing_objects,
333 cookie_link) { 337 cookie_link) {
334 if (p->cache == object->cache) { 338 if (p->cache == object->cache) {
335 if (p->state >= FSCACHE_OBJECT_DYING) { 339 if (fscache_object_is_dying(p)) {
336 ret = -ENOBUFS; 340 ret = -ENOBUFS;
337 spin_unlock(&cookie->parent->lock); 341 spin_unlock(&cookie->parent->lock);
338 goto cant_attach_object; 342 goto cant_attach_object;
@@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
400 object = hlist_entry(cookie->backing_objects.first, 404 object = hlist_entry(cookie->backing_objects.first,
401 struct fscache_object, 405 struct fscache_object,
402 cookie_link); 406 cookie_link);
403 if (object->state < FSCACHE_OBJECT_DYING) 407 if (fscache_object_is_live(object))
404 fscache_raise_event( 408 fscache_raise_event(
405 object, FSCACHE_OBJECT_EV_INVALIDATE); 409 object, FSCACHE_OBJECT_EV_INVALIDATE);
406 } 410 }
@@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie);
467 */ 471 */
468void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) 472void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
469{ 473{
470 struct fscache_cache *cache;
471 struct fscache_object *object; 474 struct fscache_object *object;
472 unsigned long event;
473 475
474 fscache_stat(&fscache_n_relinquishes); 476 fscache_stat(&fscache_n_relinquishes);
475 if (retire) 477 if (retire)
@@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
481 return; 483 return;
482 } 484 }
483 485
484 _enter("%p{%s,%p},%d", 486 _enter("%p{%s,%p,%d},%d",
485 cookie, cookie->def->name, cookie->netfs_data, retire); 487 cookie, cookie->def->name, cookie->netfs_data,
488 atomic_read(&cookie->n_active), retire);
489
490 ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
486 491
487 if (atomic_read(&cookie->n_children) != 0) { 492 if (atomic_read(&cookie->n_children) != 0) {
488 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", 493 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
@@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
490 BUG(); 495 BUG();
491 } 496 }
492 497
493 /* wait for the cookie to finish being instantiated (or to fail) */
494 if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
495 fscache_stat(&fscache_n_relinquishes_waitcrt);
496 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
497 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
498 }
499
500 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
498 /* No further netfs-accessing operations on this cookie permitted */
499 set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
500 if (retire)
501 set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
501 502
502try_again:
503 spin_lock(&cookie->lock); 503 spin_lock(&cookie->lock);
504
505 /* break links with all the active objects */
504 hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
505 fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
506 while (!hlist_empty(&cookie->backing_objects)) {
507 int n_reads;
508 object = hlist_entry(cookie->backing_objects.first,
509 struct fscache_object,
510 cookie_link);
511
512 _debug("RELEASE OBJ%x", object->debug_id);
513
514 set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
515 n_reads = atomic_read(&object->n_reads);
516 if (n_reads) {
517 int n_ops = object->n_ops;
518 int n_in_progress = object->n_in_progress;
519 spin_unlock(&cookie->lock);
520 printk(KERN_ERR "FS-Cache:"
521 " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
522 cookie->def->name,
523 n_reads, n_ops, n_in_progress);
524 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
525 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
526 printk("Wait finished\n");
527 goto try_again;
528 }
529
530 /* detach each cache object from the object cookie */
531 spin_lock(&object->lock);
532 hlist_del_init(&object->cookie_link);
533
534 cache = object->cache;
535 object->cookie = NULL;
536 fscache_raise_event(object, event);
537 spin_unlock(&object->lock);
538
539 if (atomic_dec_and_test(&cookie->usage))
540 /* the cookie refcount shouldn't be reduced to 0 yet */
541 BUG();
542 } 506 }
507 spin_unlock(&cookie->lock);
543 508
544 /* detach pointers back to the netfs */
509 /* Wait for cessation of activity requiring access to the netfs (when
510 * n_active reaches 0).
511 */
512 if (!atomic_dec_and_test(&cookie->n_active))
513 wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
514 TASK_UNINTERRUPTIBLE);
515
516 /* Clear pointers back to the netfs */
545 cookie->netfs_data = NULL; 517 cookie->netfs_data = NULL;
546 cookie->def = NULL; 518 cookie->def = NULL;
547 519 BUG_ON(cookie->stores.rnode);
548 spin_unlock(&cookie->lock);
549 520
550 if (cookie->parent) { 521 if (cookie->parent) {
551 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); 522 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -553,7 +524,7 @@ try_again:
553 atomic_dec(&cookie->parent->n_children); 524 atomic_dec(&cookie->parent->n_children);
554 } 525 }
555 526
556 /* finally dispose of the cookie */ 527 /* Dispose of the netfs's link to the cookie */
557 ASSERTCMP(atomic_read(&cookie->usage), >, 0); 528 ASSERTCMP(atomic_read(&cookie->usage), >, 0);
558 fscache_cookie_put(cookie); 529 fscache_cookie_put(cookie);
559 530
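
The relinquish path now pivots on cookie->n_active: it starts at 1 (see the hunk above and the fsdef/netfs initialisers below), each in-flight netfs operation holds an extra count, and __fscache_relinquish_cookie() drops the initial reference and, if others remain, sleeps on the atomic until it reaches zero. The counterpart helpers from this series live in include/linux/fscache-cache.h; in sketch form:

	static inline bool fscache_use_cookie(struct fscache_object *object)
	{
		struct fscache_cookie *cookie = object->cookie;
		return atomic_inc_not_zero(&cookie->n_active) != 0;
	}

	static inline void fscache_unuse_cookie(struct fscache_object *object)
	{
		struct fscache_cookie *cookie = object->cookie;
		if (atomic_dec_and_test(&cookie->n_active))
			wake_up_atomic_t(&cookie->n_active);
	}
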
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index f5b4baee7352..10a2ade0bdf8 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = {
55 55
56struct fscache_cookie fscache_fsdef_index = { 56struct fscache_cookie fscache_fsdef_index = {
57 .usage = ATOMIC_INIT(1), 57 .usage = ATOMIC_INIT(1),
58 .n_active = ATOMIC_INIT(1),
58 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), 59 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
59 .backing_objects = HLIST_HEAD_INIT, 60 .backing_objects = HLIST_HEAD_INIT,
60 .def = &fscache_fsdef_index_def, 61 .def = &fscache_fsdef_index_def,
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ee38fef4be51..12d505bedb5c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void)
93 93
94extern int fscache_wait_bit(void *); 94extern int fscache_wait_bit(void *);
95extern int fscache_wait_bit_interruptible(void *); 95extern int fscache_wait_bit_interruptible(void *);
96extern int fscache_wait_atomic_t(atomic_t *);
96 97
97/* 98/*
98 * object.c 99 * object.c
99 */ 100 */
100extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
101
102extern void fscache_withdrawing_object(struct fscache_cache *,
103 struct fscache_object *);
104extern void fscache_enqueue_object(struct fscache_object *); 101extern void fscache_enqueue_object(struct fscache_object *);
105 102
106/* 103/*
@@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *);
110extern const struct file_operations fscache_objlist_fops; 107extern const struct file_operations fscache_objlist_fops;
111 108
112extern void fscache_objlist_add(struct fscache_object *); 109extern void fscache_objlist_add(struct fscache_object *);
110extern void fscache_objlist_remove(struct fscache_object *);
113#else 111#else
114#define fscache_objlist_add(object) do {} while(0) 112#define fscache_objlist_add(object) do {} while(0)
113#define fscache_objlist_remove(object) do {} while(0)
115#endif 114#endif
116 115
117/* 116/*
@@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object,
291 unsigned event) 290 unsigned event)
292{ 291{
293 BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); 292 BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
293#if 0
294 printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
295 object->debug_id, object->event_mask, (1 << event));
296#endif
294 if (!test_and_set_bit(event, &object->events) && 297 if (!test_and_set_bit(event, &object->events) &&
295 test_bit(event, &object->event_mask)) 298 test_bit(event, &object->event_mask))
296 fscache_enqueue_object(object); 299 fscache_enqueue_object(object);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index f9d856773f79..7c27907e650c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags)
205 schedule(); 205 schedule();
206 return 0; 206 return 0;
207} 207}
208EXPORT_SYMBOL(fscache_wait_bit);
209 208
210/* 209/*
211 * wait_on_bit() sleep function for interruptible waiting 210 * wait_on_bit() sleep function for interruptible waiting
@@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags)
215 schedule(); 214 schedule();
216 return signal_pending(current); 215 return signal_pending(current);
217} 216}
218EXPORT_SYMBOL(fscache_wait_bit_interruptible); 217
218/*
219 * wait_on_atomic_t() sleep function for uninterruptible waiting
220 */
221int fscache_wait_atomic_t(atomic_t *p)
222{
223 schedule();
224 return 0;
225}
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index e028b8eb1c40..b1bb6117473a 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
40 /* initialise the primary index cookie */ 40 /* initialise the primary index cookie */
41 atomic_set(&netfs->primary_index->usage, 1); 41 atomic_set(&netfs->primary_index->usage, 1);
42 atomic_set(&netfs->primary_index->n_children, 0); 42 atomic_set(&netfs->primary_index->n_children, 0);
43 atomic_set(&netfs->primary_index->n_active, 1);
43 44
44 netfs->primary_index->def = &fscache_fsdef_netfs_def; 45 netfs->primary_index->def = &fscache_fsdef_netfs_def;
45 netfs->primary_index->parent = &fscache_fsdef_index; 46 netfs->primary_index->parent = &fscache_fsdef_index;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index f27c89d17885..e1959efad64f 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj)
70 write_unlock(&fscache_object_list_lock); 70 write_unlock(&fscache_object_list_lock);
71} 71}
72 72
73/** 73/*
74 * fscache_object_destroy - Note that a cache object is about to be destroyed 74 * Remove an object from the object list.
75 * @object: The object to be destroyed
76 *
77 * Note the imminent destruction and deallocation of a cache object record.
78 */ 75 */
79void fscache_object_destroy(struct fscache_object *obj) 76void fscache_objlist_remove(struct fscache_object *obj)
80{ 77{
81 write_lock(&fscache_object_list_lock); 78 write_lock(&fscache_object_list_lock);
82 79
@@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj)
85 82
86 write_unlock(&fscache_object_list_lock); 83 write_unlock(&fscache_object_list_lock);
87} 84}
88EXPORT_SYMBOL(fscache_object_destroy);
89 85
90/* 86/*
91 * find the object in the tree on or after the specified index 87 * find the object in the tree on or after the specified index
@@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
166{ 162{
167 struct fscache_objlist_data *data = m->private; 163 struct fscache_objlist_data *data = m->private;
168 struct fscache_object *obj = v; 164 struct fscache_object *obj = v;
165 struct fscache_cookie *cookie;
169 unsigned long config = data->config; 166 unsigned long config = data->config;
170 uint16_t keylen, auxlen;
171 char _type[3], *type; 167 char _type[3], *type;
172 bool no_cookie;
173 u8 *buf = data->buf, *p; 168 u8 *buf = data->buf, *p;
174 169
175 if ((unsigned long) v == 1) { 170 if ((unsigned long) v == 1) {
176 seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" 171 seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
177 " EM EV F S" 172 " EM EV FL S"
178 " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); 173 " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
179 if (config & (FSCACHE_OBJLIST_CONFIG_KEY | 174 if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
180 FSCACHE_OBJLIST_CONFIG_AUX)) 175 FSCACHE_OBJLIST_CONFIG_AUX))
@@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
193 188
194 if ((unsigned long) v == 2) { 189 if ((unsigned long) v == 2) {
195 seq_puts(m, "======== ======== ==== ===== === === === == =====" 190 seq_puts(m, "======== ======== ==== ===== === === === == ====="
196 " == == = =" 191 " == == == ="
197 " | ================ == == ================"); 192 " | ================ == == ================");
198 if (config & (FSCACHE_OBJLIST_CONFIG_KEY | 193 if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
199 FSCACHE_OBJLIST_CONFIG_AUX)) 194 FSCACHE_OBJLIST_CONFIG_AUX))
@@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
216 } \ 211 } \
217 } while(0) 212 } while(0)
218 213
214 cookie = obj->cookie;
219 if (~config) { 215 if (~config) {
220 FILTER(obj->cookie, 216 FILTER(cookie->def,
221 COOKIE, NOCOOKIE); 217 COOKIE, NOCOOKIE);
222 FILTER(obj->state != FSCACHE_OBJECT_ACTIVE || 218 FILTER(fscache_object_is_active(obj) ||
223 obj->n_ops != 0 || 219 obj->n_ops != 0 ||
224 obj->n_obj_ops != 0 || 220 obj->n_obj_ops != 0 ||
225 obj->flags || 221 obj->flags ||
@@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
235 } 231 }
236 232
237 seq_printf(m, 233 seq_printf(m,
238 "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", 234 "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
239 obj->debug_id, 235 obj->debug_id,
240 obj->parent ? obj->parent->debug_id : -1, 236 obj->parent ? obj->parent->debug_id : -1,
241 fscache_object_states_short[obj->state], 237 obj->state->short_name,
242 obj->n_children, 238 obj->n_children,
243 obj->n_ops, 239 obj->n_ops,
244 obj->n_obj_ops, 240 obj->n_obj_ops,
@@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
250 obj->flags, 246 obj->flags,
251 work_busy(&obj->work)); 247 work_busy(&obj->work));
252 248
253 no_cookie = true;
254 keylen = auxlen = 0;
255 if (obj->cookie) {
256 spin_lock(&obj->lock);
257 if (obj->cookie) {
258 switch (obj->cookie->def->type) {
259 case 0:
260 type = "IX";
261 break;
262 case 1:
263 type = "DT";
264 break;
265 default:
266 sprintf(_type, "%02u",
267 obj->cookie->def->type);
268 type = _type;
269 break;
270 }
271
272 seq_printf(m, "%-16s %s %2lx %16p",
273 obj->cookie->def->name,
274 type,
275 obj->cookie->flags,
276 obj->cookie->netfs_data);
277
278 if (obj->cookie->def->get_key &&
279 config & FSCACHE_OBJLIST_CONFIG_KEY)
280 keylen = obj->cookie->def->get_key(
281 obj->cookie->netfs_data,
282 buf, 400);
283
284 if (obj->cookie->def->get_aux &&
285 config & FSCACHE_OBJLIST_CONFIG_AUX)
286 auxlen = obj->cookie->def->get_aux(
287 obj->cookie->netfs_data,
288 buf + keylen, 512 - keylen);
289
290 no_cookie = false;
291 }
292 spin_unlock(&obj->lock);
293
294 if (!no_cookie && (keylen > 0 || auxlen > 0)) {
249 if (fscache_use_cookie(obj)) {
250 uint16_t keylen = 0, auxlen = 0;
251
252 switch (cookie->def->type) {
253 case 0:
254 type = "IX";
255 break;
256 case 1:
257 type = "DT";
258 break;
259 default:
260 sprintf(_type, "%02u", cookie->def->type);
261 type = _type;
262 break;
263 }
264
265 seq_printf(m, "%-16s %s %2lx %16p",
266 cookie->def->name,
267 type,
268 cookie->flags,
269 cookie->netfs_data);
270
271 if (cookie->def->get_key &&
272 config & FSCACHE_OBJLIST_CONFIG_KEY)
273 keylen = cookie->def->get_key(cookie->netfs_data,
274 buf, 400);
275
276 if (cookie->def->get_aux &&
277 config & FSCACHE_OBJLIST_CONFIG_AUX)
278 auxlen = cookie->def->get_aux(cookie->netfs_data,
279 buf + keylen, 512 - keylen);
280 fscache_unuse_cookie(obj);
281
282 if (keylen > 0 || auxlen > 0) {
295 seq_printf(m, " "); 283 seq_printf(m, " ");
296 for (p = buf; keylen > 0; keylen--) 284 for (p = buf; keylen > 0; keylen--)
297 seq_printf(m, "%02x", *p++); 285 seq_printf(m, "%02x", *p++);
@@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
302 seq_printf(m, "%02x", *p++); 290 seq_printf(m, "%02x", *p++);
303 } 291 }
304 } 292 }
305 }
306 293
307 if (no_cookie)
308 seq_printf(m, "<no_cookie>\n");
309 else
310 seq_printf(m, "\n"); 294 seq_printf(m, "\n");
295 } else {
296 seq_printf(m, "<no_netfs>\n");
297 }
311 return 0; 298 return 0;
312} 299}
313 300
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 50d41c180211..86d75a60b20c 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -15,52 +15,131 @@
15#define FSCACHE_DEBUG_LEVEL COOKIE 15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/prefetch.h>
18#include "internal.h" 19#include "internal.h"
19 20
20const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
21 [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
22 [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
23 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
24 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
25 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
26 [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING",
27 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
28 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
29 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
30 [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
31 [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
32 [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
33 [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
34 [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
21static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
22static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
23static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
24static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
25static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
26static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
27static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
28static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
29static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
30static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
31static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
32static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
33
34#define __STATE_NAME(n) fscache_osm_##n
35#define STATE(n) (&__STATE_NAME(n))
36
37/*
38 * Define a work state. Work states are execution states. No event processing
39 * is performed by them. The function attached to a work state returns a
40 * pointer indicating the next state to which the state machine should
41 * transition. Returning NO_TRANSIT repeats the current state, but goes back
42 * to the scheduler first.
43 */
44#define WORK_STATE(n, sn, f) \
45 const struct fscache_state __STATE_NAME(n) = { \
46 .name = #n, \
47 .short_name = sn, \
48 .work = f \
49 }
50
51/*
52 * Returns from work states.
53 */
54#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
55
56#define NO_TRANSIT ((struct fscache_state *)NULL)
57
58/*
59 * Define a wait state. Wait states are event processing states. No execution
60 * is performed by them. Wait states are just tables of "if event X occurs,
61 * clear it and transition to state Y". The dispatcher returns to the
62 * scheduler if none of the events in which the wait state has an interest are
63 * currently pending.
64 */
65#define WAIT_STATE(n, sn, ...) \
66 const struct fscache_state __STATE_NAME(n) = { \
67 .name = #n, \
68 .short_name = sn, \
69 .work = NULL, \
70 .transitions = { __VA_ARGS__, { 0, NULL } } \
71 }
72
73#define TRANSIT_TO(state, emask) \
74 { .events = (emask), .transit_to = STATE(state) }
75
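(Aside: the WORK_STATE/WAIT_STATE/TRANSIT_TO macros above just build static tables of the two struct types this patch introduces. Below is a self-contained userspace sketch of that table-driven pattern; the xstate/xtransition names and the event bit are invented for illustration and are not the kernel's fscache_state/fscache_transition types.)

#include <stdio.h>

struct xstate;

struct xtransition {
	unsigned long events;             /* event bitmask this entry handles */
	const struct xstate *transit_to;  /* destination state */
};

struct xstate {
	const char *name;
	const struct xstate *(*work)(int event); /* NULL marks a wait state */
	struct xtransition transitions[2];
};

static const struct xstate st_look_up = {
	.name = "LOOK_UP_OBJECT",         /* terminal here, for brevity */
};

/* A work state's handler returns the next state, like transit_to(). */
static const struct xstate *init_work(int event)
{
	printf("INIT ran on event %d\n", event);
	return &st_look_up;
}

static const struct xstate st_init = {
	.name = "INIT_OBJECT", .work = init_work,
};

static const struct xstate st_wait_for_init = {
	.name = "WAIT_FOR_INIT",          /* wait state: table only, no work */
	.transitions = { { 1UL << 0, &st_init }, { 0, NULL } },
};

int main(void)
{
	unsigned long events = 1UL << 0;  /* pretend EV_NEW_CHILD is pending */
	const struct xstate *s = &st_wait_for_init;
	const struct xtransition *t;

	for (t = s->transitions; t->events; t++)
		if (events & t->events) {
			printf("%s -> %s\n", s->name, t->transit_to->name);
			s = t->transit_to;
			break;
		}
	while (s->work)
		s = s->work(0);
	printf("settled in %s\n", s->name);
	return 0;
}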
76/*
77 * The object state machine.
78 */
79static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object);
80static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready);
81static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation);
82static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object);
83static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object);
84static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available);
85static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents);
86
87static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object);
88static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object);
89
90static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure);
91static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object);
92static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents);
93static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object);
94static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL);
95
96static WAIT_STATE(WAIT_FOR_INIT, "?INI",
97 TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
98
99static WAIT_STATE(WAIT_FOR_PARENT, "?PRN",
100 TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY));
101
102static WAIT_STATE(WAIT_FOR_CMD, "?CMD",
103 TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE),
104 TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE),
105 TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
106
107static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR",
108 TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED));
109
110/*
111 * Out-of-band event transition tables. These are for handling unexpected
112 * events, such as an I/O error. If an OOB event occurs, the state machine
113 * clears and disables the event and forces a transition to the nominated work
 114 * state (a currently executing work state will complete first).
115 *
116 * In such a situation, object->state remembers the state the machine should
117 * have been in/gone to and returning NO_TRANSIT returns to that.
118 */
119static const struct fscache_transition fscache_osm_init_oob[] = {
120 TRANSIT_TO(ABORT_INIT,
121 (1 << FSCACHE_OBJECT_EV_ERROR) |
122 (1 << FSCACHE_OBJECT_EV_KILL)),
123 { 0, NULL }
124};
125
126static const struct fscache_transition fscache_osm_lookup_oob[] = {
127 TRANSIT_TO(LOOKUP_FAILURE,
128 (1 << FSCACHE_OBJECT_EV_ERROR) |
129 (1 << FSCACHE_OBJECT_EV_KILL)),
130 { 0, NULL }
35}; 131};
36EXPORT_SYMBOL(fscache_object_states); 132
37 133static const struct fscache_transition fscache_osm_run_oob[] = {
38const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { 134 TRANSIT_TO(KILL_OBJECT,
39 [FSCACHE_OBJECT_INIT] = "INIT", 135 (1 << FSCACHE_OBJECT_EV_ERROR) |
40 [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", 136 (1 << FSCACHE_OBJECT_EV_KILL)),
41 [FSCACHE_OBJECT_CREATING] = "CRTN", 137 { 0, NULL }
42 [FSCACHE_OBJECT_AVAILABLE] = "AVBL",
43 [FSCACHE_OBJECT_ACTIVE] = "ACTV",
44 [FSCACHE_OBJECT_INVALIDATING] = "INVL",
45 [FSCACHE_OBJECT_UPDATING] = "UPDT",
46 [FSCACHE_OBJECT_DYING] = "DYNG",
47 [FSCACHE_OBJECT_LC_DYING] = "LCDY",
48 [FSCACHE_OBJECT_ABORT_INIT] = "ABTI",
49 [FSCACHE_OBJECT_RELEASING] = "RELS",
50 [FSCACHE_OBJECT_RECYCLING] = "RCYC",
51 [FSCACHE_OBJECT_WITHDRAWING] = "WTHD",
52 [FSCACHE_OBJECT_DEAD] = "DEAD",
53}; 138};
54 139
55static int fscache_get_object(struct fscache_object *); 140static int fscache_get_object(struct fscache_object *);
56static void fscache_put_object(struct fscache_object *); 141static void fscache_put_object(struct fscache_object *);
57static void fscache_initialise_object(struct fscache_object *); 142static bool fscache_enqueue_dependents(struct fscache_object *, int);
58static void fscache_lookup_object(struct fscache_object *);
59static void fscache_object_available(struct fscache_object *);
60static void fscache_invalidate_object(struct fscache_object *);
61static void fscache_release_object(struct fscache_object *);
62static void fscache_withdraw_object(struct fscache_object *);
63static void fscache_enqueue_dependents(struct fscache_object *);
64static void fscache_dequeue_object(struct fscache_object *); 143static void fscache_dequeue_object(struct fscache_object *);
65 144
66/* 145/*
@@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
75 object->debug_id, parent->debug_id, parent->n_ops); 154 object->debug_id, parent->debug_id, parent->n_ops);
76 155
77 spin_lock_nested(&parent->lock, 1); 156 spin_lock_nested(&parent->lock, 1);
78 parent->n_ops--;
79 parent->n_obj_ops--; 157 parent->n_obj_ops--;
158 parent->n_ops--;
80 if (parent->n_ops == 0) 159 if (parent->n_ops == 0)
81 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); 160 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
82 spin_unlock(&parent->lock); 161 spin_unlock(&parent->lock);
83} 162}
84 163
85/* 164/*
86 * Notify netfs of invalidation completion. 165 * Object state machine dispatcher.
87 */ 166 */
88static inline void fscache_invalidation_complete(struct fscache_cookie *cookie) 167static void fscache_object_sm_dispatcher(struct fscache_object *object)
89{ 168{
90 if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) 169 const struct fscache_transition *t;
91 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); 170 const struct fscache_state *state, *new_state;
92} 171 unsigned long events, event_mask;
93 172 int event = -1;
94/*
95 * process events that have been sent to an object's state machine
96 * - initiates parent lookup
97 * - does object lookup
98 * - does object creation
99 * - does object recycling and retirement
100 * - does object withdrawal
101 */
102static void fscache_object_state_machine(struct fscache_object *object)
103{
104 enum fscache_object_state new_state;
105 struct fscache_cookie *cookie;
106 int event;
107 173
108 ASSERT(object != NULL); 174 ASSERT(object != NULL);
109 175
110 _enter("{OBJ%x,%s,%lx}", 176 _enter("{OBJ%x,%s,%lx}",
111 object->debug_id, fscache_object_states[object->state], 177 object->debug_id, object->state->name, object->events);
112 object->events); 178
113 179 event_mask = object->event_mask;
114 switch (object->state) { 180restart:
115 /* wait for the parent object to become ready */ 181 object->event_mask = 0; /* Mask normal event handling */
116 case FSCACHE_OBJECT_INIT: 182 state = object->state;
117 object->event_mask = 183restart_masked:
118 FSCACHE_OBJECT_EVENTS_MASK & 184 events = object->events;
119 ~(1 << FSCACHE_OBJECT_EV_CLEARED); 185
120 fscache_initialise_object(object); 186 /* Handle any out-of-band events (typically an error) */
121 goto done; 187 if (events & object->oob_event_mask) {
122 188 _debug("{OBJ%x} oob %lx",
123 /* look up the object metadata on disk */ 189 object->debug_id, events & object->oob_event_mask);
124 case FSCACHE_OBJECT_LOOKING_UP: 190 for (t = object->oob_table; t->events; t++) {
125 fscache_lookup_object(object); 191 if (events & t->events) {
126 goto lookup_transit; 192 state = t->transit_to;
127 193 ASSERT(state->work != NULL);
128 /* create the object metadata on disk */ 194 event = fls(events & t->events) - 1;
129 case FSCACHE_OBJECT_CREATING: 195 __clear_bit(event, &object->oob_event_mask);
130 fscache_lookup_object(object); 196 clear_bit(event, &object->events);
131 goto lookup_transit; 197 goto execute_work_state;
132 198 }
133 /* handle an object becoming available; start pending
134 * operations and queue dependent operations for processing */
135 case FSCACHE_OBJECT_AVAILABLE:
136 fscache_object_available(object);
137 goto active_transit;
138
139 /* normal running state */
140 case FSCACHE_OBJECT_ACTIVE:
141 goto active_transit;
142
143 /* Invalidate an object on disk */
144 case FSCACHE_OBJECT_INVALIDATING:
145 clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
146 fscache_stat(&fscache_n_invalidates_run);
147 fscache_stat(&fscache_n_cop_invalidate_object);
148 fscache_invalidate_object(object);
149 fscache_stat_d(&fscache_n_cop_invalidate_object);
150 fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
151 goto active_transit;
152
153 /* update the object metadata on disk */
154 case FSCACHE_OBJECT_UPDATING:
155 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
156 fscache_stat(&fscache_n_updates_run);
157 fscache_stat(&fscache_n_cop_update_object);
158 object->cache->ops->update_object(object);
159 fscache_stat_d(&fscache_n_cop_update_object);
160 goto active_transit;
161
162 /* handle an object dying during lookup or creation */
163 case FSCACHE_OBJECT_LC_DYING:
164 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
165 fscache_stat(&fscache_n_cop_lookup_complete);
166 object->cache->ops->lookup_complete(object);
167 fscache_stat_d(&fscache_n_cop_lookup_complete);
168
169 spin_lock(&object->lock);
170 object->state = FSCACHE_OBJECT_DYING;
171 cookie = object->cookie;
172 if (cookie) {
173 if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
174 &cookie->flags))
175 wake_up_bit(&cookie->flags,
176 FSCACHE_COOKIE_LOOKING_UP);
177 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
178 &cookie->flags))
179 wake_up_bit(&cookie->flags,
180 FSCACHE_COOKIE_CREATING);
181 } 199 }
182 spin_unlock(&object->lock); 200 }
183 201
184 fscache_done_parent_op(object); 202 /* Wait states are just transition tables */
203 if (!state->work) {
204 if (events & event_mask) {
205 for (t = state->transitions; t->events; t++) {
206 if (events & t->events) {
207 new_state = t->transit_to;
208 event = fls(events & t->events) - 1;
209 clear_bit(event, &object->events);
210 _debug("{OBJ%x} ev %d: %s -> %s",
211 object->debug_id, event,
212 state->name, new_state->name);
213 object->state = state = new_state;
214 goto execute_work_state;
215 }
216 }
185 217
186 /* wait for completion of all active operations on this object 218 /* The event mask didn't include all the tabled bits */
187 * and the death of all child objects of this object */ 219 BUG();
188 case FSCACHE_OBJECT_DYING:
189 dying:
190 clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
191 spin_lock(&object->lock);
192 _debug("dying OBJ%x {%d,%d}",
193 object->debug_id, object->n_ops, object->n_children);
194 if (object->n_ops == 0 && object->n_children == 0) {
195 object->event_mask &=
196 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
197 object->event_mask |=
198 (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
199 (1 << FSCACHE_OBJECT_EV_RETIRE) |
200 (1 << FSCACHE_OBJECT_EV_RELEASE) |
201 (1 << FSCACHE_OBJECT_EV_ERROR);
202 } else {
203 object->event_mask &=
204 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
205 (1 << FSCACHE_OBJECT_EV_RETIRE) |
206 (1 << FSCACHE_OBJECT_EV_RELEASE) |
207 (1 << FSCACHE_OBJECT_EV_ERROR));
208 object->event_mask |=
209 1 << FSCACHE_OBJECT_EV_CLEARED;
210 } 220 }
211 spin_unlock(&object->lock); 221 /* Randomly woke up */
212 fscache_enqueue_dependents(object); 222 goto unmask_events;
213 fscache_start_operations(object);
214 goto terminal_transit;
215
216 /* handle an abort during initialisation */
217 case FSCACHE_OBJECT_ABORT_INIT:
218 _debug("handle abort init %lx", object->events);
219 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
220
221 spin_lock(&object->lock);
222 fscache_dequeue_object(object);
223
224 object->state = FSCACHE_OBJECT_DYING;
225 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
226 &object->cookie->flags))
227 wake_up_bit(&object->cookie->flags,
228 FSCACHE_COOKIE_CREATING);
229 spin_unlock(&object->lock);
230 goto dying;
231
232 /* handle the netfs releasing an object and possibly marking it
233 * obsolete too */
234 case FSCACHE_OBJECT_RELEASING:
235 case FSCACHE_OBJECT_RECYCLING:
236 object->event_mask &=
237 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
238 (1 << FSCACHE_OBJECT_EV_RETIRE) |
239 (1 << FSCACHE_OBJECT_EV_RELEASE) |
240 (1 << FSCACHE_OBJECT_EV_ERROR));
241 fscache_release_object(object);
242 spin_lock(&object->lock);
243 object->state = FSCACHE_OBJECT_DEAD;
244 spin_unlock(&object->lock);
245 fscache_stat(&fscache_n_object_dead);
246 goto terminal_transit;
247
248 /* handle the parent cache of this object being withdrawn from
249 * active service */
250 case FSCACHE_OBJECT_WITHDRAWING:
251 object->event_mask &=
252 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
253 (1 << FSCACHE_OBJECT_EV_RETIRE) |
254 (1 << FSCACHE_OBJECT_EV_RELEASE) |
255 (1 << FSCACHE_OBJECT_EV_ERROR));
256 fscache_withdraw_object(object);
257 spin_lock(&object->lock);
258 object->state = FSCACHE_OBJECT_DEAD;
259 spin_unlock(&object->lock);
260 fscache_stat(&fscache_n_object_dead);
261 goto terminal_transit;
262
263 /* complain about the object being woken up once it is
264 * deceased */
265 case FSCACHE_OBJECT_DEAD:
266 printk(KERN_ERR "FS-Cache:"
267 " Unexpected event in dead state %lx\n",
268 object->events & object->event_mask);
269 BUG();
270
271 default:
272 printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
273 object->state);
274 BUG();
275 }
276
277 /* determine the transition from a lookup state */
278lookup_transit:
279 event = fls(object->events & object->event_mask) - 1;
280 switch (event) {
281 case FSCACHE_OBJECT_EV_WITHDRAW:
282 case FSCACHE_OBJECT_EV_RETIRE:
283 case FSCACHE_OBJECT_EV_RELEASE:
284 case FSCACHE_OBJECT_EV_ERROR:
285 new_state = FSCACHE_OBJECT_LC_DYING;
286 goto change_state;
287 case FSCACHE_OBJECT_EV_INVALIDATE:
288 new_state = FSCACHE_OBJECT_INVALIDATING;
289 goto change_state;
290 case FSCACHE_OBJECT_EV_REQUEUE:
291 goto done;
292 case -1:
293 goto done; /* sleep until event */
294 default:
295 goto unsupported_event;
296 } 223 }
297 224
298 /* determine the transition from an active state */ 225execute_work_state:
299active_transit: 226 _debug("{OBJ%x} exec %s", object->debug_id, state->name);
300 event = fls(object->events & object->event_mask) - 1;
301 switch (event) {
302 case FSCACHE_OBJECT_EV_WITHDRAW:
303 case FSCACHE_OBJECT_EV_RETIRE:
304 case FSCACHE_OBJECT_EV_RELEASE:
305 case FSCACHE_OBJECT_EV_ERROR:
306 new_state = FSCACHE_OBJECT_DYING;
307 goto change_state;
308 case FSCACHE_OBJECT_EV_INVALIDATE:
309 new_state = FSCACHE_OBJECT_INVALIDATING;
310 goto change_state;
311 case FSCACHE_OBJECT_EV_UPDATE:
312 new_state = FSCACHE_OBJECT_UPDATING;
313 goto change_state;
314 case -1:
315 new_state = FSCACHE_OBJECT_ACTIVE;
316 goto change_state; /* sleep until event */
317 default:
318 goto unsupported_event;
319 }
320 227
321 /* determine the transition from a terminal state */ 228 new_state = state->work(object, event);
322terminal_transit: 229 event = -1;
323 event = fls(object->events & object->event_mask) - 1; 230 if (new_state == NO_TRANSIT) {
324 switch (event) { 231 _debug("{OBJ%x} %s notrans", object->debug_id, state->name);
325 case FSCACHE_OBJECT_EV_WITHDRAW: 232 fscache_enqueue_object(object);
326 new_state = FSCACHE_OBJECT_WITHDRAWING; 233 event_mask = object->oob_event_mask;
327 goto change_state; 234 goto unmask_events;
328 case FSCACHE_OBJECT_EV_RETIRE:
329 new_state = FSCACHE_OBJECT_RECYCLING;
330 goto change_state;
331 case FSCACHE_OBJECT_EV_RELEASE:
332 new_state = FSCACHE_OBJECT_RELEASING;
333 goto change_state;
334 case FSCACHE_OBJECT_EV_ERROR:
335 new_state = FSCACHE_OBJECT_WITHDRAWING;
336 goto change_state;
337 case FSCACHE_OBJECT_EV_CLEARED:
338 new_state = FSCACHE_OBJECT_DYING;
339 goto change_state;
340 case -1:
341 goto done; /* sleep until event */
342 default:
343 goto unsupported_event;
344 } 235 }
345 236
346change_state: 237 _debug("{OBJ%x} %s -> %s",
347 spin_lock(&object->lock); 238 object->debug_id, state->name, new_state->name);
348 object->state = new_state; 239 object->state = state = new_state;
349 spin_unlock(&object->lock);
350 240
351done: 241 if (state->work) {
352 _leave(" [->%s]", fscache_object_states[object->state]); 242 if (unlikely(state->work == ((void *)2UL))) {
353 return; 243 _leave(" [dead]");
244 return;
245 }
246 goto restart_masked;
247 }
354 248
355unsupported_event: 249 /* Transited to wait state */
356 printk(KERN_ERR "FS-Cache:" 250 event_mask = object->oob_event_mask;
357 " Unsupported event %d [%lx/%lx] in state %s\n", 251 for (t = state->transitions; t->events; t++)
358 event, object->events, object->event_mask, 252 event_mask |= t->events;
359 fscache_object_states[object->state]); 253
360 BUG(); 254unmask_events:
255 object->event_mask = event_mask;
256 smp_mb();
257 events = object->events;
258 if (events & event_mask)
259 goto restart;
260 _leave(" [msk %lx]", event_mask);
361} 261}
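(Aside: when several events are pending, the dispatcher above picks the highest-numbered one with fls() and clears it before transiting. A minimal userspace rendition of that selection step, with fls() replaced by a portable loop and the bit numbers invented:)

#include <stdio.h>

/* Portable stand-in for the kernel's fls(): position of the highest set
 * bit, 1-based; 0 when no bit is set. */
static int my_fls(unsigned long x)
{
	int r = 0;
	while (x) { x >>= 1; r++; }
	return r;
}

int main(void)
{
	unsigned long events = (1UL << 2) | (1UL << 5); /* two pending events */
	unsigned long table  = (1UL << 5) | (1UL << 7); /* bits a state handles */

	while (events & table) {
		int event = my_fls(events & table) - 1; /* highest bit wins */
		events &= ~(1UL << event);              /* like clear_bit() */
		printf("dispatching event %d, %#lx still pending\n",
		       event, events);
	}
	return 0;
}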
362 262
363/* 263/*
364 * execute an object 264 * execute an object
365 */ 265 */
366void fscache_object_work_func(struct work_struct *work) 266static void fscache_object_work_func(struct work_struct *work)
367{ 267{
368 struct fscache_object *object = 268 struct fscache_object *object =
369 container_of(work, struct fscache_object, work); 269 container_of(work, struct fscache_object, work);
@@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work)
372 _enter("{OBJ%x}", object->debug_id); 272 _enter("{OBJ%x}", object->debug_id);
373 273
374 start = jiffies; 274 start = jiffies;
375 fscache_object_state_machine(object); 275 fscache_object_sm_dispatcher(object);
376 fscache_hist(fscache_objs_histogram, start); 276 fscache_hist(fscache_objs_histogram, start);
377 if (object->events & object->event_mask)
378 fscache_enqueue_object(object);
379 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
380 fscache_put_object(object); 277 fscache_put_object(object);
381} 278}
382EXPORT_SYMBOL(fscache_object_work_func); 279
280/**
281 * fscache_object_init - Initialise a cache object description
282 * @object: Object description
283 * @cookie: Cookie object will be attached to
284 * @cache: Cache in which backing object will be found
285 *
286 * Initialise a cache object description to its basic values.
287 *
288 * See Documentation/filesystems/caching/backend-api.txt for a complete
289 * description.
290 */
291void fscache_object_init(struct fscache_object *object,
292 struct fscache_cookie *cookie,
293 struct fscache_cache *cache)
294{
295 const struct fscache_transition *t;
296
297 atomic_inc(&cache->object_count);
298
299 object->state = STATE(WAIT_FOR_INIT);
300 object->oob_table = fscache_osm_init_oob;
301 object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
302 spin_lock_init(&object->lock);
303 INIT_LIST_HEAD(&object->cache_link);
304 INIT_HLIST_NODE(&object->cookie_link);
305 INIT_WORK(&object->work, fscache_object_work_func);
306 INIT_LIST_HEAD(&object->dependents);
307 INIT_LIST_HEAD(&object->dep_link);
308 INIT_LIST_HEAD(&object->pending_ops);
309 object->n_children = 0;
310 object->n_ops = object->n_in_progress = object->n_exclusive = 0;
311 object->events = 0;
312 object->store_limit = 0;
313 object->store_limit_l = 0;
314 object->cache = cache;
315 object->cookie = cookie;
316 object->parent = NULL;
317
318 object->oob_event_mask = 0;
319 for (t = object->oob_table; t->events; t++)
320 object->oob_event_mask |= t->events;
321 object->event_mask = object->oob_event_mask;
322 for (t = object->state->transitions; t->events; t++)
323 object->event_mask |= t->events;
324}
325EXPORT_SYMBOL(fscache_object_init);
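(Aside: fscache_object_init() above derives the initial event_mask by OR-ing the OOB table with the initial wait state's transition table. The same fold in miniature, with made-up mask values:)

#include <stdio.h>

struct tr { unsigned long events; };

/* OR together every entry of a { 0 }-terminated transition table. */
static unsigned long mask_of(const struct tr *t)
{
	unsigned long m = 0;
	for (; t->events; t++)
		m |= t->events;
	return m;
}

int main(void)
{
	static const struct tr oob[]  = { { 0x3 }, { 0 } };  /* e.g. ERROR|KILL */
	static const struct tr wait[] = { { 0x80 }, { 0 } }; /* e.g. NEW_CHILD */

	printf("event_mask = %#lx\n", mask_of(oob) | mask_of(wait)); /* 0x83 */
	return 0;
}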
326
327/*
328 * Abort object initialisation before we start it.
329 */
330static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
331 int event)
332{
333 _enter("{OBJ%x},%d", object->debug_id, event);
334
335 object->oob_event_mask = 0;
336 fscache_dequeue_object(object);
337 return transit_to(KILL_OBJECT);
338}
383 339
384/* 340/*
385 * initialise an object 341 * initialise an object
@@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func);
387 * immediately to do a creation 343 * immediately to do a creation
388 * - we may need to start the process of creating a parent and we need to wait 344 * - we may need to start the process of creating a parent and we need to wait
389 * for the parent's lookup and creation to complete if it's not there yet 345 * for the parent's lookup and creation to complete if it's not there yet
390 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
391 * leaf-most cookies of the object and all its children
392 */ 346 */
393static void fscache_initialise_object(struct fscache_object *object) 347static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
348 int event)
394{ 349{
395 struct fscache_object *parent; 350 struct fscache_object *parent;
351 bool success;
396 352
397 _enter(""); 353 _enter("{OBJ%x},%d", object->debug_id, event);
398 ASSERT(object->cookie != NULL);
399 ASSERT(object->cookie->parent != NULL);
400
401 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
402 (1 << FSCACHE_OBJECT_EV_RELEASE) |
403 (1 << FSCACHE_OBJECT_EV_RETIRE) |
404 (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
405 _debug("abort init %lx", object->events);
406 spin_lock(&object->lock);
407 object->state = FSCACHE_OBJECT_ABORT_INIT;
408 spin_unlock(&object->lock);
409 return;
410 }
411 354
412 spin_lock(&object->cookie->lock); 355 ASSERT(list_empty(&object->dep_link));
413 spin_lock_nested(&object->cookie->parent->lock, 1);
414 356
415 parent = object->parent; 357 parent = object->parent;
416 if (!parent) { 358 if (!parent) {
417 _debug("no parent"); 359 _leave(" [no parent]");
418 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); 360 return transit_to(DROP_OBJECT);
419 } else { 361 }
420 spin_lock(&object->lock);
421 spin_lock_nested(&parent->lock, 1);
422 _debug("parent %s", fscache_object_states[parent->state]);
423
424 if (parent->state >= FSCACHE_OBJECT_DYING) {
425 _debug("bad parent");
426 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
427 } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
428 _debug("wait");
429
430 /* we may get woken up in this state by child objects
431 * binding on to us, so we need to make sure we don't
432 * add ourself to the list multiple times */
433 if (list_empty(&object->dep_link)) {
434 fscache_stat(&fscache_n_cop_grab_object);
435 object->cache->ops->grab_object(object);
436 fscache_stat_d(&fscache_n_cop_grab_object);
437 list_add(&object->dep_link,
438 &parent->dependents);
439
440 /* fscache_acquire_non_index_cookie() uses this
441 * to wake the chain up */
442 if (parent->state == FSCACHE_OBJECT_INIT)
443 fscache_enqueue_object(parent);
444 }
445 } else {
446 _debug("go");
447 parent->n_ops++;
448 parent->n_obj_ops++;
449 object->lookup_jif = jiffies;
450 object->state = FSCACHE_OBJECT_LOOKING_UP;
451 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
452 }
453 362
454 spin_unlock(&parent->lock); 363 _debug("parent: %s of:%lx", parent->state->name, parent->flags);
455 spin_unlock(&object->lock); 364
365 if (fscache_object_is_dying(parent)) {
366 _leave(" [bad parent]");
367 return transit_to(DROP_OBJECT);
456 } 368 }
457 369
458 spin_unlock(&object->cookie->parent->lock); 370 if (fscache_object_is_available(parent)) {
459 spin_unlock(&object->cookie->lock); 371 _leave(" [ready]");
372 return transit_to(PARENT_READY);
373 }
374
375 _debug("wait");
376
377 spin_lock(&parent->lock);
378 fscache_stat(&fscache_n_cop_grab_object);
379 success = false;
380 if (fscache_object_is_live(parent) &&
381 object->cache->ops->grab_object(object)) {
382 list_add(&object->dep_link, &parent->dependents);
383 success = true;
384 }
385 fscache_stat_d(&fscache_n_cop_grab_object);
386 spin_unlock(&parent->lock);
387 if (!success) {
388 _leave(" [grab failed]");
389 return transit_to(DROP_OBJECT);
390 }
391
392 /* fscache_acquire_non_index_cookie() uses this
393 * to wake the chain up */
394 fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
395 _leave(" [wait]");
396 return transit_to(WAIT_FOR_PARENT);
397}
398
399/*
400 * Once the parent object is ready, we should kick off our lookup op.
401 */
402static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
403 int event)
404{
405 struct fscache_object *parent = object->parent;
406
407 _enter("{OBJ%x},%d", object->debug_id, event);
408
409 ASSERT(parent != NULL);
410
411 spin_lock(&parent->lock);
412 parent->n_ops++;
413 parent->n_obj_ops++;
414 object->lookup_jif = jiffies;
415 spin_unlock(&parent->lock);
416
460 _leave(""); 417 _leave("");
418 return transit_to(LOOK_UP_OBJECT);
461} 419}
462 420
463/* 421/*
464 * look an object up in the cache from which it was allocated 422 * look an object up in the cache from which it was allocated
465 * - we hold an "access lock" on the parent object, so the parent object cannot 423 * - we hold an "access lock" on the parent object, so the parent object cannot
466 * be withdrawn by either party till we've finished 424 * be withdrawn by either party till we've finished
467 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
468 * leaf-most cookies of the object and all its children
469 */ 425 */
470static void fscache_lookup_object(struct fscache_object *object) 426static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
427 int event)
471{ 428{
472 struct fscache_cookie *cookie = object->cookie; 429 struct fscache_cookie *cookie = object->cookie;
473 struct fscache_object *parent; 430 struct fscache_object *parent = object->parent;
474 int ret; 431 int ret;
475 432
476 _enter(""); 433 _enter("{OBJ%x},%d", object->debug_id, event);
434
435 object->oob_table = fscache_osm_lookup_oob;
477 436
478 parent = object->parent;
479 ASSERT(parent != NULL); 437 ASSERT(parent != NULL);
480 ASSERTCMP(parent->n_ops, >, 0); 438 ASSERTCMP(parent->n_ops, >, 0);
481 ASSERTCMP(parent->n_obj_ops, >, 0); 439 ASSERTCMP(parent->n_obj_ops, >, 0);
482 440
483 /* make sure the parent is still available */ 441 /* make sure the parent is still available */
484 ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE); 442 ASSERT(fscache_object_is_available(parent));
485 443
486 if (parent->state >= FSCACHE_OBJECT_DYING || 444 if (fscache_object_is_dying(parent) ||
487 test_bit(FSCACHE_IOERROR, &object->cache->flags)) { 445 test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
488 _debug("unavailable"); 446 !fscache_use_cookie(object)) {
489 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); 447 _leave(" [unavailable]");
490 _leave(""); 448 return transit_to(LOOKUP_FAILURE);
491 return;
492 } 449 }
493 450
494 _debug("LOOKUP \"%s/%s\" in \"%s\"", 451 _debug("LOOKUP \"%s\" in \"%s\"",
495 parent->cookie->def->name, cookie->def->name, 452 cookie->def->name, object->cache->tag->name);
496 object->cache->tag->name);
497 453
498 fscache_stat(&fscache_n_object_lookups); 454 fscache_stat(&fscache_n_object_lookups);
499 fscache_stat(&fscache_n_cop_lookup_object); 455 fscache_stat(&fscache_n_cop_lookup_object);
500 ret = object->cache->ops->lookup_object(object); 456 ret = object->cache->ops->lookup_object(object);
501 fscache_stat_d(&fscache_n_cop_lookup_object); 457 fscache_stat_d(&fscache_n_cop_lookup_object);
502 458
503 if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) 459 fscache_unuse_cookie(object);
504 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
505 460
506 if (ret == -ETIMEDOUT) { 461 if (ret == -ETIMEDOUT) {
507 /* probably stuck behind another object, so move this one to 462 /* probably stuck behind another object, so move this one to
508 * the back of the queue */ 463 * the back of the queue */
509 fscache_stat(&fscache_n_object_lookups_timed_out); 464 fscache_stat(&fscache_n_object_lookups_timed_out);
510 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); 465 _leave(" [timeout]");
466 return NO_TRANSIT;
511 } 467 }
512 468
513 _leave(""); 469 if (ret < 0) {
470 _leave(" [error]");
471 return transit_to(LOOKUP_FAILURE);
472 }
473
474 _leave(" [ok]");
475 return transit_to(OBJECT_AVAILABLE);
514} 476}
515 477
516/** 478/**
@@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object)
524{ 486{
525 struct fscache_cookie *cookie = object->cookie; 487 struct fscache_cookie *cookie = object->cookie;
526 488
527 _enter("{OBJ%x,%s}", 489 _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
528 object->debug_id, fscache_object_states[object->state]);
529 490
530 spin_lock(&object->lock); 491 if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
531 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
532 fscache_stat(&fscache_n_object_lookups_negative); 492 fscache_stat(&fscache_n_object_lookups_negative);
533 493
534 /* transit here to allow write requests to begin stacking up 494 /* Allow write requests to begin stacking up and read requests to begin
535 * and read requests to begin returning ENODATA */ 495 * returning ENODATA.
536 object->state = FSCACHE_OBJECT_CREATING; 496 */
537 spin_unlock(&object->lock);
538
539 set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
540 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); 497 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
541 498
542 _debug("wake up lookup %p", &cookie->flags); 499 _debug("wake up lookup %p", &cookie->flags);
543 smp_mb__before_clear_bit(); 500 clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
544 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
545 smp_mb__after_clear_bit();
546 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); 501 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
547 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
548 } else {
549 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
550 spin_unlock(&object->lock);
551 } 502 }
552
553 _leave(""); 503 _leave("");
554} 504}
555EXPORT_SYMBOL(fscache_object_lookup_negative); 505EXPORT_SYMBOL(fscache_object_lookup_negative);
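(Aside: the hunk above folds the smp_mb__before/after_clear_bit() pair plus clear_bit() into clear_bit_unlock(), which clears the bit with release semantics. A userspace analogue of that pairing using C11 atomics; single-threaded here purely to show the ordering contract:)

#include <stdatomic.h>
#include <stdio.h>

#define LOOKING_UP 0

static atomic_ulong cookie_flags = 1UL << LOOKING_UP;

int main(void)
{
	/* Publisher: a release-ordered clear makes every write issued before
	 * it visible to an acquire load that observes the bit as clear. */
	atomic_fetch_and_explicit(&cookie_flags, ~(1UL << LOOKING_UP),
				  memory_order_release);

	/* Waiter side: the acquire load pairs with the release above. */
	if (!(atomic_load_explicit(&cookie_flags, memory_order_acquire) &
	      (1UL << LOOKING_UP)))
		printf("lookup complete, safe to read its results\n");
	return 0;
}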
@@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object)
568{ 518{
569 struct fscache_cookie *cookie = object->cookie; 519 struct fscache_cookie *cookie = object->cookie;
570 520
571 _enter("{OBJ%x,%s}", 521 _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
572 object->debug_id, fscache_object_states[object->state]);
573 522
574 /* if we were still looking up, then we must have a positive lookup 523 /* if we were still looking up, then we must have a positive lookup
575 * result, in which case there may be data available */ 524 * result, in which case there may be data available */
576 spin_lock(&object->lock); 525 if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
577 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
578 fscache_stat(&fscache_n_object_lookups_positive); 526 fscache_stat(&fscache_n_object_lookups_positive);
579 527
580 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); 528 /* We do (presumably) have data */
529 clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
581 530
582 object->state = FSCACHE_OBJECT_AVAILABLE; 531 /* Allow write requests to begin stacking up and read requests
583 spin_unlock(&object->lock); 532 * to begin shovelling data.
584 533 */
585 smp_mb__before_clear_bit(); 534 clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
586 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
587 smp_mb__after_clear_bit();
588 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); 535 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
589 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
590 } else { 536 } else {
591 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
592 fscache_stat(&fscache_n_object_created); 537 fscache_stat(&fscache_n_object_created);
593
594 object->state = FSCACHE_OBJECT_AVAILABLE;
595 spin_unlock(&object->lock);
596 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
597 smp_wmb();
598 } 538 }
599 539
600 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) 540 set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
601 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
602
603 _leave(""); 541 _leave("");
604} 542}
605EXPORT_SYMBOL(fscache_obtained_object); 543EXPORT_SYMBOL(fscache_obtained_object);
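(Aside: both lookup completion paths now gate on test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, ...) so exactly one caller does the flag clearing and wake-up. A userspace sketch of that claim-once idiom via fetch_or:)

#include <stdatomic.h>
#include <stdio.h>

#define IS_LOOKED_UP 0

static atomic_ulong obj_flags;

/* Like test_and_set_bit(): fetch_or returns the old value, so exactly one
 * caller observes the bit as previously clear and claims the work. */
static void report(const char *path)
{
	unsigned long old = atomic_fetch_or(&obj_flags, 1UL << IS_LOOKED_UP);
	printf("%s: %s\n", path,
	       (old & (1UL << IS_LOOKED_UP)) ? "already handled" : "claimed");
}

int main(void)
{
	report("negative lookup");   /* claimed */
	report("positive lookup");   /* already handled */
	return 0;
}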
@@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object);
607/* 545/*
608 * handle an object that has just become available 546 * handle an object that has just become available
609 */ 547 */
610static void fscache_object_available(struct fscache_object *object) 548static const struct fscache_state *fscache_object_available(struct fscache_object *object,
549 int event)
611{ 550{
612 _enter("{OBJ%x}", object->debug_id); 551 _enter("{OBJ%x},%d", object->debug_id, event);
613 552
614 spin_lock(&object->lock); 553 object->oob_table = fscache_osm_run_oob;
615 554
616 if (object->cookie && 555 spin_lock(&object->lock);
617 test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
618 wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
619 556
620 fscache_done_parent_op(object); 557 fscache_done_parent_op(object);
621 if (object->n_in_progress == 0) { 558 if (object->n_in_progress == 0) {
@@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object)
631 fscache_stat(&fscache_n_cop_lookup_complete); 568 fscache_stat(&fscache_n_cop_lookup_complete);
632 object->cache->ops->lookup_complete(object); 569 object->cache->ops->lookup_complete(object);
633 fscache_stat_d(&fscache_n_cop_lookup_complete); 570 fscache_stat_d(&fscache_n_cop_lookup_complete);
634 fscache_enqueue_dependents(object);
635 571
636 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); 572 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
637 fscache_stat(&fscache_n_object_avail); 573 fscache_stat(&fscache_n_object_avail);
638 574
639 _leave(""); 575 _leave("");
576 return transit_to(JUMPSTART_DEPS);
640} 577}
641 578
642/* 579/*
643 * drop an object's attachments 580 * Wake up this object's dependent objects now that we've become available.
644 */ 581 */
645static void fscache_drop_object(struct fscache_object *object) 582static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
583 int event)
646{ 584{
647 struct fscache_object *parent = object->parent; 585 _enter("{OBJ%x},%d", object->debug_id, event);
648 struct fscache_cache *cache = object->cache;
649 586
650 _enter("{OBJ%x,%d}", object->debug_id, object->n_children); 587 if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
588 return NO_TRANSIT; /* Not finished; requeue */
589 return transit_to(WAIT_FOR_CMD);
590}
651 591
652 ASSERTCMP(object->cookie, ==, NULL); 592/*
653	ASSERT(hlist_unhashed(&object->cookie_link)); 593 * Handle lookup or creation failure.
594 */
595static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
596 int event)
597{
598 struct fscache_cookie *cookie;
654 599
655 spin_lock(&cache->object_list_lock); 600 _enter("{OBJ%x},%d", object->debug_id, event);
656 list_del_init(&object->cache_link);
657 spin_unlock(&cache->object_list_lock);
658 601
659 fscache_stat(&fscache_n_cop_drop_object); 602 object->oob_event_mask = 0;
660 cache->ops->drop_object(object);
661 fscache_stat_d(&fscache_n_cop_drop_object);
662 603
663 if (parent) { 604 fscache_stat(&fscache_n_cop_lookup_complete);
664 _debug("release parent OBJ%x {%d}", 605 object->cache->ops->lookup_complete(object);
665 parent->debug_id, parent->n_children); 606 fscache_stat_d(&fscache_n_cop_lookup_complete);
666 607
667 spin_lock(&parent->lock); 608 cookie = object->cookie;
668 parent->n_children--; 609 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
669 if (parent->n_children == 0) 610 if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
670 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); 611 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
671 spin_unlock(&parent->lock); 612
672 object->parent = NULL; 613 fscache_done_parent_op(object);
614 return transit_to(KILL_OBJECT);
615}
616
617/*
618 * Wait for completion of all active operations on this object and the death of
619 * all child objects of this object.
620 */
621static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
622 int event)
623{
624 _enter("{OBJ%x,%d,%d},%d",
625 object->debug_id, object->n_ops, object->n_children, event);
626
627 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
628 object->oob_event_mask = 0;
629
630 if (list_empty(&object->dependents) &&
631 object->n_ops == 0 &&
632 object->n_children == 0)
633 return transit_to(DROP_OBJECT);
634
635 if (object->n_in_progress == 0) {
636 spin_lock(&object->lock);
637 if (object->n_ops > 0 && object->n_in_progress == 0)
638 fscache_start_operations(object);
639 spin_unlock(&object->lock);
673 } 640 }
674 641
675 /* this just shifts the object release to the work processor */ 642 if (!list_empty(&object->dependents))
676 fscache_put_object(object); 643 return transit_to(KILL_DEPENDENTS);
677 644
678 _leave(""); 645 return transit_to(WAIT_FOR_CLEARANCE);
679} 646}
680 647
681/* 648/*
682 * release or recycle an object that the netfs has discarded 649 * Kill dependent objects.
683 */ 650 */
684static void fscache_release_object(struct fscache_object *object) 651static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
652 int event)
685{ 653{
686 _enter(""); 654 _enter("{OBJ%x},%d", object->debug_id, event);
687 655
688 fscache_drop_object(object); 656 if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
657 return NO_TRANSIT; /* Not finished */
658 return transit_to(WAIT_FOR_CLEARANCE);
689} 659}
690 660
691/* 661/*
692 * withdraw an object from active service 662 * Drop an object's attachments
693 */ 663 */
694static void fscache_withdraw_object(struct fscache_object *object) 664static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
665 int event)
695{ 666{
696 struct fscache_cookie *cookie; 667 struct fscache_object *parent = object->parent;
697 bool detached; 668 struct fscache_cookie *cookie = object->cookie;
669 struct fscache_cache *cache = object->cache;
670 bool awaken = false;
698 671
699 _enter(""); 672 _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
700 673
701 spin_lock(&object->lock); 674 ASSERT(cookie != NULL);
702 cookie = object->cookie; 675 ASSERT(!hlist_unhashed(&object->cookie_link));
703 if (cookie) {
704 /* need to get the cookie lock before the object lock, starting
705 * from the object pointer */
706 atomic_inc(&cookie->usage);
707 spin_unlock(&object->lock);
708 676
709 detached = false; 677 /* Make sure the cookie no longer points here and that the netfs isn't
710 spin_lock(&cookie->lock); 678 * waiting for us.
711 spin_lock(&object->lock); 679 */
680 spin_lock(&cookie->lock);
681 hlist_del_init(&object->cookie_link);
682 if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
683 awaken = true;
684 spin_unlock(&cookie->lock);
712 685
713 if (object->cookie == cookie) { 686 if (awaken)
714 hlist_del_init(&object->cookie_link); 687 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
715 object->cookie = NULL;
716 fscache_invalidation_complete(cookie);
717 detached = true;
718 }
719 spin_unlock(&cookie->lock);
720 fscache_cookie_put(cookie);
721 if (detached)
722 fscache_cookie_put(cookie);
723 }
724 688
689 /* Prevent a race with our last child, which has to signal EV_CLEARED
690 * before dropping our spinlock.
691 */
692 spin_lock(&object->lock);
725 spin_unlock(&object->lock); 693 spin_unlock(&object->lock);
726 694
727 fscache_drop_object(object); 695 /* Discard from the cache's collection of objects */
728} 696 spin_lock(&cache->object_list_lock);
697 list_del_init(&object->cache_link);
698 spin_unlock(&cache->object_list_lock);
729 699
730/* 700 fscache_stat(&fscache_n_cop_drop_object);
731 * withdraw an object from active service at the behest of the cache 701 cache->ops->drop_object(object);
732 * - need break the links to a cached object cookie 702 fscache_stat_d(&fscache_n_cop_drop_object);
733 * - called under two situations:
734 * (1) recycler decides to reclaim an in-use object
735 * (2) a cache is unmounted
736 * - have to take care as the cookie can be being relinquished by the netfs
737 * simultaneously
738 * - the object is pinned by the caller holding a refcount on it
739 */
740void fscache_withdrawing_object(struct fscache_cache *cache,
741 struct fscache_object *object)
742{
743 bool enqueue = false;
744 703
745	_enter(",OBJ%x", object->debug_id); 704	/* The parent object wants to know when all its dependents have gone */
705 if (parent) {
706 _debug("release parent OBJ%x {%d}",
707 parent->debug_id, parent->n_children);
746 708
747 spin_lock(&object->lock); 709 spin_lock(&parent->lock);
748 if (object->state < FSCACHE_OBJECT_WITHDRAWING) { 710 parent->n_children--;
749 object->state = FSCACHE_OBJECT_WITHDRAWING; 711 if (parent->n_children == 0)
750 enqueue = true; 712 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
713 spin_unlock(&parent->lock);
714 object->parent = NULL;
751 } 715 }
752 spin_unlock(&object->lock);
753 716
754 if (enqueue) 717 /* this just shifts the object release to the work processor */
755 fscache_enqueue_object(object); 718 fscache_put_object(object);
719 fscache_stat(&fscache_n_object_dead);
756 720
757 _leave(""); 721 _leave("");
722 return transit_to(OBJECT_DEAD);
758} 723}
759 724
760/* 725/*
@@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object)
771} 736}
772 737
773/* 738/*
774 * discard a ref on a work item 739 * Discard a ref on an object
775 */ 740 */
776static void fscache_put_object(struct fscache_object *object) 741static void fscache_put_object(struct fscache_object *object)
777{ 742{
@@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object)
780 fscache_stat_d(&fscache_n_cop_put_object); 745 fscache_stat_d(&fscache_n_cop_put_object);
781} 746}
782 747
748/**
749 * fscache_object_destroy - Note that a cache object is about to be destroyed
750 * @object: The object to be destroyed
751 *
752 * Note the imminent destruction and deallocation of a cache object record.
753 */
754void fscache_object_destroy(struct fscache_object *object)
755{
756 fscache_objlist_remove(object);
757
758 /* We can get rid of the cookie now */
759 fscache_cookie_put(object->cookie);
760 object->cookie = NULL;
761}
762EXPORT_SYMBOL(fscache_object_destroy);
763
783/* 764/*
784 * enqueue an object for metadata-type processing 765 * enqueue an object for metadata-type processing
785 */ 766 */
@@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object)
803 784
804/** 785/**
805 * fscache_object_sleep_till_congested - Sleep until object wq is congested 786 * fscache_object_sleep_till_congested - Sleep until object wq is congested
806 * @timoutp: Scheduler sleep timeout 787 * @timeoutp: Scheduler sleep timeout
807 * 788 *
808 * Allow an object handler to sleep until the object workqueue is congested. 789 * Allow an object handler to sleep until the object workqueue is congested.
809 * 790 *
@@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp)
831EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); 812EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
832 813
833/* 814/*
834 * enqueue the dependents of an object for metadata-type processing 815 * Enqueue the dependents of an object for metadata-type processing.
835 * - the caller must hold the object's lock 816 *
836 * - this may cause an already locked object to wind up being processed again 817 * If we don't manage to finish the list before the scheduler wants to run
818 * again then return false immediately. We return true if the list was
819 * cleared.
837 */ 820 */
838static void fscache_enqueue_dependents(struct fscache_object *object) 821static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
839{ 822{
840 struct fscache_object *dep; 823 struct fscache_object *dep;
824 bool ret = true;
841 825
842 _enter("{OBJ%x}", object->debug_id); 826 _enter("{OBJ%x}", object->debug_id);
843 827
844 if (list_empty(&object->dependents)) 828 if (list_empty(&object->dependents))
845 return; 829 return true;
846 830
847 spin_lock(&object->lock); 831 spin_lock(&object->lock);
848 832
@@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
851 struct fscache_object, dep_link); 835 struct fscache_object, dep_link);
852 list_del_init(&dep->dep_link); 836 list_del_init(&dep->dep_link);
853 837
854 838 fscache_raise_event(dep, event);
855 /* sort onto appropriate lists */
856 fscache_enqueue_object(dep);
857 fscache_put_object(dep); 839 fscache_put_object(dep);
858 840
859 if (!list_empty(&object->dependents)) 841 if (!list_empty(&object->dependents) && need_resched()) {
860 cond_resched_lock(&object->lock); 842 ret = false;
843 break;
844 }
861 } 845 }
862 846
863 spin_unlock(&object->lock); 847 spin_unlock(&object->lock);
848 return ret;
864} 849}
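(Aside: fscache_enqueue_dependents() now stops early when the scheduler wants the CPU back and returns false so the state machine requeues itself. The same bounded-drain shape in miniature, with need_resched() faked as a countdown budget:)

#include <stdio.h>

/* Drain the list, yielding after 'budget' items as a stand-in for
 * need_resched() firing. */
static int drain(int *pending, int budget)
{
	while (*pending > 0) {
		(*pending)--;                 /* "enqueue" one dependent */
		if (*pending > 0 && --budget == 0)
			return 0;             /* not finished; caller requeues */
	}
	return 1;                             /* list cleared */
}

int main(void)
{
	int pending = 5;
	while (!drain(&pending, 2))
		printf("requeued, %d dependents left\n", pending);
	printf("all dependents enqueued\n");
	return 0;
}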
865 850
866/* 851/*
867 * remove an object from whatever queue it's waiting on 852 * remove an object from whatever queue it's waiting on
868 * - the caller must hold object->lock
869 */ 853 */
870void fscache_dequeue_object(struct fscache_object *object) 854static void fscache_dequeue_object(struct fscache_object *object)
871{ 855{
872 _enter("{OBJ%x}", object->debug_id); 856 _enter("{OBJ%x}", object->debug_id);
873 857
@@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object)
886 * @data: The auxiliary data for the object 870 * @data: The auxiliary data for the object
887 * @datalen: The size of the auxiliary data 871 * @datalen: The size of the auxiliary data
888 * 872 *
889 * This function consults the netfs about the coherency state of an object 873 * This function consults the netfs about the coherency state of an object.
874 * The caller must be holding a ref on cookie->n_active (held by
875 * fscache_look_up_object() on behalf of the cache backend during object lookup
876 * and creation).
890 */ 877 */
891enum fscache_checkaux fscache_check_aux(struct fscache_object *object, 878enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
892 const void *data, uint16_t datalen) 879 const void *data, uint16_t datalen)
@@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux);
927/* 914/*
928 * Asynchronously invalidate an object. 915 * Asynchronously invalidate an object.
929 */ 916 */
930static void fscache_invalidate_object(struct fscache_object *object) 917static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
918 int event)
931{ 919{
932 struct fscache_operation *op; 920 struct fscache_operation *op;
933 struct fscache_cookie *cookie = object->cookie; 921 struct fscache_cookie *cookie = object->cookie;
934 922
935 _enter("{OBJ%x}", object->debug_id); 923 _enter("{OBJ%x},%d", object->debug_id, event);
924
925 /* We're going to need the cookie. If the cookie is not available then
926 * retire the object instead.
927 */
928 if (!fscache_use_cookie(object)) {
929 ASSERT(object->cookie->stores.rnode == NULL);
930 set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
931 _leave(" [no cookie]");
932 return transit_to(KILL_OBJECT);
933 }
936 934
937 /* Reject any new read/write ops and abort any that are pending. */ 935 /* Reject any new read/write ops and abort any that are pending. */
938 fscache_invalidate_writes(cookie); 936 fscache_invalidate_writes(cookie);
@@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object)
941 939
942 /* Now we have to wait for in-progress reads and writes */ 940 /* Now we have to wait for in-progress reads and writes */
943 op = kzalloc(sizeof(*op), GFP_KERNEL); 941 op = kzalloc(sizeof(*op), GFP_KERNEL);
944 if (!op) { 942 if (!op)
945 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); 943 goto nomem;
946 _leave(" [ENOMEM]");
947 return;
948 }
949 944
950 fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); 945 fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
951 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); 946 op->flags = FSCACHE_OP_ASYNC |
947 (1 << FSCACHE_OP_EXCLUSIVE) |
948 (1 << FSCACHE_OP_UNUSE_COOKIE);
952 949
953 spin_lock(&cookie->lock); 950 spin_lock(&cookie->lock);
954 if (fscache_submit_exclusive_op(object, op) < 0) 951 if (fscache_submit_exclusive_op(object, op) < 0)
@@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object)
965 /* We can allow read and write requests to come in once again. They'll 962 /* We can allow read and write requests to come in once again. They'll
966 * queue up behind our exclusive invalidation operation. 963 * queue up behind our exclusive invalidation operation.
967 */ 964 */
968 fscache_invalidation_complete(cookie); 965 if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
969 _leave(""); 966 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
970 return; 967 _leave(" [ok]");
968 return transit_to(UPDATE_OBJECT);
969
970nomem:
971 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
972 fscache_unuse_cookie(object);
973 _leave(" [ENOMEM]");
974 return transit_to(KILL_OBJECT);
971 975
972submit_op_failed: 976submit_op_failed:
977 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
973 spin_unlock(&cookie->lock); 978 spin_unlock(&cookie->lock);
974 kfree(op); 979 kfree(op);
975 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
976 _leave(" [EIO]"); 980 _leave(" [EIO]");
981 return transit_to(KILL_OBJECT);
982}
983
984static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
985 int event)
986{
987 const struct fscache_state *s;
988
989 fscache_stat(&fscache_n_invalidates_run);
990 fscache_stat(&fscache_n_cop_invalidate_object);
991 s = _fscache_invalidate_object(object, event);
992 fscache_stat_d(&fscache_n_cop_invalidate_object);
993 return s;
994}
995
996/*
997 * Asynchronously update an object.
998 */
999static const struct fscache_state *fscache_update_object(struct fscache_object *object,
1000 int event)
1001{
1002 _enter("{OBJ%x},%d", object->debug_id, event);
1003
1004 fscache_stat(&fscache_n_updates_run);
1005 fscache_stat(&fscache_n_cop_update_object);
1006 object->cache->ops->update_object(object);
1007 fscache_stat_d(&fscache_n_cop_update_object);
1008
1009 _leave("");
1010 return transit_to(WAIT_FOR_CMD);
977} 1011}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 762a9ec4ffa4..318071aca217 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
35 35
36 ASSERT(list_empty(&op->pend_link)); 36 ASSERT(list_empty(&op->pend_link));
37 ASSERT(op->processor != NULL); 37 ASSERT(op->processor != NULL);
38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); 38 ASSERT(fscache_object_is_available(op->object));
39 ASSERTCMP(atomic_read(&op->usage), >, 0); 39 ASSERTCMP(atomic_read(&op->usage), >, 0);
40 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); 40 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
41 41
@@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
119 /* need to issue a new write op after this */ 119 /* need to issue a new write op after this */
120 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); 120 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
121 ret = 0; 121 ret = 0;
122 } else if (object->state == FSCACHE_OBJECT_CREATING) { 122 } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
123 op->object = object; 123 op->object = object;
124 object->n_ops++; 124 object->n_ops++;
125 object->n_exclusive++; /* reads and writes must wait */ 125 object->n_exclusive++; /* reads and writes must wait */
@@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
144 */ 144 */
145static void fscache_report_unexpected_submission(struct fscache_object *object, 145static void fscache_report_unexpected_submission(struct fscache_object *object,
146 struct fscache_operation *op, 146 struct fscache_operation *op,
147 unsigned long ostate) 147 const struct fscache_state *ostate)
148{ 148{
149 static bool once_only; 149 static bool once_only;
150 struct fscache_operation *p; 150 struct fscache_operation *p;
@@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
155 once_only = true; 155 once_only = true;
156 156
157 kdebug("unexpected submission OP%x [OBJ%x %s]", 157 kdebug("unexpected submission OP%x [OBJ%x %s]",
158 op->debug_id, object->debug_id, 158 op->debug_id, object->debug_id, object->state->name);
159 fscache_object_states[object->state]); 159 kdebug("objstate=%s [%s]", object->state->name, ostate->name);
160 kdebug("objstate=%s [%s]",
161 fscache_object_states[object->state],
162 fscache_object_states[ostate]);
163 kdebug("objflags=%lx", object->flags); 160 kdebug("objflags=%lx", object->flags);
164 kdebug("objevent=%lx [%lx]", object->events, object->event_mask); 161 kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
165 kdebug("ops=%u inp=%u exc=%u", 162 kdebug("ops=%u inp=%u exc=%u",
@@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
190int fscache_submit_op(struct fscache_object *object, 187int fscache_submit_op(struct fscache_object *object,
191 struct fscache_operation *op) 188 struct fscache_operation *op)
192{ 189{
193 unsigned long ostate; 190 const struct fscache_state *ostate;
194 int ret; 191 int ret;
195 192
196 _enter("{OBJ%x OP%x},{%u}", 193 _enter("{OBJ%x OP%x},{%u}",
@@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object,
226 fscache_run_op(object, op); 223 fscache_run_op(object, op);
227 } 224 }
228 ret = 0; 225 ret = 0;
229 } else if (object->state == FSCACHE_OBJECT_CREATING) { 226 } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
230 op->object = object; 227 op->object = object;
231 object->n_ops++; 228 object->n_ops++;
232 atomic_inc(&op->usage); 229 atomic_inc(&op->usage);
233 list_add_tail(&op->pend_link, &object->pending_ops); 230 list_add_tail(&op->pend_link, &object->pending_ops);
234 fscache_stat(&fscache_n_op_pend); 231 fscache_stat(&fscache_n_op_pend);
235 ret = 0; 232 ret = 0;
236 } else if (object->state == FSCACHE_OBJECT_DYING || 233 } else if (fscache_object_is_dying(object)) {
237 object->state == FSCACHE_OBJECT_LC_DYING ||
238 object->state == FSCACHE_OBJECT_WITHDRAWING) {
239 fscache_stat(&fscache_n_op_rejected); 234 fscache_stat(&fscache_n_op_rejected);
240 op->state = FSCACHE_OP_ST_CANCELLED; 235 op->state = FSCACHE_OP_ST_CANCELLED;
241 ret = -ENOBUFS; 236 ret = -ENOBUFS;
@@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object)
265} 260}
266 261
267/* 262/*
268 * jump start the operation processing on an object 263 * Jump start the operation processing on an object. The caller must hold
269 * - caller must hold object->lock 264 * object->lock.
270 */ 265 */
271void fscache_start_operations(struct fscache_object *object) 266void fscache_start_operations(struct fscache_object *object)
272{ 267{
@@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op)
428 423
429 object = op->object; 424 object = op->object;
430 425
431 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) { 426 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
432 if (atomic_dec_and_test(&object->n_reads)) { 427 atomic_dec(&object->n_reads);
433 clear_bit(FSCACHE_COOKIE_WAITING_ON_READS, 428 if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
434 &object->cookie->flags); 429 fscache_unuse_cookie(object);
435 wake_up_bit(&object->cookie->flags,
436 FSCACHE_COOKIE_WAITING_ON_READS);
437 }
438 }
439 430
440 /* now... we may get called with the object spinlock held, so we 431 /* now... we may get called with the object spinlock held, so we
441 * complete the cleanup here only if we can immediately acquire the 432 * complete the cleanup here only if we can immediately acquire the
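Note on the hunks above: comparisons against the enumerated object state are being replaced by flag tests, so a submitter no longer has to list every state past lookup. A minimal sketch of the idea, with illustrative names rather than fscache's real definitions:

        #include <linux/bitops.h>
        #include <linux/types.h>

        enum { OBJ_IS_LOOKED_UP };              /* illustrative bit number */

        struct obj {
                unsigned long flags;
        };

        /* true for every state after lookup has completed */
        static bool obj_can_queue_op(struct obj *o)
        {
                return test_bit(OBJ_IS_LOOKED_UP, &o->flags);
        }

One bit covers what the old code expressed as separate state comparisons; the DYING/LC_DYING/WITHDRAWING triple collapses into fscache_object_is_dying() the same way.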
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ff000e52072d..d479ab3c63e4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -109,7 +109,7 @@ page_busy:
109 * allocator as the work threads writing to the cache may all end up 109 * allocator as the work threads writing to the cache may all end up
110 * sleeping on memory allocation, so we may need to impose a timeout 110 * sleeping on memory allocation, so we may need to impose a timeout
111 * too. */ 111 * too. */
112 if (!(gfp & __GFP_WAIT)) { 112 if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
113 fscache_stat(&fscache_n_store_vmscan_busy); 113 fscache_stat(&fscache_n_store_vmscan_busy);
114 return false; 114 return false;
115 } 115 }
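The widened test above means this reclaim path may only wait on a pending store when the gfp mask allows both sleeping and re-entry into filesystem code. A sketch of the guard, assuming nothing beyond the standard gfp flags of this era:

        #include <linux/gfp.h>
        #include <linux/types.h>

        static bool may_wait_for_store(gfp_t gfp)
        {
                /* must be allowed to sleep AND to recurse into the fs */
                return (gfp & __GFP_WAIT) && (gfp & __GFP_FS);
        }

Callers failing either test get the page reported busy and retry later, instead of sleeping in a context where the cache's own writers could be stuck behind them.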
@@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
163 163
164 fscache_stat(&fscache_n_attr_changed_calls); 164 fscache_stat(&fscache_n_attr_changed_calls);
165 165
166 if (fscache_object_is_active(object)) { 166 if (fscache_object_is_active(object) &&
167 fscache_use_cookie(object)) {
167 fscache_stat(&fscache_n_cop_attr_changed); 168 fscache_stat(&fscache_n_cop_attr_changed);
168 ret = object->cache->ops->attr_changed(object); 169 ret = object->cache->ops->attr_changed(object);
169 fscache_stat_d(&fscache_n_cop_attr_changed); 170 fscache_stat_d(&fscache_n_cop_attr_changed);
171 fscache_unuse_cookie(object);
170 if (ret < 0) 172 if (ret < 0)
171 fscache_abort_object(object); 173 fscache_abort_object(object);
172 } 174 }
@@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
233 235
234 _enter("{OP%x}", op->op.debug_id); 236 _enter("{OP%x}", op->op.debug_id);
235 237
236 ASSERTCMP(op->n_pages, ==, 0); 238 ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
237 239
238 fscache_hist(fscache_retrieval_histogram, op->start_time); 240 fscache_hist(fscache_retrieval_histogram, op->start_time);
239 if (op->context) 241 if (op->context)
@@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
246 * allocate a retrieval op 248 * allocate a retrieval op
247 */ 249 */
248static struct fscache_retrieval *fscache_alloc_retrieval( 250static struct fscache_retrieval *fscache_alloc_retrieval(
251 struct fscache_cookie *cookie,
249 struct address_space *mapping, 252 struct address_space *mapping,
250 fscache_rw_complete_t end_io_func, 253 fscache_rw_complete_t end_io_func,
251 void *context) 254 void *context)
@@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
260 } 263 }
261 264
262 fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); 265 fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
263 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); 266 atomic_inc(&cookie->n_active);
267 op->op.flags = FSCACHE_OP_MYTHREAD |
268 (1UL << FSCACHE_OP_WAITING) |
269 (1UL << FSCACHE_OP_UNUSE_COOKIE);
264 op->mapping = mapping; 270 op->mapping = mapping;
265 op->end_io_func = end_io_func; 271 op->end_io_func = end_io_func;
266 op->context = context; 272 op->context = context;
@@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
310 struct fscache_retrieval *op = 316 struct fscache_retrieval *op =
311 container_of(_op, struct fscache_retrieval, op); 317 container_of(_op, struct fscache_retrieval, op);
312 318
313 op->n_pages = 0; 319 atomic_set(&op->n_pages, 0);
314} 320}
315 321
316/* 322/*
@@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
394 if (fscache_wait_for_deferred_lookup(cookie) < 0) 400 if (fscache_wait_for_deferred_lookup(cookie) < 0)
395 return -ERESTARTSYS; 401 return -ERESTARTSYS;
396 402
397 op = fscache_alloc_retrieval(page->mapping, end_io_func, context); 403 op = fscache_alloc_retrieval(cookie, page->mapping,
404 end_io_func, context);
398 if (!op) { 405 if (!op) {
399 _leave(" = -ENOMEM"); 406 _leave(" = -ENOMEM");
400 return -ENOMEM; 407 return -ENOMEM;
401 } 408 }
402 op->n_pages = 1; 409 atomic_set(&op->n_pages, 1);
403 410
404 spin_lock(&cookie->lock); 411 spin_lock(&cookie->lock);
405 412
@@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
408 object = hlist_entry(cookie->backing_objects.first, 415 object = hlist_entry(cookie->backing_objects.first,
409 struct fscache_object, cookie_link); 416 struct fscache_object, cookie_link);
410 417
411 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); 418 ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
412 419
413 atomic_inc(&object->n_reads); 420 atomic_inc(&object->n_reads);
414 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); 421 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -465,6 +472,7 @@ nobufs_unlock_dec:
465 atomic_dec(&object->n_reads); 472 atomic_dec(&object->n_reads);
466nobufs_unlock: 473nobufs_unlock:
467 spin_unlock(&cookie->lock); 474 spin_unlock(&cookie->lock);
475 atomic_dec(&cookie->n_active);
468 kfree(op); 476 kfree(op);
469nobufs: 477nobufs:
470 fscache_stat(&fscache_n_retrievals_nobufs); 478 fscache_stat(&fscache_n_retrievals_nobufs);
@@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
522 if (fscache_wait_for_deferred_lookup(cookie) < 0) 530 if (fscache_wait_for_deferred_lookup(cookie) < 0)
523 return -ERESTARTSYS; 531 return -ERESTARTSYS;
524 532
525 op = fscache_alloc_retrieval(mapping, end_io_func, context); 533 op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
526 if (!op) 534 if (!op)
527 return -ENOMEM; 535 return -ENOMEM;
528 op->n_pages = *nr_pages; 536 atomic_set(&op->n_pages, *nr_pages);
529 537
530 spin_lock(&cookie->lock); 538 spin_lock(&cookie->lock);
531 539
@@ -589,6 +597,7 @@ nobufs_unlock_dec:
589 atomic_dec(&object->n_reads); 597 atomic_dec(&object->n_reads);
590nobufs_unlock: 598nobufs_unlock:
591 spin_unlock(&cookie->lock); 599 spin_unlock(&cookie->lock);
600 atomic_dec(&cookie->n_active);
592 kfree(op); 601 kfree(op);
593nobufs: 602nobufs:
594 fscache_stat(&fscache_n_retrievals_nobufs); 603 fscache_stat(&fscache_n_retrievals_nobufs);
@@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
631 if (fscache_wait_for_deferred_lookup(cookie) < 0) 640 if (fscache_wait_for_deferred_lookup(cookie) < 0)
632 return -ERESTARTSYS; 641 return -ERESTARTSYS;
633 642
634 op = fscache_alloc_retrieval(page->mapping, NULL, NULL); 643 op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
635 if (!op) 644 if (!op)
636 return -ENOMEM; 645 return -ENOMEM;
637 op->n_pages = 1; 646 atomic_set(&op->n_pages, 1);
638 647
639 spin_lock(&cookie->lock); 648 spin_lock(&cookie->lock);
640 649
@@ -675,6 +684,7 @@ error:
675 684
676nobufs_unlock: 685nobufs_unlock:
677 spin_unlock(&cookie->lock); 686 spin_unlock(&cookie->lock);
687 atomic_dec(&cookie->n_active);
678 kfree(op); 688 kfree(op);
679nobufs: 689nobufs:
680 fscache_stat(&fscache_n_allocs_nobufs); 690 fscache_stat(&fscache_n_allocs_nobufs);
@@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op)
729 */ 739 */
730 spin_unlock(&object->lock); 740 spin_unlock(&object->lock);
731 fscache_op_complete(&op->op, false); 741 fscache_op_complete(&op->op, false);
732 _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}", 742 _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
733 _op->flags, _op->state, object->state, object->flags); 743 _op->flags, _op->state, object->state->short_name,
744 object->flags);
734 return; 745 return;
735 } 746 }
736 747
@@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
796 807
797 _enter(""); 808 _enter("");
798 809
799 while (spin_lock(&cookie->stores_lock), 810 for (;;) {
800 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 811 spin_lock(&cookie->stores_lock);
801 ARRAY_SIZE(results), 812 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
802 FSCACHE_COOKIE_PENDING_TAG), 813 ARRAY_SIZE(results),
803 n > 0) { 814 FSCACHE_COOKIE_PENDING_TAG);
815 if (n == 0) {
816 spin_unlock(&cookie->stores_lock);
817 break;
818 }
819
804 for (i = n - 1; i >= 0; i--) { 820 for (i = n - 1; i >= 0; i--) {
805 page = results[i]; 821 page = results[i];
806 radix_tree_delete(&cookie->stores, page->index); 822 radix_tree_delete(&cookie->stores, page->index);
@@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
812 page_cache_release(results[i]); 828 page_cache_release(results[i]);
813 } 829 }
814 830
815 spin_unlock(&cookie->stores_lock);
816 _leave(""); 831 _leave("");
817} 832}
818 833
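The rewrite above unrolls a while loop whose condition was a comma expression (lock, lookup, test) into an explicit for (;;). The shape, with stand-in names for the lock and the radix-tree lookup:

        for (;;) {
                spin_lock(&lock);
                n = gang_lookup_pending(results, ARRAY_SIZE(results));
                if (n == 0) {
                        spin_unlock(&lock);
                        break;                  /* nothing left pending */
                }
                /* ... delete and release the batch ... */
        }

Behaviour is unchanged; the gain is that the batch-empty exit now unlocks in plain sight rather than relying on the final, failed evaluation of the loop condition having taken the lock.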
@@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
829 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is 844 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
830 * set) 845 * set)
831 * 846 *
832 * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred 847 * (a) no writes yet
833 * fill op)
834 * 848 *
835 * (b) writes deferred till post-creation (mark page for writing and 849 * (b) writes deferred till post-creation (mark page for writing and
836 * return immediately) 850 * return immediately)
837 * 851 *
838 * (2) negative lookup, object created, initial fill being made from netfs 852 * (2) negative lookup, object created, initial fill being made from netfs
839 * (FSCACHE_COOKIE_INITIAL_FILL is set)
840 * 853 *
841 * (a) fill point not yet reached this page (mark page for writing and 854 * (a) fill point not yet reached this page (mark page for writing and
842 * return) 855 * return)
@@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
873 886
874 fscache_operation_init(&op->op, fscache_write_op, 887 fscache_operation_init(&op->op, fscache_write_op,
875 fscache_release_write_op); 888 fscache_release_write_op);
876 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); 889 op->op.flags = FSCACHE_OP_ASYNC |
890 (1 << FSCACHE_OP_WAITING) |
891 (1 << FSCACHE_OP_UNUSE_COOKIE);
877 892
878 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 893 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
879 if (ret < 0) 894 if (ret < 0)
@@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
919 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); 934 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
920 op->store_limit = object->store_limit; 935 op->store_limit = object->store_limit;
921 936
937 atomic_inc(&cookie->n_active);
922 if (fscache_submit_op(object, &op->op) < 0) 938 if (fscache_submit_op(object, &op->op) < 0)
923 goto submit_failed; 939 goto submit_failed;
924 940
@@ -945,6 +961,7 @@ already_pending:
945 return 0; 961 return 0;
946 962
947submit_failed: 963submit_failed:
964 atomic_dec(&cookie->n_active);
948 spin_lock(&cookie->stores_lock); 965 spin_lock(&cookie->stores_lock);
949 radix_tree_delete(&cookie->stores, page->index); 966 radix_tree_delete(&cookie->stores, page->index);
950 spin_unlock(&cookie->stores_lock); 967 spin_unlock(&cookie->stores_lock);
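A recurring shape in this file: an operation that may outlive the submitting call pins the cookie by bumping n_active at allocation time, sets FSCACHE_OP_UNUSE_COOKIE so normal completion drops the pin, and every error path drops it by hand before freeing the op. In outline (illustrative fragment, not a real fscache API):

        atomic_inc(&cookie->n_active);          /* op now pins the cookie */
        op->flags |= 1UL << FSCACHE_OP_UNUSE_COOKIE; /* completion unpins */

        if (submit(op) < 0) {
                atomic_dec(&cookie->n_active);  /* never ran: unpin by hand */
                kfree(op);
        }

Keeping the increment next to the allocation and a decrement on every bail-out label is what the nobufs_unlock and submit_failed hunks above enforce.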
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 254df56b847b..0eda52738ec4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,7 +14,7 @@
14#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) 17static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
18{ 18{
19 struct fuse_conn *fc = get_fuse_conn(dir); 19 struct fuse_conn *fc = get_fuse_conn(dir);
20 struct fuse_inode *fi = get_fuse_inode(dir); 20 struct fuse_inode *fi = get_fuse_inode(dir);
@@ -25,7 +25,7 @@ static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
25 return true; 25 return true;
26 if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) 26 if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
27 return true; 27 return true;
28 if (filp->f_pos == 0) 28 if (ctx->pos == 0)
29 return true; 29 return true;
30 return false; 30 return false;
31} 31}
@@ -180,6 +180,8 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
180static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) 180static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
181{ 181{
182 struct inode *inode; 182 struct inode *inode;
183 struct dentry *parent;
184 struct fuse_conn *fc;
183 185
184 inode = ACCESS_ONCE(entry->d_inode); 186 inode = ACCESS_ONCE(entry->d_inode);
185 if (inode && is_bad_inode(inode)) 187 if (inode && is_bad_inode(inode))
@@ -187,10 +189,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
187 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 189 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
188 int err; 190 int err;
189 struct fuse_entry_out outarg; 191 struct fuse_entry_out outarg;
190 struct fuse_conn *fc;
191 struct fuse_req *req; 192 struct fuse_req *req;
192 struct fuse_forget_link *forget; 193 struct fuse_forget_link *forget;
193 struct dentry *parent;
194 u64 attr_version; 194 u64 attr_version;
195 195
196 /* For negative dentries, always do a fresh lookup */ 196 /* For negative dentries, always do a fresh lookup */
@@ -241,8 +241,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
241 entry_attr_timeout(&outarg), 241 entry_attr_timeout(&outarg),
242 attr_version); 242 attr_version);
243 fuse_change_entry_timeout(entry, &outarg); 243 fuse_change_entry_timeout(entry, &outarg);
244 } else if (inode) {
245 fc = get_fuse_conn(inode);
246 if (fc->readdirplus_auto) {
247 parent = dget_parent(entry);
248 fuse_advise_use_readdirplus(parent->d_inode);
249 dput(parent);
250 }
244 } 251 }
245 fuse_advise_use_readdirplus(inode);
246 return 1; 252 return 1;
247} 253}
248 254
@@ -1159,25 +1165,23 @@ static int fuse_permission(struct inode *inode, int mask)
1159} 1165}
1160 1166
1161static int parse_dirfile(char *buf, size_t nbytes, struct file *file, 1167static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
1162 void *dstbuf, filldir_t filldir) 1168 struct dir_context *ctx)
1163{ 1169{
1164 while (nbytes >= FUSE_NAME_OFFSET) { 1170 while (nbytes >= FUSE_NAME_OFFSET) {
1165 struct fuse_dirent *dirent = (struct fuse_dirent *) buf; 1171 struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
1166 size_t reclen = FUSE_DIRENT_SIZE(dirent); 1172 size_t reclen = FUSE_DIRENT_SIZE(dirent);
1167 int over;
1168 if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) 1173 if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
1169 return -EIO; 1174 return -EIO;
1170 if (reclen > nbytes) 1175 if (reclen > nbytes)
1171 break; 1176 break;
1172 1177
1173 over = filldir(dstbuf, dirent->name, dirent->namelen, 1178 if (!dir_emit(ctx, dirent->name, dirent->namelen,
1174 file->f_pos, dirent->ino, dirent->type); 1179 dirent->ino, dirent->type))
1175 if (over)
1176 break; 1180 break;
1177 1181
1178 buf += reclen; 1182 buf += reclen;
1179 nbytes -= reclen; 1183 nbytes -= reclen;
1180 file->f_pos = dirent->off; 1184 ctx->pos = dirent->off;
1181 } 1185 }
1182 1186
1183 return 0; 1187 return 0;
@@ -1278,7 +1282,7 @@ out:
1278} 1282}
1279 1283
1280static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, 1284static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1281 void *dstbuf, filldir_t filldir, u64 attr_version) 1285 struct dir_context *ctx, u64 attr_version)
1282{ 1286{
1283 struct fuse_direntplus *direntplus; 1287 struct fuse_direntplus *direntplus;
1284 struct fuse_dirent *dirent; 1288 struct fuse_dirent *dirent;
@@ -1303,10 +1307,9 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1303 we need to send a FORGET for each of those 1307 we need to send a FORGET for each of those
1304 which we did not link. 1308 which we did not link.
1305 */ 1309 */
1306 over = filldir(dstbuf, dirent->name, dirent->namelen, 1310 over = !dir_emit(ctx, dirent->name, dirent->namelen,
1307 file->f_pos, dirent->ino, 1311 dirent->ino, dirent->type);
1308 dirent->type); 1312 ctx->pos = dirent->off;
1309 file->f_pos = dirent->off;
1310 } 1313 }
1311 1314
1312 buf += reclen; 1315 buf += reclen;
@@ -1320,7 +1323,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1320 return 0; 1323 return 0;
1321} 1324}
1322 1325
1323static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) 1326static int fuse_readdir(struct file *file, struct dir_context *ctx)
1324{ 1327{
1325 int plus, err; 1328 int plus, err;
1326 size_t nbytes; 1329 size_t nbytes;
@@ -1343,17 +1346,17 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1343 return -ENOMEM; 1346 return -ENOMEM;
1344 } 1347 }
1345 1348
1346 plus = fuse_use_readdirplus(inode, file); 1349 plus = fuse_use_readdirplus(inode, ctx);
1347 req->out.argpages = 1; 1350 req->out.argpages = 1;
1348 req->num_pages = 1; 1351 req->num_pages = 1;
1349 req->pages[0] = page; 1352 req->pages[0] = page;
1350 req->page_descs[0].length = PAGE_SIZE; 1353 req->page_descs[0].length = PAGE_SIZE;
1351 if (plus) { 1354 if (plus) {
1352 attr_version = fuse_get_attr_version(fc); 1355 attr_version = fuse_get_attr_version(fc);
1353 fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, 1356 fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
1354 FUSE_READDIRPLUS); 1357 FUSE_READDIRPLUS);
1355 } else { 1358 } else {
1356 fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, 1359 fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
1357 FUSE_READDIR); 1360 FUSE_READDIR);
1358 } 1361 }
1359 fuse_request_send(fc, req); 1362 fuse_request_send(fc, req);
@@ -1363,11 +1366,11 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1363 if (!err) { 1366 if (!err) {
1364 if (plus) { 1367 if (plus) {
1365 err = parse_dirplusfile(page_address(page), nbytes, 1368 err = parse_dirplusfile(page_address(page), nbytes,
1366 file, dstbuf, filldir, 1369 file, ctx,
1367 attr_version); 1370 attr_version);
1368 } else { 1371 } else {
1369 err = parse_dirfile(page_address(page), nbytes, file, 1372 err = parse_dirfile(page_address(page), nbytes, file,
1370 dstbuf, filldir); 1373 ctx);
1371 } 1374 }
1372 } 1375 }
1373 1376
@@ -1880,7 +1883,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
1880static const struct file_operations fuse_dir_operations = { 1883static const struct file_operations fuse_dir_operations = {
1881 .llseek = generic_file_llseek, 1884 .llseek = generic_file_llseek,
1882 .read = generic_read_dir, 1885 .read = generic_read_dir,
1883 .readdir = fuse_readdir, 1886 .iterate = fuse_readdir,
1884 .open = fuse_dir_open, 1887 .open = fuse_dir_open,
1885 .release = fuse_dir_release, 1888 .release = fuse_dir_release,
1886 .fsync = fuse_dir_fsync, 1889 .fsync = fuse_dir_fsync,
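fuse's readdir is converted from the filldir callback style to the ->iterate interface: the position lives in ctx->pos and dir_emit() returns false once the destination buffer is full. A self-contained iterator for a hypothetical filesystem with one fixed entry shows the contract (the name, inode number 100 and DT_REG are made up):

        #include <linux/fs.h>

        static int demo_iterate(struct file *file, struct dir_context *ctx)
        {
                if (!dir_emit_dots(file, ctx))  /* "." and ".." at pos 0, 1 */
                        return 0;
                if (ctx->pos == 2) {
                        if (!dir_emit(ctx, "hello", 5, 100, DT_REG))
                                return 0;       /* buffer full, resume later */
                        ctx->pos++;
                }
                return 0;                       /* pos > 2: end of directory */
        }

Returning 0 with ctx->pos advanced is what lets getdents() restart exactly where the previous batch stopped, which is why parse_dirfile() above updates ctx->pos after each successful dir_emit().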
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d1c9b85b3f58..5c121fe19c5f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -16,6 +16,7 @@
16#include <linux/compat.h> 16#include <linux/compat.h>
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/aio.h> 18#include <linux/aio.h>
19#include <linux/falloc.h>
19 20
20static const struct file_operations fuse_direct_io_file_operations; 21static const struct file_operations fuse_direct_io_file_operations;
21 22
@@ -547,8 +548,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
547 res = io->bytes < 0 ? io->size : io->bytes; 548 res = io->bytes < 0 ? io->size : io->bytes;
548 549
549 if (!is_sync_kiocb(io->iocb)) { 550 if (!is_sync_kiocb(io->iocb)) {
550 struct path *path = &io->iocb->ki_filp->f_path; 551 struct inode *inode = file_inode(io->iocb->ki_filp);
551 struct inode *inode = path->dentry->d_inode;
552 struct fuse_conn *fc = get_fuse_conn(inode); 552 struct fuse_conn *fc = get_fuse_conn(inode);
553 struct fuse_inode *fi = get_fuse_inode(inode); 553 struct fuse_inode *fi = get_fuse_inode(inode);
554 554
@@ -1278,7 +1278,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1278 1278
1279 iov_iter_init(&ii, iov, nr_segs, count, 0); 1279 iov_iter_init(&ii, iov, nr_segs, count, 0);
1280 1280
1281 req = fuse_get_req(fc, fuse_iter_npages(&ii)); 1281 if (io->async)
1282 req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii));
1283 else
1284 req = fuse_get_req(fc, fuse_iter_npages(&ii));
1282 if (IS_ERR(req)) 1285 if (IS_ERR(req))
1283 return PTR_ERR(req); 1286 return PTR_ERR(req);
1284 1287
@@ -1314,7 +1317,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1314 break; 1317 break;
1315 if (count) { 1318 if (count) {
1316 fuse_put_request(fc, req); 1319 fuse_put_request(fc, req);
1317 req = fuse_get_req(fc, fuse_iter_npages(&ii)); 1320 if (io->async)
1321 req = fuse_get_req_for_background(fc,
1322 fuse_iter_npages(&ii));
1323 else
1324 req = fuse_get_req(fc, fuse_iter_npages(&ii));
1318 if (IS_ERR(req)) 1325 if (IS_ERR(req))
1319 break; 1326 break;
1320 } 1327 }
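fuse_direct_io() now picks the request pool twice with the same conditional; as a hypothetical helper (not part of the patch) the choice reads:

        static struct fuse_req *io_get_req(struct fuse_io_priv *io,
                                           struct fuse_conn *fc,
                                           unsigned int npages)
        {
                return io->async ? fuse_get_req_for_background(fc, npages)
                                 : fuse_get_req(fc, npages);
        }

Background requests are accounted against the connection's background queue, so async direct I/O stops competing with synchronous callers for foreground requests.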
@@ -2365,6 +2372,11 @@ static void fuse_do_truncate(struct file *file)
2365 fuse_do_setattr(inode, &attr, file); 2372 fuse_do_setattr(inode, &attr, file);
2366} 2373}
2367 2374
2375static inline loff_t fuse_round_up(loff_t off)
2376{
2377 return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
2378}
2379
2368static ssize_t 2380static ssize_t
2369fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 2381fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2370 loff_t offset, unsigned long nr_segs) 2382 loff_t offset, unsigned long nr_segs)
@@ -2372,6 +2384,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2372 ssize_t ret = 0; 2384 ssize_t ret = 0;
2373 struct file *file = iocb->ki_filp; 2385 struct file *file = iocb->ki_filp;
2374 struct fuse_file *ff = file->private_data; 2386 struct fuse_file *ff = file->private_data;
2387 bool async_dio = ff->fc->async_dio;
2375 loff_t pos = 0; 2388 loff_t pos = 0;
2376 struct inode *inode; 2389 struct inode *inode;
2377 loff_t i_size; 2390 loff_t i_size;
@@ -2383,10 +2396,10 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2383 i_size = i_size_read(inode); 2396 i_size = i_size_read(inode);
2384 2397
2385 /* optimization for short read */ 2398 /* optimization for short read */
2386 if (rw != WRITE && offset + count > i_size) { 2399 if (async_dio && rw != WRITE && offset + count > i_size) {
2387 if (offset >= i_size) 2400 if (offset >= i_size)
2388 return 0; 2401 return 0;
2389 count = i_size - offset; 2402 count = min_t(loff_t, count, fuse_round_up(i_size - offset));
2390 } 2403 }
2391 2404
2392 io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); 2405 io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
@@ -2404,7 +2417,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2404 * By default, we want to optimize all I/Os with async request 2417 * By default, we want to optimize all I/Os with async request
2405 * submission to the client filesystem if supported. 2418 * submission to the client filesystem if supported.
2406 */ 2419 */
2407 io->async = ff->fc->async_dio; 2420 io->async = async_dio;
2408 io->iocb = iocb; 2421 io->iocb = iocb;
2409 2422
2410 /* 2423 /*
@@ -2412,7 +2425,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2412 * to wait on real async I/O requests, so we must submit this request 2425 * to wait on real async I/O requests, so we must submit this request
2413 * synchronously. 2426 * synchronously.
2414 */ 2427 */
2415 if (!is_sync_kiocb(iocb) && (offset + count > i_size)) 2428 if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
2416 io->async = false; 2429 io->async = false;
2417 2430
2418 if (rw == WRITE) 2431 if (rw == WRITE)
@@ -2424,7 +2437,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2424 fuse_aio_complete(io, ret < 0 ? ret : 0, -1); 2437 fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
2425 2438
2426 /* we have a non-extending, async request, so return */ 2439 /* we have a non-extending, async request, so return */
2427 if (ret > 0 && !is_sync_kiocb(iocb)) 2440 if (!is_sync_kiocb(iocb))
2428 return -EIOCBQUEUED; 2441 return -EIOCBQUEUED;
2429 2442
2430 ret = wait_on_sync_kiocb(iocb); 2443 ret = wait_on_sync_kiocb(iocb);
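The short-read clamp above is easier to see with numbers. Assuming 4 KiB pages and FUSE_MAX_PAGES_PER_REQ of 32 (assumptions about the build, not taken from this hunk), fuse_round_up() works in 128 KiB units:

        /*
         * unit          = 32 << PAGE_SHIFT         = 131072 (128 KiB)
         * i_size-offset = 130000
         * fuse_round_up = round_up(130000, 131072) = 131072
         *
         * A 1 MiB read near EOF is clamped to one full-sized 131072-byte
         * request rather than an odd 130000-byte one; the bytes past EOF
         * simply come back short. With async_dio off, the clamp is skipped
         * entirely and the read path handles EOF itself.
         */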
@@ -2446,6 +2459,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2446 loff_t length) 2459 loff_t length)
2447{ 2460{
2448 struct fuse_file *ff = file->private_data; 2461 struct fuse_file *ff = file->private_data;
2462 struct inode *inode = file->f_inode;
2449 struct fuse_conn *fc = ff->fc; 2463 struct fuse_conn *fc = ff->fc;
2450 struct fuse_req *req; 2464 struct fuse_req *req;
2451 struct fuse_fallocate_in inarg = { 2465 struct fuse_fallocate_in inarg = {
@@ -2455,13 +2469,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2455 .mode = mode 2469 .mode = mode
2456 }; 2470 };
2457 int err; 2471 int err;
2472 bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
2473 (mode & FALLOC_FL_PUNCH_HOLE);
2458 2474
2459 if (fc->no_fallocate) 2475 if (fc->no_fallocate)
2460 return -EOPNOTSUPP; 2476 return -EOPNOTSUPP;
2461 2477
2478 if (lock_inode) {
2479 mutex_lock(&inode->i_mutex);
2480 if (mode & FALLOC_FL_PUNCH_HOLE)
2481 fuse_set_nowrite(inode);
2482 }
2483
2462 req = fuse_get_req_nopages(fc); 2484 req = fuse_get_req_nopages(fc);
2463 if (IS_ERR(req)) 2485 if (IS_ERR(req)) {
2464 return PTR_ERR(req); 2486 err = PTR_ERR(req);
2487 goto out;
2488 }
2465 2489
2466 req->in.h.opcode = FUSE_FALLOCATE; 2490 req->in.h.opcode = FUSE_FALLOCATE;
2467 req->in.h.nodeid = ff->nodeid; 2491 req->in.h.nodeid = ff->nodeid;
@@ -2476,6 +2500,25 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2476 } 2500 }
2477 fuse_put_request(fc, req); 2501 fuse_put_request(fc, req);
2478 2502
2503 if (err)
2504 goto out;
2505
2506 /* we could have extended the file */
2507 if (!(mode & FALLOC_FL_KEEP_SIZE))
2508 fuse_write_update_size(inode, offset + length);
2509
2510 if (mode & FALLOC_FL_PUNCH_HOLE)
2511 truncate_pagecache_range(inode, offset, offset + length - 1);
2512
2513 fuse_invalidate_attr(inode);
2514
2515out:
2516 if (lock_inode) {
2517 if (mode & FALLOC_FL_PUNCH_HOLE)
2518 fuse_release_nowrite(inode);
2519 mutex_unlock(&inode->i_mutex);
2520 }
2521
2479 return err; 2522 return err;
2480} 2523}
2481 2524
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6201f81e4d3a..9a0cdde14a08 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -867,10 +867,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
867 fc->dont_mask = 1; 867 fc->dont_mask = 1;
868 if (arg->flags & FUSE_AUTO_INVAL_DATA) 868 if (arg->flags & FUSE_AUTO_INVAL_DATA)
869 fc->auto_inval_data = 1; 869 fc->auto_inval_data = 1;
870 if (arg->flags & FUSE_DO_READDIRPLUS) 870 if (arg->flags & FUSE_DO_READDIRPLUS) {
871 fc->do_readdirplus = 1; 871 fc->do_readdirplus = 1;
872 if (arg->flags & FUSE_READDIRPLUS_AUTO) 872 if (arg->flags & FUSE_READDIRPLUS_AUTO)
873 fc->readdirplus_auto = 1; 873 fc->readdirplus_auto = 1;
874 }
874 if (arg->flags & FUSE_ASYNC_DIO) 875 if (arg->flags & FUSE_ASYNC_DIO)
875 fc->async_dio = 1; 876 fc->async_dio = 1;
876 } else { 877 } else {
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index eb08c9e43c2a..90c6a8faaecb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -20,13 +20,12 @@ config GFS2_FS
20 be found here: http://sources.redhat.com/cluster 20 be found here: http://sources.redhat.com/cluster
21 21
22 The "nolock" lock module is now built in to GFS2 by default. If 22 The "nolock" lock module is now built in to GFS2 by default. If
23 you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 23 you want to use the DLM, be sure to enable IPv4/6 networking.
24 networking.
25 24
26config GFS2_FS_LOCKING_DLM 25config GFS2_FS_LOCKING_DLM
27 bool "GFS2 DLM locking" 26 bool "GFS2 DLM locking"
28 depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ 27 depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
29 HOTPLUG && DLM && CONFIGFS_FS && SYSFS 28 CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
30 help 29 help
31 Multiple node locking module for GFS2 30 Multiple node locking module for GFS2
32 31
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69ed6336..ee48ad37d9c0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page,
110 /* Is the page fully outside i_size? (truncate in progress) */ 110 /* Is the page fully outside i_size? (truncate in progress) */
111 offset = i_size & (PAGE_CACHE_SIZE-1); 111 offset = i_size & (PAGE_CACHE_SIZE-1);
112 if (page->index > end_index || (page->index == end_index && !offset)) { 112 if (page->index > end_index || (page->index == end_index && !offset)) {
113 page->mapping->a_ops->invalidatepage(page, 0); 113 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
114 goto out; 114 goto out;
115 } 115 }
116 return 1; 116 return 1;
@@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
299 299
300 /* Is the page fully outside i_size? (truncate in progress) */ 300 /* Is the page fully outside i_size? (truncate in progress) */
301 if (page->index > end_index || (page->index == end_index && !offset)) { 301 if (page->index > end_index || (page->index == end_index && !offset)) {
302 page->mapping->a_ops->invalidatepage(page, 0); 302 page->mapping->a_ops->invalidatepage(page, 0,
303 PAGE_CACHE_SIZE);
303 unlock_page(page); 304 unlock_page(page);
304 continue; 305 continue;
305 } 306 }
@@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
943 unlock_buffer(bh); 944 unlock_buffer(bh);
944} 945}
945 946
946static void gfs2_invalidatepage(struct page *page, unsigned long offset) 947static void gfs2_invalidatepage(struct page *page, unsigned int offset,
948 unsigned int length)
947{ 949{
948 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); 950 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
951 unsigned int stop = offset + length;
952 int partial_page = (offset || length < PAGE_CACHE_SIZE);
949 struct buffer_head *bh, *head; 953 struct buffer_head *bh, *head;
950 unsigned long pos = 0; 954 unsigned long pos = 0;
951 955
952 BUG_ON(!PageLocked(page)); 956 BUG_ON(!PageLocked(page));
953 if (offset == 0) 957 if (!partial_page)
954 ClearPageChecked(page); 958 ClearPageChecked(page);
955 if (!page_has_buffers(page)) 959 if (!page_has_buffers(page))
956 goto out; 960 goto out;
957 961
958 bh = head = page_buffers(page); 962 bh = head = page_buffers(page);
959 do { 963 do {
964 if (pos + bh->b_size > stop)
965 return;
966
960 if (offset <= pos) 967 if (offset <= pos)
961 gfs2_discard(sdp, bh); 968 gfs2_discard(sdp, bh);
962 pos += bh->b_size; 969 pos += bh->b_size;
963 bh = bh->b_this_page; 970 bh = bh->b_this_page;
964 } while (bh != head); 971 } while (bh != head);
965out: 972out:
966 if (offset == 0) 973 if (!partial_page)
967 try_to_release_page(page, 0); 974 try_to_release_page(page, 0);
968} 975}
969 976
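A worked example for the new ranged invalidate, assuming a 4 KiB page carrying four 1 KiB buffers:

        /*
         * gfs2_invalidatepage(page, 1024, 2048)  =>  stop = 3072
         *
         *   bh at    0: 0+1024 <= stop, but offset(1024) > 0  -> kept
         *   bh at 1024: within range and offset <= 1024       -> discarded
         *   bh at 2048: within range and offset <= 2048       -> discarded
         *   bh at 3072: 3072+1024 > stop                      -> walk stops
         *
         * partial_page is true (offset != 0), so ClearPageChecked() and
         * try_to_release_page() are both skipped; the page itself stays.
         */

Only a full-page call (offset 0, length PAGE_CACHE_SIZE) still performs the page-level teardown, preserving the old offset == 0 behaviour.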
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1dc9a13ce6bb..5e2f56fccf6b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size)
1232 unstuff = 1; 1232 unstuff = 1;
1233 } 1233 }
1234 1234
1235 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); 1235 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1236 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1237 0 : RES_QUOTA), 0);
1236 if (error) 1238 if (error)
1237 goto do_grow_release; 1239 goto do_grow_release;
1238 1240
@@ -1286,17 +1288,26 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
1286 if (ret) 1288 if (ret)
1287 return ret; 1289 return ret;
1288 1290
1291 ret = get_write_access(inode);
1292 if (ret)
1293 return ret;
1294
1289 inode_dio_wait(inode); 1295 inode_dio_wait(inode);
1290 1296
1291 ret = gfs2_rs_alloc(GFS2_I(inode)); 1297 ret = gfs2_rs_alloc(GFS2_I(inode));
1292 if (ret) 1298 if (ret)
1293 return ret; 1299 goto out;
1294 1300
1295 oldsize = inode->i_size; 1301 oldsize = inode->i_size;
1296 if (newsize >= oldsize) 1302 if (newsize >= oldsize) {
1297 return do_grow(inode, newsize); 1303 ret = do_grow(inode, newsize);
1304 goto out;
1305 }
1298 1306
1299 return do_shrink(inode, oldsize, newsize); 1307 ret = do_shrink(inode, oldsize, newsize);
1308out:
1309 put_write_access(inode);
1310 return ret;
1300} 1311}
1301 1312
1302int gfs2_truncatei_resume(struct gfs2_inode *ip) 1313int gfs2_truncatei_resume(struct gfs2_inode *ip)
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4fddb3c22d25..f2448ab2aac5 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -109,8 +109,7 @@ fail:
109 return 0; 109 return 0;
110} 110}
111 111
112static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, 112static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
113 struct qstr *str)
114{ 113{
115 str->hash = gfs2_disk_hash(str->name, str->len); 114 str->hash = gfs2_disk_hash(str->name, str->len);
116 return 0; 115 return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c3e82bd23179..0cb4c1557f20 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -354,22 +354,31 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
354 return ERR_PTR(-EIO); 354 return ERR_PTR(-EIO);
355 } 355 }
356 356
357 hc = kmalloc(hsize, GFP_NOFS); 357 hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN);
358 ret = -ENOMEM; 358 if (hc == NULL)
359 hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL);
360
359 if (hc == NULL) 361 if (hc == NULL)
360 return ERR_PTR(-ENOMEM); 362 return ERR_PTR(-ENOMEM);
361 363
362 ret = gfs2_dir_read_data(ip, hc, hsize); 364 ret = gfs2_dir_read_data(ip, hc, hsize);
363 if (ret < 0) { 365 if (ret < 0) {
364 kfree(hc); 366 if (is_vmalloc_addr(hc))
367 vfree(hc);
368 else
369 kfree(hc);
365 return ERR_PTR(ret); 370 return ERR_PTR(ret);
366 } 371 }
367 372
368 spin_lock(&inode->i_lock); 373 spin_lock(&inode->i_lock);
369 if (ip->i_hash_cache) 374 if (ip->i_hash_cache) {
370 kfree(hc); 375 if (is_vmalloc_addr(hc))
371 else 376 vfree(hc);
377 else
378 kfree(hc);
379 } else {
372 ip->i_hash_cache = hc; 380 ip->i_hash_cache = hc;
381 }
373 spin_unlock(&inode->i_lock); 382 spin_unlock(&inode->i_lock);
374 383
375 return ip->i_hash_cache; 384 return ip->i_hash_cache;
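This allocate-with-fallback and its matching free recur through the file (the hash table here, hc2 in dir_double_exhash(), ht in leaf_dealloc()). As hypothetical helpers the pattern is:

        static void *dir_alloc(size_t size)
        {
                void *p = kmalloc(size, GFP_NOFS | __GFP_NOWARN);

                if (p == NULL)          /* big hash table: fall back */
                        p = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
                return p;
        }

        static void dir_free(void *p)
        {
                if (is_vmalloc_addr(p))
                        vfree(p);
                else
                        kfree(p);
        }

__GFP_NOWARN matters here: the kmalloc attempt is expected to fail for large, fragmented allocations, and the allocation-failure warning would be noise given the vmalloc fallback.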
@@ -385,7 +394,10 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip)
385{ 394{
386 __be64 *hc = ip->i_hash_cache; 395 __be64 *hc = ip->i_hash_cache;
387 ip->i_hash_cache = NULL; 396 ip->i_hash_cache = NULL;
388 kfree(hc); 397 if (is_vmalloc_addr(hc))
398 vfree(hc);
399 else
400 kfree(hc);
389} 401}
390 402
391static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) 403static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -1113,10 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1113 if (IS_ERR(hc)) 1125 if (IS_ERR(hc))
1114 return PTR_ERR(hc); 1126 return PTR_ERR(hc);
1115 1127
1116 h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); 1128 hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
1129 if (hc2 == NULL)
1130 hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
1131
1117 if (!hc2) 1132 if (!hc2)
1118 return -ENOMEM; 1133 return -ENOMEM;
1119 1134
1135 h = hc2;
1120 error = gfs2_meta_inode_buffer(dip, &dibh); 1136 error = gfs2_meta_inode_buffer(dip, &dibh);
1121 if (error) 1137 if (error)
1122 goto out_kfree; 1138 goto out_kfree;
@@ -1145,7 +1161,10 @@ fail:
1145 gfs2_dinode_out(dip, dibh->b_data); 1161 gfs2_dinode_out(dip, dibh->b_data);
1146 brelse(dibh); 1162 brelse(dibh);
1147out_kfree: 1163out_kfree:
1148 kfree(hc2); 1164 if (is_vmalloc_addr(hc2))
1165 vfree(hc2);
1166 else
1167 kfree(hc2);
1149 return error; 1168 return error;
1150} 1169}
1151 1170
@@ -1194,9 +1213,7 @@ static int compare_dents(const void *a, const void *b)
1194/** 1213/**
1195 * do_filldir_main - read out directory entries 1214 * do_filldir_main - read out directory entries
1196 * @dip: The GFS2 inode 1215 * @dip: The GFS2 inode
1197 * @offset: The offset in the file to read from 1216 * @ctx: what to feed the entries to
1198 * @opaque: opaque data to pass to filldir
1199 * @filldir: The function to pass entries to
1200 * @darr: an array of struct gfs2_dirent pointers to read 1217 * @darr: an array of struct gfs2_dirent pointers to read
1201 * @entries: the number of entries in darr 1218 * @entries: the number of entries in darr
1202 * @copied: pointer to int that's non-zero if an entry has been copied out 1219 * @copied: pointer to int that's non-zero if an entry has been copied out
@@ -1206,11 +1223,10 @@ static int compare_dents(const void *a, const void *b)
1206 * the possibility that they will fall into different readdir buffers or 1223 * the possibility that they will fall into different readdir buffers or
1207 * that someone will want to seek to that location. 1224 * that someone will want to seek to that location.
1208 * 1225 *
1209 * Returns: errno, >0 on exception from filldir 1226 * Returns: errno, >0 if the actor tells you to stop
1210 */ 1227 */
1211 1228
1212static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, 1229static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
1213 void *opaque, filldir_t filldir,
1214 const struct gfs2_dirent **darr, u32 entries, 1230 const struct gfs2_dirent **darr, u32 entries,
1215 int *copied) 1231 int *copied)
1216{ 1232{
@@ -1218,7 +1234,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1218 u64 off, off_next; 1234 u64 off, off_next;
1219 unsigned int x, y; 1235 unsigned int x, y;
1220 int run = 0; 1236 int run = 0;
1221 int error = 0;
1222 1237
1223 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); 1238 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1224 1239
@@ -1235,9 +1250,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1235 off_next = be32_to_cpu(dent_next->de_hash); 1250 off_next = be32_to_cpu(dent_next->de_hash);
1236 off_next = gfs2_disk_hash2offset(off_next); 1251 off_next = gfs2_disk_hash2offset(off_next);
1237 1252
1238 if (off < *offset) 1253 if (off < ctx->pos)
1239 continue; 1254 continue;
1240 *offset = off; 1255 ctx->pos = off;
1241 1256
1242 if (off_next == off) { 1257 if (off_next == off) {
1243 if (*copied && !run) 1258 if (*copied && !run)
@@ -1246,26 +1261,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1246 } else 1261 } else
1247 run = 0; 1262 run = 0;
1248 } else { 1263 } else {
1249 if (off < *offset) 1264 if (off < ctx->pos)
1250 continue; 1265 continue;
1251 *offset = off; 1266 ctx->pos = off;
1252 } 1267 }
1253 1268
1254 error = filldir(opaque, (const char *)(dent + 1), 1269 if (!dir_emit(ctx, (const char *)(dent + 1),
1255 be16_to_cpu(dent->de_name_len), 1270 be16_to_cpu(dent->de_name_len),
1256 off, be64_to_cpu(dent->de_inum.no_addr), 1271 be64_to_cpu(dent->de_inum.no_addr),
1257 be16_to_cpu(dent->de_type)); 1272 be16_to_cpu(dent->de_type)))
1258 if (error)
1259 return 1; 1273 return 1;
1260 1274
1261 *copied = 1; 1275 *copied = 1;
1262 } 1276 }
1263 1277
1264 /* Increment the *offset by one, so the next time we come into the 1278 /* Increment the ctx->pos by one, so the next time we come into the
1265 do_filldir fxn, we get the next entry instead of the last one in the 1279 do_filldir fxn, we get the next entry instead of the last one in the
1266 current leaf */ 1280 current leaf */
1267 1281
1268 (*offset)++; 1282 ctx->pos++;
1269 1283
1270 return 0; 1284 return 0;
1271} 1285}
@@ -1289,8 +1303,8 @@ static void gfs2_free_sort_buffer(void *ptr)
1289 kfree(ptr); 1303 kfree(ptr);
1290} 1304}
1291 1305
1292static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, 1306static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
1293 filldir_t filldir, int *copied, unsigned *depth, 1307 int *copied, unsigned *depth,
1294 u64 leaf_no) 1308 u64 leaf_no)
1295{ 1309{
1296 struct gfs2_inode *ip = GFS2_I(inode); 1310 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1368,8 +1382,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1368 } while(lfn); 1382 } while(lfn);
1369 1383
1370 BUG_ON(entries2 != entries); 1384 BUG_ON(entries2 != entries);
1371 error = do_filldir_main(ip, offset, opaque, filldir, darr, 1385 error = do_filldir_main(ip, ctx, darr, entries, copied);
1372 entries, copied);
1373out_free: 1386out_free:
1374 for(i = 0; i < leaf; i++) 1387 for(i = 0; i < leaf; i++)
1375 brelse(larr[i]); 1388 brelse(larr[i]);
@@ -1428,15 +1441,13 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
1428/** 1441/**
1429 * dir_e_read - Reads the entries from a directory into a filldir buffer 1442 * dir_e_read - Reads the entries from a directory into a filldir buffer
1430 * @dip: dinode pointer 1443 * @dip: dinode pointer
1431 * @offset: the hash of the last entry read shifted to the right once 1444 * @ctx: actor to feed the entries to
1432 * @opaque: buffer for the filldir function to fill
1433 * @filldir: points to the filldir function to use
1434 * 1445 *
1435 * Returns: errno 1446 * Returns: errno
1436 */ 1447 */
1437 1448
1438static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, 1449static int dir_e_read(struct inode *inode, struct dir_context *ctx,
1439 filldir_t filldir, struct file_ra_state *f_ra) 1450 struct file_ra_state *f_ra)
1440{ 1451{
1441 struct gfs2_inode *dip = GFS2_I(inode); 1452 struct gfs2_inode *dip = GFS2_I(inode);
1442 u32 hsize, len = 0; 1453 u32 hsize, len = 0;
@@ -1447,7 +1458,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1447 unsigned depth = 0; 1458 unsigned depth = 0;
1448 1459
1449 hsize = 1 << dip->i_depth; 1460 hsize = 1 << dip->i_depth;
1450 hash = gfs2_dir_offset2hash(*offset); 1461 hash = gfs2_dir_offset2hash(ctx->pos);
1451 index = hash >> (32 - dip->i_depth); 1462 index = hash >> (32 - dip->i_depth);
1452 1463
1453 if (dip->i_hash_cache == NULL) 1464 if (dip->i_hash_cache == NULL)
@@ -1459,7 +1470,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1459 gfs2_dir_readahead(inode, hsize, index, f_ra); 1470 gfs2_dir_readahead(inode, hsize, index, f_ra);
1460 1471
1461 while (index < hsize) { 1472 while (index < hsize) {
1462 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, 1473 error = gfs2_dir_read_leaf(inode, ctx,
1463 &copied, &depth, 1474 &copied, &depth,
1464 be64_to_cpu(lp[index])); 1475 be64_to_cpu(lp[index]));
1465 if (error) 1476 if (error)
@@ -1474,8 +1485,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1474 return error; 1485 return error;
1475} 1486}
1476 1487
1477int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 1488int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
1478 filldir_t filldir, struct file_ra_state *f_ra) 1489 struct file_ra_state *f_ra)
1479{ 1490{
1480 struct gfs2_inode *dip = GFS2_I(inode); 1491 struct gfs2_inode *dip = GFS2_I(inode);
1481 struct gfs2_sbd *sdp = GFS2_SB(inode); 1492 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1489,7 +1500,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1489 return 0; 1500 return 0;
1490 1501
1491 if (dip->i_diskflags & GFS2_DIF_EXHASH) 1502 if (dip->i_diskflags & GFS2_DIF_EXHASH)
1492 return dir_e_read(inode, offset, opaque, filldir, f_ra); 1503 return dir_e_read(inode, ctx, f_ra);
1493 1504
1494 if (!gfs2_is_stuffed(dip)) { 1505 if (!gfs2_is_stuffed(dip)) {
1495 gfs2_consist_inode(dip); 1506 gfs2_consist_inode(dip);
@@ -1521,7 +1532,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1521 error = -EIO; 1532 error = -EIO;
1522 goto out; 1533 goto out;
1523 } 1534 }
1524 error = do_filldir_main(dip, offset, opaque, filldir, darr, 1535 error = do_filldir_main(dip, ctx, darr,
1525 dip->i_entries, &copied); 1536 dip->i_entries, &copied);
1526out: 1537out:
1527 kfree(darr); 1538 kfree(darr);
@@ -1537,9 +1548,9 @@ out:
1537 1548
1538/** 1549/**
1539 * gfs2_dir_search - Search a directory 1550 * gfs2_dir_search - Search a directory
1540 * @dip: The GFS2 inode 1551 * @dip: The GFS2 dir inode
1541 * @filename: 1552 * @name: The name we are looking up
1542 * @inode: 1553 * @fail_on_exist: Fail if the name exists rather than looking it up
1543 * 1554 *
1544 * This routine searches a directory for a file or another directory. 1555 * This routine searches a directory for a file or another directory.
1545 * Assumes a glock is held on dip. 1556 * Assumes a glock is held on dip.
@@ -1547,22 +1558,25 @@ out:
1547 * Returns: errno 1558 * Returns: errno
1548 */ 1559 */
1549 1560
1550struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) 1561struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
1562 bool fail_on_exist)
1551{ 1563{
1552 struct buffer_head *bh; 1564 struct buffer_head *bh;
1553 struct gfs2_dirent *dent; 1565 struct gfs2_dirent *dent;
1554 struct inode *inode; 1566 u64 addr, formal_ino;
1567 u16 dtype;
1555 1568
1556 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); 1569 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1557 if (dent) { 1570 if (dent) {
1558 if (IS_ERR(dent)) 1571 if (IS_ERR(dent))
1559 return ERR_CAST(dent); 1572 return ERR_CAST(dent);
1560 inode = gfs2_inode_lookup(dir->i_sb, 1573 dtype = be16_to_cpu(dent->de_type);
1561 be16_to_cpu(dent->de_type), 1574 addr = be64_to_cpu(dent->de_inum.no_addr);
1562 be64_to_cpu(dent->de_inum.no_addr), 1575 formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
1563 be64_to_cpu(dent->de_inum.no_formal_ino), 0);
1564 brelse(bh); 1576 brelse(bh);
1565 return inode; 1577 if (fail_on_exist)
1578 return ERR_PTR(-EEXIST);
1579 return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
1566 } 1580 }
1567 return ERR_PTR(-ENOENT); 1581 return ERR_PTR(-ENOENT);
1568} 1582}
@@ -1846,6 +1860,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1846 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); 1860 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1847 1861
1848 ht = kzalloc(size, GFP_NOFS); 1862 ht = kzalloc(size, GFP_NOFS);
1863 if (ht == NULL)
1864 ht = vzalloc(size);
1849 if (!ht) 1865 if (!ht)
1850 return -ENOMEM; 1866 return -ENOMEM;
1851 1867
@@ -1933,7 +1949,10 @@ out_rlist:
1933 gfs2_rlist_free(&rlist); 1949 gfs2_rlist_free(&rlist);
1934 gfs2_quota_unhold(dip); 1950 gfs2_quota_unhold(dip);
1935out: 1951out:
1936 kfree(ht); 1952 if (is_vmalloc_addr(ht))
1953 vfree(ht);
1954 else
1955 kfree(ht);
1937 return error; 1956 return error;
1938} 1957}
1939 1958
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 98c960beab35..4f03bbd1873f 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -18,14 +18,15 @@ struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20extern struct inode *gfs2_dir_search(struct inode *dir, 20extern struct inode *gfs2_dir_search(struct inode *dir,
21 const struct qstr *filename); 21 const struct qstr *filename,
22 bool fail_on_exist);
22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 23extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23 const struct gfs2_inode *ip); 24 const struct gfs2_inode *ip);
24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 25extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25 const struct gfs2_inode *ip); 26 const struct gfs2_inode *ip);
26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); 27extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 28extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
28 filldir_t filldir, struct file_ra_state *f_ra); 29 struct file_ra_state *f_ra);
29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 30extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type); 31 const struct gfs2_inode *nip, unsigned int new_type);
31 32
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9973df4ff565..8b9b3775e2e7 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -64,6 +64,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
64} 64}
65 65
66struct get_name_filldir { 66struct get_name_filldir {
67 struct dir_context ctx;
67 struct gfs2_inum_host inum; 68 struct gfs2_inum_host inum;
68 char *name; 69 char *name;
69}; 70};
@@ -88,9 +89,11 @@ static int gfs2_get_name(struct dentry *parent, char *name,
88 struct inode *dir = parent->d_inode; 89 struct inode *dir = parent->d_inode;
89 struct inode *inode = child->d_inode; 90 struct inode *inode = child->d_inode;
90 struct gfs2_inode *dip, *ip; 91 struct gfs2_inode *dip, *ip;
91 struct get_name_filldir gnfd; 92 struct get_name_filldir gnfd = {
93 .ctx.actor = get_name_filldir,
94 .name = name
95 };
92 struct gfs2_holder gh; 96 struct gfs2_holder gh;
93 u64 offset = 0;
94 int error; 97 int error;
95 struct file_ra_state f_ra = { .start = 0 }; 98 struct file_ra_state f_ra = { .start = 0 };
96 99
@@ -106,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name,
106 *name = 0; 109 *name = 0;
107 gnfd.inum.no_addr = ip->i_no_addr; 110 gnfd.inum.no_addr = ip->i_no_addr;
108 gnfd.inum.no_formal_ino = ip->i_no_formal_ino; 111 gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
109 gnfd.name = name;
110 112
111 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); 113 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
112 if (error) 114 if (error)
113 return error; 115 return error;
114 116
115 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra); 117 error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra);
116 118
117 gfs2_glock_dq_uninit(&gh); 119 gfs2_glock_dq_uninit(&gh);
118 120
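The conversion trick here is that struct dir_context is embedded as the first member of the private search structure, so the actor can recover its state from the context pointer. A sketch using the filldir-style actor signature seen in this file (all names illustrative):

        struct name_search {
                struct dir_context ctx;         /* must be embedded */
                char *name;
        };

        static int name_actor(void *opaque, const char *name, int len,
                              loff_t pos, u64 ino, unsigned int dtype)
        {
                struct name_search *ns =
                        container_of(opaque, struct name_search, ctx);

                /* ... compare ino, copy into ns->name on a match ... */
                return 0;                       /* 0 = keep iterating */
        }

Designated initialization (.ctx.actor = ..., .name = ...) as in gnfd above then replaces the separate assignments, and &gnfd.ctx is what gets handed to gfs2_dir_read().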
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index acd16764b133..72c3866a7320 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -82,35 +82,28 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
82} 82}
83 83
84/** 84/**
85 * gfs2_readdir - Read directory entries from a directory 85 * gfs2_readdir - Iterator for a directory
86 * @file: The directory to read from 86 * @file: The directory to read from
87 * @dirent: Buffer for dirents 87 * @ctx: What to feed directory entries to
88 * @filldir: Function used to do the copying
89 * 88 *
90 * Returns: errno 89 * Returns: errno
91 */ 90 */
92 91
93static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) 92static int gfs2_readdir(struct file *file, struct dir_context *ctx)
94{ 93{
95 struct inode *dir = file->f_mapping->host; 94 struct inode *dir = file->f_mapping->host;
96 struct gfs2_inode *dip = GFS2_I(dir); 95 struct gfs2_inode *dip = GFS2_I(dir);
97 struct gfs2_holder d_gh; 96 struct gfs2_holder d_gh;
98 u64 offset = file->f_pos;
99 int error; 97 int error;
100 98
101 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); 99 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
102 error = gfs2_glock_nq(&d_gh); 100 if (error)
103 if (error) {
104 gfs2_holder_uninit(&d_gh);
105 return error; 101 return error;
106 }
107 102
108 error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); 103 error = gfs2_dir_read(dir, ctx, &file->f_ra);
109 104
110 gfs2_glock_dq_uninit(&d_gh); 105 gfs2_glock_dq_uninit(&d_gh);
111 106
112 file->f_pos = offset;
113
114 return error; 107 return error;
115} 108}
116 109
@@ -402,16 +395,20 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
402 /* Update file times before taking page lock */ 395 /* Update file times before taking page lock */
403 file_update_time(vma->vm_file); 396 file_update_time(vma->vm_file);
404 397
398 ret = get_write_access(inode);
399 if (ret)
400 goto out;
401
405 ret = gfs2_rs_alloc(ip); 402 ret = gfs2_rs_alloc(ip);
406 if (ret) 403 if (ret)
407 return ret; 404 goto out_write_access;
408 405
409 gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); 406 gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
410 407
411 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 408 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
412 ret = gfs2_glock_nq(&gh); 409 ret = gfs2_glock_nq(&gh);
413 if (ret) 410 if (ret)
414 goto out; 411 goto out_uninit;
415 412
416 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); 413 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
417 set_bit(GIF_SW_PAGED, &ip->i_flags); 414 set_bit(GIF_SW_PAGED, &ip->i_flags);
@@ -480,12 +477,15 @@ out_quota_unlock:
480 gfs2_quota_unlock(ip); 477 gfs2_quota_unlock(ip);
481out_unlock: 478out_unlock:
482 gfs2_glock_dq(&gh); 479 gfs2_glock_dq(&gh);
483out: 480out_uninit:
484 gfs2_holder_uninit(&gh); 481 gfs2_holder_uninit(&gh);
485 if (ret == 0) { 482 if (ret == 0) {
486 set_page_dirty(page); 483 set_page_dirty(page);
487 wait_for_stable_page(page); 484 wait_for_stable_page(page);
488 } 485 }
486out_write_access:
487 put_write_access(inode);
488out:
489 sb_end_pagefault(inode->i_sb); 489 sb_end_pagefault(inode->i_sb);
490 return block_page_mkwrite_return(ret); 490 return block_page_mkwrite_return(ret);
491} 491}
@@ -531,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
531} 531}
532 532
533/** 533/**
534 * gfs2_open - open a file 534 * gfs2_open_common - This is common to open and atomic_open
535 * @inode: the inode to open 535 * @inode: The inode being opened
536 * @file: the struct file for this opening 536 * @file: The file being opened
537 * 537 *
538 * Returns: errno 538 * This may be called under a glock or not depending upon how it has
539 * been called. It must always be called under a glock for regular
540 * files, however. For other file types, it does not matter whether
541 * we hold the glock or not.
542 *
543 * Returns: Error code or 0 for success
539 */ 544 */
540 545
541static int gfs2_open(struct inode *inode, struct file *file) 546int gfs2_open_common(struct inode *inode, struct file *file)
542{ 547{
543 struct gfs2_inode *ip = GFS2_I(inode);
544 struct gfs2_holder i_gh;
545 struct gfs2_file *fp; 548 struct gfs2_file *fp;
546 int error; 549 int ret;
550
551 if (S_ISREG(inode->i_mode)) {
552 ret = generic_file_open(inode, file);
553 if (ret)
554 return ret;
555 }
547 556
548 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); 557 fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
549 if (!fp) 558 if (!fp)
550 return -ENOMEM; 559 return -ENOMEM;
551 560
@@ -553,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file)
553 562
554 gfs2_assert_warn(GFS2_SB(inode), !file->private_data); 563 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
555 file->private_data = fp; 564 file->private_data = fp;
565 return 0;
566}
567
568/**
569 * gfs2_open - open a file
570 * @inode: the inode to open
571 * @file: the struct file for this opening
572 *
573 * After atomic_open, this function is only used for opening files
574 * which are already cached. We must still get the glock for regular
575 * files to ensure that we have the file size uptodate for the large
576 * file check which is in the common code. That is only an issue for
577 * regular files though.
578 *
579 * Returns: errno
580 */
581
582static int gfs2_open(struct inode *inode, struct file *file)
583{
584 struct gfs2_inode *ip = GFS2_I(inode);
585 struct gfs2_holder i_gh;
586 int error;
587 bool need_unlock = false;
556 588
557 if (S_ISREG(ip->i_inode.i_mode)) { 589 if (S_ISREG(ip->i_inode.i_mode)) {
558 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 590 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
559 &i_gh); 591 &i_gh);
560 if (error) 592 if (error)
561 goto fail; 593 return error;
594 need_unlock = true;
595 }
562 596
563 if (!(file->f_flags & O_LARGEFILE) && 597 error = gfs2_open_common(inode, file);
564 i_size_read(inode) > MAX_NON_LFS) {
565 error = -EOVERFLOW;
566 goto fail_gunlock;
567 }
568 598
599 if (need_unlock)
569 gfs2_glock_dq_uninit(&i_gh); 600 gfs2_glock_dq_uninit(&i_gh);
570 }
571
572 return 0;
573 601
574fail_gunlock:
575 gfs2_glock_dq_uninit(&i_gh);
576fail:
577 file->private_data = NULL;
578 kfree(fp);
579 return error; 602 return error;
580} 603}
581 604
@@ -594,10 +617,10 @@ static int gfs2_release(struct inode *inode, struct file *file)
 	kfree(file->private_data);
 	file->private_data = NULL;
 
-	if ((file->f_mode & FMODE_WRITE) &&
-	    (atomic_read(&inode->i_writecount) == 1))
-		gfs2_rs_delete(ip);
+	if (!(file->f_mode & FMODE_WRITE))
+		return 0;
 
+	gfs2_rs_delete(ip);
 	return 0;
 }
 
@@ -889,7 +912,7 @@ out_uninit:
  * cluster; until we do, disable leases (by just returning -EINVAL),
  * unless the administrator has requested purely local locking.
  *
- * Locking: called under lock_flocks
+ * Locking: called under i_lock
  *
  * Returns: errno
  */
@@ -1041,7 +1064,7 @@ const struct file_operations gfs2_file_fops = {
 };
 
 const struct file_operations gfs2_dir_fops = {
-	.readdir	= gfs2_readdir,
+	.iterate	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
 	.release	= gfs2_release,
@@ -1071,7 +1094,7 @@ const struct file_operations gfs2_file_fops_nolock = {
 };
 
 const struct file_operations gfs2_dir_fops_nolock = {
-	.readdir	= gfs2_readdir,
+	.iterate	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
 	.release	= gfs2_release,
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c66e99c97571..5f2e5224c51c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 	struct gfs2_bufdata *bd, *tmp;
 	struct buffer_head *bh;
 	const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
-	sector_t blocknr;
 
 	gfs2_log_lock(sdp);
 	spin_lock(&sdp->sd_ail_lock);
@@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 				continue;
 			gfs2_ail_error(gl, bh);
 		}
-		blocknr = bh->b_blocknr;
-		bh->b_private = NULL;
-		gfs2_remove_from_ail(bd); /* drops ref on bh */
-
-		bd->bd_bh = NULL;
-		bd->bd_blkno = blocknr;
-
 		gfs2_trans_add_revoke(sdp, bd);
 	}
 	GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8833a4f264e3..bbb2715171cd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -189,6 +189,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 	return inode;
 
 fail_refresh:
+	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 	ip->i_iopen_gh.gh_gl->gl_object = NULL;
 	gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 fail_iopen:
@@ -312,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 		goto out;
 	}
 
-	inode = gfs2_dir_search(dir, name);
+	inode = gfs2_dir_search(dir, name, false);
 	if (IS_ERR(inode))
 		error = PTR_ERR(inode);
 out:
@@ -345,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 	if (!dip->i_inode.i_nlink)
 		return -ENOENT;
 
-	error = gfs2_dir_check(&dip->i_inode, name, NULL);
-	switch (error) {
-	case -ENOENT:
-		error = 0;
-		break;
-	case 0:
-		return -EEXIST;
-	default:
-		return error;
-	}
-
 	if (dip->i_entries == (u32)-1)
 		return -EFBIG;
 	if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
@@ -545,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
  * gfs2_create_inode - Create a new inode
  * @dir: The parent directory
  * @dentry: The new dentry
+ * @file: If non-NULL, the file which is being opened
  * @mode: The permissions on the new inode
  * @dev: For device nodes, this is the device number
  * @symname: For symlinks, this is the link destination
@@ -554,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
  */
 
 static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+			     struct file *file,
 			     umode_t mode, dev_t dev, const char *symname,
-			     unsigned int size, int excl)
+			     unsigned int size, int excl, int *opened)
 {
 	const struct qstr *name = &dentry->d_name;
 	struct gfs2_holder ghs[2];
@@ -563,6 +555,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	struct gfs2_inode *dip = GFS2_I(dir), *ip;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_glock *io_gl;
+	struct dentry *d;
 	int error;
 	u32 aflags = 0;
 	int arq;
@@ -583,15 +576,30 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		goto fail;
 
 	error = create_ok(dip, name, mode);
-	if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
-		inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-		gfs2_glock_dq_uninit(ghs);
-		d_instantiate(dentry, inode);
-		return IS_ERR(inode) ? PTR_ERR(inode) : 0;
-	}
 	if (error)
 		goto fail_gunlock;
 
+	inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
+	error = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		d = d_splice_alias(inode, dentry);
+		error = 0;
+		if (file && !IS_ERR(d)) {
+			if (d == NULL)
+				d = dentry;
+			if (S_ISREG(inode->i_mode))
+				error = finish_open(file, d, gfs2_open_common, opened);
+			else
+				error = finish_no_open(file, d);
+		}
+		gfs2_glock_dq_uninit(ghs);
+		if (IS_ERR(d))
+			return PTR_RET(d);
+		return error;
+	} else if (error != -ENOENT) {
+		goto fail_gunlock;
+	}
+
 	arq = error = gfs2_diradd_alloc_required(dir, name);
 	if (error < 0)
 		goto fail_gunlock;
@@ -685,10 +693,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		goto fail_gunlock3;
 
 	mark_inode_dirty(inode);
+	d_instantiate(dentry, inode);
+	if (file)
+		error = finish_open(file, dentry, gfs2_open_common, opened);
 	gfs2_glock_dq_uninit(ghs);
 	gfs2_glock_dq_uninit(ghs + 1);
-	d_instantiate(dentry, inode);
-	return 0;
+	return error;
 
 fail_gunlock3:
 	gfs2_glock_dq_uninit(ghs + 1);
@@ -728,36 +738,56 @@ fail:
 static int gfs2_create(struct inode *dir, struct dentry *dentry,
 		       umode_t mode, bool excl)
 {
-	return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
 }
 
 /**
- * gfs2_lookup - Look up a filename in a directory and return its inode
+ * __gfs2_lookup - Look up a filename in a directory and return its inode
  * @dir: The directory inode
  * @dentry: The dentry of the new inode
- * @nd: passed from Linux VFS, ignored by us
+ * @file: File to be opened
+ * @opened: atomic_open flags
  *
- * Called by the VFS layer. Lock dir and call gfs2_lookupi()
  *
  * Returns: errno
  */
 
-static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
-				  unsigned int flags)
+static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				    struct file *file, int *opened)
 {
-	struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-	if (inode && !IS_ERR(inode)) {
-		struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
-		struct gfs2_holder gh;
-		int error;
-		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
-		if (error) {
-			iput(inode);
-			return ERR_PTR(error);
-		}
-		gfs2_glock_dq_uninit(&gh);
+	struct inode *inode;
+	struct dentry *d;
+	struct gfs2_holder gh;
+	struct gfs2_glock *gl;
+	int error;
+
+	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+	if (!inode)
+		return NULL;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	gl = GFS2_I(inode)->i_gl;
+	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	if (error) {
+		iput(inode);
+		return ERR_PTR(error);
 	}
-	return d_splice_alias(inode, dentry);
+
+	d = d_splice_alias(inode, dentry);
+	if (file && S_ISREG(inode->i_mode))
+		error = finish_open(file, dentry, gfs2_open_common, opened);
+
+	gfs2_glock_dq_uninit(&gh);
+	if (error)
+		return ERR_PTR(error);
+	return d;
+}
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				  unsigned flags)
+{
+	return __gfs2_lookup(dir, dentry, NULL, NULL);
 }
 
 /**
@@ -1075,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
 		return -ENAMETOOLONG;
 
-	return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
 }
 
 /**
@@ -1091,7 +1121,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(dir);
 	unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
-	return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
 }
 
 /**
@@ -1106,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 		      dev_t dev)
 {
-	return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
+	return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
+}
+
+/**
+ * gfs2_atomic_open - Atomically open a file
+ * @dir: The directory
+ * @dentry: The proposed new entry
+ * @file: The proposed new struct file
+ * @flags: open flags
+ * @mode: File mode
+ * @opened: Flag to say whether the file has been opened or not
+ *
+ * Returns: error code or 0 for success
+ */
+
+static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
+			    struct file *file, unsigned flags,
+			    umode_t mode, int *opened)
+{
+	struct dentry *d;
+	bool excl = !!(flags & O_EXCL);
+
+	d = __gfs2_lookup(dir, dentry, file, opened);
+	if (IS_ERR(d))
+		return PTR_ERR(d);
+	if (d == NULL)
+		d = dentry;
+	if (d->d_inode) {
+		if (!(*opened & FILE_OPENED))
+			return finish_no_open(file, d);
+		return 0;
+	}
+
+	if (!(flags & O_CREAT))
+		return -ENOENT;
+
+	return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
 }
 
 /*
@@ -1786,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = {
 	.removexattr = gfs2_removexattr,
 	.fiemap = gfs2_fiemap,
 	.get_acl = gfs2_get_acl,
+	.atomic_open = gfs2_atomic_open,
 };
 
 const struct inode_operations gfs2_symlink_iops = {
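
For readers following the new ->atomic_open() path above, the decision logic reduces to the sketch below. This is an illustration only, not part of the patch; the example_* helpers are hypothetical stand-ins. finish_open() actually opens the file and sets FILE_OPENED in *opened, while finish_no_open() hands the dentry back so the VFS falls through to the ordinary ->open() route.

/* Illustrative sketch of the ->atomic_open() contract, mirroring
 * gfs2_atomic_open() above. example_lookup()/example_create() are
 * hypothetical stand-ins for filesystem-specific helpers. */
static int example_atomic_open(struct inode *dir, struct dentry *dentry,
			       struct file *file, unsigned flags,
			       umode_t mode, int *opened)
{
	struct dentry *d = example_lookup(dir, dentry, file, opened);

	if (IS_ERR(d))
		return PTR_ERR(d);	/* lookup failed outright */
	if (d == NULL)
		d = dentry;		/* lookup used the passed-in dentry */
	if (d->d_inode) {
		/* Positive dentry: if the lookup did not already call
		 * finish_open() (FILE_OPENED unset), hand the dentry back
		 * and let the VFS take the ordinary ->open() path. */
		if (!(*opened & FILE_OPENED))
			return finish_no_open(file, d);
		return 0;
	}
	if (!(flags & O_CREAT))
		return -ENOENT;		/* negative dentry and no create */
	/* Negative dentry plus O_CREAT: create and open in one step. */
	return example_create(dir, dentry, file, mode, flags & O_EXCL, opened);
}
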
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c53c7477f6da..ba4d9492d422 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask);
 extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern int gfs2_open_common(struct inode *inode, struct file *file);
 
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b404f4853034..610613fb65b5 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -211,15 +211,16 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
 {
 	struct gfs2_trans *tr, *s;
+	int oldest_tr = 1;
 	int ret;
 
 	spin_lock(&sdp->sd_ail_lock);
 	list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
 		gfs2_ail1_empty_one(sdp, tr);
-		if (list_empty(&tr->tr_ail1_list))
+		if (list_empty(&tr->tr_ail1_list) && oldest_tr)
 			list_move(&tr->tr_list, &sdp->sd_ail2_list);
 		else
-			break;
+			oldest_tr = 0;
 	}
 	ret = list_empty(&sdp->sd_ail1_list);
 	spin_unlock(&sdp->sd_ail_lock);
@@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
 
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 {
-	unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+	unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
 	unsigned wanted = blks + reserved_blks;
 	DEFINE_WAIT(wait);
 	int did_wait = 0;
@@ -545,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip)
 	spin_unlock(&sdp->sd_ordered_lock);
 }
 
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+	struct buffer_head *bh = bd->bd_bh;
+	struct gfs2_glock *gl = bd->bd_gl;
+
+	gfs2_remove_from_ail(bd);
+	bd->bd_bh = NULL;
+	bh->b_private = NULL;
+	bd->bd_blkno = bh->b_blocknr;
+	bd->bd_ops = &gfs2_revoke_lops;
+	sdp->sd_log_num_revoke++;
+	atomic_inc(&gl->gl_revokes);
+	set_bit(GLF_LFLUSH, &gl->gl_flags);
+	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
+}
+
+void gfs2_write_revokes(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr;
+	struct gfs2_bufdata *bd, *tmp;
+	int have_revokes = 0;
+	int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
+
+	gfs2_ail1_empty(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
+			if (list_empty(&bd->bd_list)) {
+				have_revokes = 1;
+				goto done;
+			}
+		}
+	}
+done:
+	spin_unlock(&sdp->sd_ail_lock);
+	if (have_revokes == 0)
+		return;
+	while (sdp->sd_log_num_revoke > max_revokes)
+		max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+	max_revokes -= sdp->sd_log_num_revoke;
+	if (!sdp->sd_log_num_revoke) {
+		atomic_dec(&sdp->sd_log_blks_free);
+		/* If no blocks have been reserved, we need to also
+		 * reserve a block for the header */
+		if (!sdp->sd_log_blks_reserved)
+			atomic_dec(&sdp->sd_log_blks_free);
+	}
+	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
+			if (max_revokes == 0)
+				goto out_of_blocks;
+			if (!list_empty(&bd->bd_list))
+				continue;
+			gfs2_add_revoke(sdp, bd);
+			max_revokes--;
+		}
+	}
+out_of_blocks:
+	spin_unlock(&sdp->sd_ail_lock);
+	gfs2_log_unlock(sdp);
+
+	if (!sdp->sd_log_num_revoke) {
+		atomic_inc(&sdp->sd_log_blks_free);
+		if (!sdp->sd_log_blks_reserved)
+			atomic_inc(&sdp->sd_log_blks_free);
+	}
+}
+
 /**
  * log_write_header - Get and initialize a journal header buffer
  * @sdp: The GFS2 superblock
@@ -562,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
 	lh = page_address(page);
 	clear_page(lh);
 
-	gfs2_ail1_empty(sdp);
 	tail = current_tail(sdp);
 
 	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3566f35915e0..37216634f0aa 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
 extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
 extern int gfs2_logd(void *data);
+extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
 
 #endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index c5fa758fd844..17c5b5d7dc88 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -16,6 +16,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
+#include <linux/list_sort.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -212,7 +213,7 @@ static void gfs2_end_log_write(struct bio *bio, int error)
 		fs_err(sdp, "Error %d writing to log\n", error);
 	}
 
-	bio_for_each_segment(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i) {
 		page = bvec->bv_page;
 		if (page_has_buffers(page))
 			gfs2_end_log_write_bh(sdp, bvec, error);
@@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh)
 	kunmap_atomic(kaddr);
 }
 
+static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct gfs2_bufdata *bda, *bdb;
+
+	bda = list_entry(a, struct gfs2_bufdata, bd_list);
+	bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+
+	if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+		return -1;
+	if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+		return 1;
+	return 0;
+}
+
 static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
 			       unsigned int total, struct list_head *blist,
 			       bool is_databuf)
@@ -413,13 +428,16 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
 	__be64 *ptr;
 
 	gfs2_log_lock(sdp);
+	list_sort(NULL, blist, blocknr_cmp);
 	bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
 	while(total) {
 		num = total;
 		if (total > limit)
 			num = limit;
 		gfs2_log_unlock(sdp);
-		page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num);
+		page = gfs2_get_log_desc(sdp,
+					 is_databuf ? GFS2_LOG_DESC_JDATA :
+					 GFS2_LOG_DESC_METADATA, num + 1, num);
 		ld = page_address(page);
 		gfs2_log_lock(sdp);
 		ptr = (__be64 *)(ld + 1);
@@ -588,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	struct page *page;
 	unsigned int length;
 
+	gfs2_write_revokes(sdp);
 	if (!sdp->sd_log_num_revoke)
 		return;
 
@@ -834,10 +853,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
 	.lo_name = "revoke",
 };
 
-const struct gfs2_log_operations gfs2_rg_lops = {
-	.lo_name = "rg",
-};
-
 const struct gfs2_log_operations gfs2_databuf_lops = {
 	.lo_before_commit = databuf_lo_before_commit,
 	.lo_after_commit = databuf_lo_after_commit,
@@ -849,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
 const struct gfs2_log_operations *gfs2_log_ops[] = {
 	&gfs2_databuf_lops,
 	&gfs2_buf_lops,
-	&gfs2_rg_lops,
 	&gfs2_revoke_lops,
 	NULL,
 };
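
The blocknr_cmp()/list_sort() change above orders the pending log buffers by disk block number before the log descriptors are built, so the resulting journal I/O is closer to sequential. As a rough standalone sketch of the list_sort() API being used here (illustrative, not from the patch; the example_* names are hypothetical):

#include <linux/list.h>
#include <linux/list_sort.h>

struct example_item {			/* hypothetical list element */
	struct list_head list;
	sector_t blocknr;
};

/* Comparator contract: negative if a sorts before b, positive if after,
 * zero if equal - the same shape as blocknr_cmp() above. */
static int example_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct example_item *ia = list_entry(a, struct example_item, list);
	struct example_item *ib = list_entry(b, struct example_item, list);

	if (ia->blocknr < ib->blocknr)
		return -1;
	if (ia->blocknr > ib->blocknr)
		return 1;
	return 0;
}

static void example_sort(struct list_head *head)
{
	/* Stable in-place merge sort, O(n log n), no allocation. */
	list_sort(NULL, head, example_cmp);
}
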
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 87e062e05c92..9ca2e6438419 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -23,7 +23,6 @@
 extern const struct gfs2_log_operations gfs2_glock_lops;
 extern const struct gfs2_log_operations gfs2_buf_lops;
 extern const struct gfs2_log_operations gfs2_revoke_lops;
-extern const struct gfs2_log_operations gfs2_rg_lops;
 extern const struct gfs2_log_operations gfs2_databuf_lops;
 
 extern const struct gfs2_log_operations *gfs2_log_ops[];
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 1a89afb68472..0da390686c08 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -296,10 +296,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 	if (bd) {
 		spin_lock(&sdp->sd_ail_lock);
 		if (bd->bd_tr) {
-			gfs2_remove_from_ail(bd);
-			bh->b_private = NULL;
-			bd->bd_bh = NULL;
-			bd->bd_blkno = bh->b_blocknr;
 			gfs2_trans_add_revoke(sdp, bd);
 		}
 		spin_unlock(&sdp->sd_ail_lock);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..0262c190b6f9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 		goto fail_quotad;
 
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
-	error = IS_ERR(p);
-	if (error) {
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
 		fs_err(sdp, "can't start logd thread: %d\n", error);
 		return error;
 	}
 	sdp->sd_logd_process = p;
 
 	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
-	error = IS_ERR(p);
-	if (error) {
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
 		fs_err(sdp, "can't start quotad thread: %d\n", error);
 		goto fail;
 	}
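
The init_threads() fix above is worth spelling out: IS_ERR() returns a boolean, so the old error = IS_ERR(p) reported 1 instead of the real errno. The corrected idiom, as a generic sketch under the assumption of a hypothetical kthread-starting caller:

#include <linux/err.h>
#include <linux/kthread.h>

static int example_thread_fn(void *data)	/* hypothetical worker */
{
	while (!kthread_should_stop())
		schedule();
	return 0;
}

static int example_start_thread(void *data)	/* hypothetical helper */
{
	struct task_struct *p;

	p = kthread_run(example_thread_fn, data, "example_thread");
	if (IS_ERR(p))		/* kthread_run() returns ERR_PTR(-errno) */
		return PTR_ERR(p);	/* propagate the real errno, not "1" */
	/* ... stash p somewhere so it can be stopped later ... */
	return 0;
}
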
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c7c840e916f8..3768c2f40e43 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -121,7 +121,7 @@ static u64 qd2index(struct gfs2_quota_data *qd)
 {
 	struct kqid qid = qd->qd_id;
 	return (2 * (u64)from_kqid(&init_user_ns, qid)) +
-		(qid.type == USRQUOTA) ? 0 : 1;
+		((qid.type == USRQUOTA) ? 0 : 1);
 }
 
 static u64 qd2offset(struct gfs2_quota_data *qd)
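
The qd2index() fix above is a classic C precedence trap: the conditional operator binds more loosely than +, so the old expression parsed as (sum + cond) ? 0 : 1 and the intended +0/+1 offset was lost entirely. A minimal standalone illustration (not from the patch):

#include <stdio.h>

int main(void)
{
	int base = 10, is_user = 0;

	/* Without parentheses: parsed as (base + is_user) ? 0 : 1 == 0 */
	int wrong = base + is_user ? 0 : 1;
	/* With parentheses: base plus the selected offset == 11 */
	int right = base + (is_user ? 0 : 1);

	printf("wrong=%d right=%d\n", wrong, right);	/* wrong=0 right=11 */
	return 0;
}
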
@@ -721,7 +721,7 @@ get_a_page:
 		goto unlock_out;
 	}
 
-	gfs2_trans_add_meta(ip->i_gl, bh);
+	gfs2_trans_add_data(ip->i_gl, bh);
 
 	kaddr = kmap_atomic(page);
 	if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
@@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type)
 	return error;
 }
 
-static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
-{
-	return gfs2_quota_sync(sb, type);
-}
-
 int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
 {
 	struct gfs2_quota_data *qd;
@@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data)
 				   &tune->gt_statfs_quantum);
 
 		/* Update quota file */
-		quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
 				   &quotad_timeo, &tune->gt_quota_quantum);
 
 		/* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 0c5a575b513e..69317435faa7 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -638,8 +638,10 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
  */
 void gfs2_rs_delete(struct gfs2_inode *ip)
 {
+	struct inode *inode = &ip->i_inode;
+
 	down_write(&ip->i_rw_mutex);
-	if (ip->i_res) {
+	if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) {
 		gfs2_rs_deltree(ip->i_res);
 		BUG_ON(ip->i_res->rs_free);
 		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
@@ -1286,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 	minlen = max_t(u64, r.minlen,
 		       q->limits.discard_granularity) >> bs_shift;
 
+	if (end <= start || minlen > sdp->sd_max_rg_data)
+		return -EINVAL;
+
 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
-	rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
+	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
 
-	if (end <= start ||
-	    minlen > sdp->sd_max_rg_data ||
-	    start > rgd_end->rd_data0 + rgd_end->rd_data)
-		return -EINVAL;
+	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
+	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
+		return -EINVAL; /* start is beyond the end of the fs */
 
 	while (1) {
 
@@ -1334,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 	}
 
 out:
-	r.len = trimmed << 9;
+	r.len = trimmed << bs_shift;
 	if (copy_to_user(argp, &r, sizeof(r)))
 		return -EFAULT;
 
@@ -1401,9 +1405,14 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
 	u32 extlen;
 	u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
 	int ret;
+	struct inode *inode = &ip->i_inode;
 
-	extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
-	extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+	if (S_ISDIR(inode->i_mode))
+		extlen = 1;
+	else {
+		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
+		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+	}
 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
 		return;
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 917c8e1eb4ae..e5639dec66c4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1444,6 +1444,7 @@ static void gfs2_evict_inode(struct inode *inode)
 		/* Must not read inode block until block type has been verified */
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
 		if (unlikely(error)) {
+			ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 			gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 			goto out;
 		}
@@ -1514,8 +1515,10 @@ out_unlock:
 	if (gfs2_rs_active(ip->i_res))
 		gfs2_rs_deltree(ip->i_res);
 
-	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 		gfs2_glock_dq(&ip->i_iopen_gh);
+	}
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
 	if (error && error != GLR_TRYFAILED && error != -EROFS)
@@ -1534,6 +1537,7 @@ out:
 	ip->i_gl = NULL;
 	if (ip->i_iopen_gh.gh_gl) {
 		ip->i_iopen_gh.gh_gl->gl_object = NULL;
+		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 	}
 }
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 7374907742a8..2b20d7046bf3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -270,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
 
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 {
-	struct gfs2_glock *gl = bd->bd_gl;
 	struct gfs2_trans *tr = current->journal_info;
 
 	BUG_ON(!list_empty(&bd->bd_list));
-	BUG_ON(!list_empty(&bd->bd_ail_st_list));
-	BUG_ON(!list_empty(&bd->bd_ail_gl_list));
-	bd->bd_ops = &gfs2_revoke_lops;
+	gfs2_add_revoke(sdp, bd);
 	tr->tr_touched = 1;
 	tr->tr_num_revoke++;
-	sdp->sd_log_num_revoke++;
-	atomic_inc(&gl->gl_revokes);
-	set_bit(GLF_LFLUSH, &gl->gl_flags);
-	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
 }
 
 void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index f3b1a15ccd59..d3fa6bd9503e 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -415,7 +415,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
 	spin_lock(&tree->hash_lock);
 	node = hfs_bnode_findhash(tree, num);
 	spin_unlock(&tree->hash_lock);
-	BUG_ON(node);
+	if (node) {
+		pr_crit("new node %u already hashed?\n", num);
+		WARN_ON(1);
+		return node;
+	}
 	node = __hfs_bnode_create(tree, num);
 	if (!node)
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index e0101b6fb0d7..145566851e7a 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -51,9 +51,9 @@ done:
 /*
  * hfs_readdir
  */
-static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	int len, err;
 	char strbuf[HFS_MAX_NAMELEN];
@@ -62,7 +62,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct hfs_readdir_data *rd;
 	u16 type;
 
-	if (filp->f_pos >= inode->i_size)
+	if (ctx->pos >= inode->i_size)
 		return 0;
 
 	err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
@@ -73,14 +73,13 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (err)
 		goto out;
 
-	switch ((u32)filp->f_pos) {
-	case 0:
+	if (ctx->pos == 0) {
 		/* This is completely artificial... */
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+		if (!dir_emit_dot(file, ctx))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
 		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
 			err = -EIO;
 			goto out;
@@ -97,18 +96,16 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		//	err = -EIO;
 		//	goto out;
 		//}
-		if (filldir(dirent, "..", 2, 1,
+		if (!dir_emit(ctx, "..", 2,
 			    be32_to_cpu(entry.thread.ParID), DT_DIR))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		if (filp->f_pos >= inode->i_size)
-			goto out;
-		err = hfs_brec_goto(&fd, filp->f_pos - 1);
-		if (err)
-			goto out;
+		ctx->pos = 2;
 	}
+	if (ctx->pos >= inode->i_size)
+		goto out;
+	err = hfs_brec_goto(&fd, ctx->pos - 1);
+	if (err)
+		goto out;
 
 	for (;;) {
 		if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
@@ -131,7 +128,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				err = -EIO;
 				goto out;
 			}
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.dir.DirID), DT_DIR))
 				break;
 		} else if (type == HFS_CDR_FIL) {
@@ -140,7 +137,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				err = -EIO;
 				goto out;
 			}
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.file.FlNum), DT_REG))
 				break;
 		} else {
@@ -148,22 +145,22 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			err = -EIO;
 			goto out;
 		}
-		filp->f_pos++;
-		if (filp->f_pos >= inode->i_size)
+		ctx->pos++;
+		if (ctx->pos >= inode->i_size)
 			goto out;
 		err = hfs_brec_goto(&fd, 1);
 		if (err)
 			goto out;
 	}
-	rd = filp->private_data;
+	rd = file->private_data;
 	if (!rd) {
 		rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL);
 		if (!rd) {
 			err = -ENOMEM;
 			goto out;
 		}
-		filp->private_data = rd;
-		rd->file = filp;
+		file->private_data = rd;
+		rd->file = file;
 		list_add(&rd->list, &HFS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key));
@@ -306,7 +303,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 const struct file_operations hfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= hfs_readdir,
+	.iterate	= hfs_readdir,
 	.llseek		= generic_file_llseek,
 	.release	= hfs_dir_release,
 };
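
The hfs_readdir() conversion above follows the kernel-wide readdir-to-iterate switch seen throughout this series: instead of calling filldir() with explicit offsets, an ->iterate() method advances ctx->pos and feeds entries through dir_emit(), which returns false once the user buffer is full. A minimal synthetic example of the new shape (illustrative only; the example_* names are hypothetical):

#include <linux/fs.h>

/* A directory exposing one hard-coded entry after "." and "..". */
static int example_iterate(struct file *file, struct dir_context *ctx)
{
	if (!dir_emit_dots(file, ctx))	/* emits "." and ".." at pos 0 and 1 */
		return 0;
	if (ctx->pos == 2) {
		if (!dir_emit(ctx, "hello", 5, 100 /* ino */, DT_REG))
			return 0;	/* buffer full; resume here next call */
		ctx->pos++;
	}
	return 0;
}

static const struct file_operations example_dir_fops = {
	.read		= generic_read_dir,
	.iterate	= example_iterate,
	.llseek		= generic_file_llseek,
};
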
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index a73b11839a41..0524cda47a6e 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -229,13 +229,10 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 /* string.c */
 extern const struct dentry_operations hfs_dentry_operations;
 
-extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
-		struct qstr *);
+extern int hfs_hash_dentry(const struct dentry *, struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
 		      const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 /* trans.c */
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 495a976a3cc9..85b610c3909f 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,8 +51,7 @@ static unsigned char caseorder[256] = {
 /*
  * Hash a string to an integer in a case-independent way
  */
-int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *this)
+int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
 {
 	const unsigned char *name = this->name;
 	unsigned int hash, len = this->len;
@@ -93,8 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
  * Test for equality of two strings in the HFS filename character ordering.
  * return 1 on failure and 0 on success
  */
-int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	const unsigned char *n1, *n2;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index a37ac934732f..d8ce4bd17fc5 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -121,9 +121,9 @@ fail:
 	return ERR_PTR(err);
 }
 
-static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	int len, err;
 	char strbuf[HFSPLUS_MAX_STRLEN + 1];
@@ -132,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct hfsplus_readdir_data *rd;
 	u16 type;
 
-	if (filp->f_pos >= inode->i_size)
+	if (file->f_pos >= inode->i_size)
 		return 0;
 
 	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
@@ -143,14 +143,13 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (err)
 		goto out;
 
-	switch ((u32)filp->f_pos) {
-	case 0:
+	if (ctx->pos == 0) {
 		/* This is completely artificial... */
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+		if (!dir_emit_dot(file, ctx))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
 		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
 			err = -EIO;
 			goto out;
@@ -168,19 +167,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			err = -EIO;
 			goto out;
 		}
-		if (filldir(dirent, "..", 2, 1,
+		if (!dir_emit(ctx, "..", 2,
 			    be32_to_cpu(entry.thread.parentID), DT_DIR))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		if (filp->f_pos >= inode->i_size)
-			goto out;
-		err = hfs_brec_goto(&fd, filp->f_pos - 1);
-		if (err)
-			goto out;
 	}
-
+	if (ctx->pos >= inode->i_size)
+		goto out;
+	err = hfs_brec_goto(&fd, ctx->pos - 1);
+	if (err)
+		goto out;
 	for (;;) {
 		if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) {
 			pr_err("walked past end of dir\n");
@@ -211,7 +207,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		    HFSPLUS_SB(sb)->hidden_dir->i_ino ==
 		    be32_to_cpu(entry.folder.id))
 			goto next;
-		if (filldir(dirent, strbuf, len, filp->f_pos,
+		if (!dir_emit(ctx, strbuf, len,
 			    be32_to_cpu(entry.folder.id), DT_DIR))
 			break;
 	} else if (type == HFSPLUS_FILE) {
@@ -220,7 +216,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			err = -EIO;
 			goto out;
 		}
-		if (filldir(dirent, strbuf, len, filp->f_pos,
+		if (!dir_emit(ctx, strbuf, len,
 			    be32_to_cpu(entry.file.id), DT_REG))
 			break;
 	} else {
@@ -229,22 +225,22 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			goto out;
 		}
 next:
-		filp->f_pos++;
-		if (filp->f_pos >= inode->i_size)
+		ctx->pos++;
+		if (ctx->pos >= inode->i_size)
 			goto out;
 		err = hfs_brec_goto(&fd, 1);
 		if (err)
 			goto out;
 	}
-	rd = filp->private_data;
+	rd = file->private_data;
 	if (!rd) {
 		rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL);
 		if (!rd) {
 			err = -ENOMEM;
 			goto out;
 		}
-		filp->private_data = rd;
-		rd->file = filp;
+		file->private_data = rd;
+		rd->file = file;
 		list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
@@ -538,7 +534,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 const struct file_operations hfsplus_dir_operations = {
 	.fsync		= hfsplus_file_fsync,
 	.read		= generic_read_dir,
-	.readdir	= hfsplus_readdir,
+	.iterate	= hfsplus_readdir,
 	.unlocked_ioctl = hfsplus_ioctl,
 	.llseek		= generic_file_llseek,
 	.release	= hfsplus_dir_release,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 60b0a3388b26..ede79317cfb8 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -495,11 +495,8 @@ int hfsplus_uni2asc(struct super_block *,
 		const struct hfsplus_unistr *, char *, int *);
 int hfsplus_asc2uni(struct super_block *,
 		struct hfsplus_unistr *, int, const char *, int);
-int hfsplus_hash_dentry(const struct dentry *dentry,
-		const struct inode *inode, struct qstr *str);
-int hfsplus_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
+int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 /* wrapper.c */
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 2c2e47dcfdd8..e8ef121a4d8b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -334,8 +334,7 @@ int hfsplus_asc2uni(struct super_block *sb,
334 * Composed unicode characters are decomposed and case-folding is performed 334 * Composed unicode characters are decomposed and case-folding is performed
335 * if the appropriate bits are (un)set on the superblock. 335 * if the appropriate bits are (un)set on the superblock.
336 */ 336 */
337int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, 337int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
338 struct qstr *str)
339{ 338{
340 struct super_block *sb = dentry->d_sb; 339 struct super_block *sb = dentry->d_sb;
341 const char *astr; 340 const char *astr;
@@ -386,9 +385,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
386 * Composed unicode characters are decomposed and case-folding is performed 385 * Composed unicode characters are decomposed and case-folding is performed
387 * if the appropriate bits are (un)set on the superblock. 386 * if the appropriate bits are (un)set on the superblock.
388 */ 387 */
389int hfsplus_compare_dentry(const struct dentry *parent, 388int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
390 const struct inode *pinode,
391 const struct dentry *dentry, const struct inode *inode,
392 unsigned int len, const char *str, const struct qstr *name) 389 unsigned int len, const char *str, const struct qstr *name)
393{ 390{
394 struct super_block *sb = parent->d_sb; 391 struct super_block *sb = parent->d_sb;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 32f35f187989..cddb05217512 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -277,7 +277,7 @@ static const struct super_operations hostfs_sbops = {
 	.show_options	= hostfs_show_options,
 };
 
-int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
+int hostfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	void *dir;
 	char *name;
@@ -292,12 +292,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
 	__putname(name);
 	if (dir == NULL)
 		return -error;
-	next = file->f_pos;
+	next = ctx->pos;
 	while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
-		error = (*filldir)(ent, name, len, file->f_pos,
-				   ino, type);
-		if (error) break;
-		file->f_pos = next;
+		if (!dir_emit(ctx, name, len, ino, type))
+			break;
+		ctx->pos = next;
 	}
 	close_dir(dir);
 	return 0;
@@ -393,7 +392,7 @@ static const struct file_operations hostfs_file_fops = {
 
 static const struct file_operations hostfs_dir_fops = {
 	.llseek		= generic_file_llseek,
-	.readdir	= hostfs_readdir,
+	.iterate	= hostfs_readdir,
 	.read		= generic_read_dir,
 };
 
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 05d4816e4e77..fa27980f2229 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,8 +12,7 @@
  * Note: the dentry argument is the parent dentry.
  */
 
-static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	unsigned long hash;
 	int i;
@@ -35,9 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *ino
 	return 0;
 }
 
-static int hpfs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	unsigned al = len;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 546f6d39713a..292b1acb9b81 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -33,36 +33,38 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
 	if (whence == SEEK_DATA || whence == SEEK_HOLE)
 		return -EINVAL;
 
+	mutex_lock(&i->i_mutex);
 	hpfs_lock(s);
 
 	/*printk("dir lseek\n");*/
 	if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
-	mutex_lock(&i->i_mutex);
 	pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1;
 	while (pos != new_off) {
 		if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh);
 		else goto fail;
 		if (pos == 12) goto fail;
 	}
-	mutex_unlock(&i->i_mutex);
+	hpfs_add_pos(i, &filp->f_pos);
 ok:
+	filp->f_pos = new_off;
 	hpfs_unlock(s);
-	return filp->f_pos = new_off;
-fail:
 	mutex_unlock(&i->i_mutex);
+	return new_off;
+fail:
 	/*printk("illegal lseek: %016llx\n", new_off);*/
 	hpfs_unlock(s);
+	mutex_unlock(&i->i_mutex);
 	return -ESPIPE;
 }
 
-static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hpfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
 	int lc;
-	long old_pos;
+	loff_t next_pos;
 	unsigned char *tempname;
 	int c1, c2 = 0;
 	int ret = 0;
@@ -103,11 +105,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 	}
 	lc = hpfs_sb(inode->i_sb)->sb_lowercase;
-	if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */
-		filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */
+	if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */
+		ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */
 		goto out;
 	}
-	if (filp->f_pos == 13) {
+	if (ctx->pos == 13) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -118,33 +120,34 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		   accepted by filldir, but what can I do?
 		   maybe killall -9 ls helps */
 		if (hpfs_sb(inode->i_sb)->sb_chk)
-			if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) {
+			if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) {
 				ret = -EFSERROR;
 				goto out;
 			}
-		if (filp->f_pos == 12)
+		if (ctx->pos == 12)
 			goto out;
-		if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) {
-			printk("HPFS: warning: pos==%d\n",(int)filp->f_pos);
+		if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) {
+			printk("HPFS: warning: pos==%d\n",(int)ctx->pos);
 			goto out;
 		}
-		if (filp->f_pos == 0) {
-			if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
+		if (ctx->pos == 0) {
+			if (!dir_emit_dot(file, ctx))
 				goto out;
-			filp->f_pos = 11;
+			ctx->pos = 11;
 		}
-		if (filp->f_pos == 11) {
-			if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0)
+		if (ctx->pos == 11) {
+			if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR))
 				goto out;
-			filp->f_pos = 1;
+			ctx->pos = 1;
 		}
-		if (filp->f_pos == 1) {
-			filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
-			hpfs_add_pos(inode, &filp->f_pos);
-			filp->f_version = inode->i_version;
+		if (ctx->pos == 1) {
+			ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
+			hpfs_add_pos(inode, &file->f_pos);
+			file->f_version = inode->i_version;
 		}
-		old_pos = filp->f_pos;
-		if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) {
+		next_pos = ctx->pos;
+		if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) {
+			ctx->pos = next_pos;
 			ret = -EIOERROR;
 			goto out;
 		}
@@ -152,20 +155,21 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (hpfs_sb(inode->i_sb)->sb_chk) {
 			if (de->first && !de->last && (de->namelen != 2
 			    || de ->name[0] != 1 || de->name[1] != 1))
-				hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos);
+				hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos);
 			if (de->last && (de->namelen != 1 || de ->name[0] != 255))
-				hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos);
+				hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos);
 		}
 		hpfs_brelse4(&qbh);
+		ctx->pos = next_pos;
 		goto again;
 	}
 	tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
-	if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) {
-		filp->f_pos = old_pos;
+	if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) {
 		if (tempname != de->name) kfree(tempname);
166 hpfs_brelse4(&qbh); 169 hpfs_brelse4(&qbh);
167 goto out; 170 goto out;
168 } 171 }
172 ctx->pos = next_pos;
169 if (tempname != de->name) kfree(tempname); 173 if (tempname != de->name) kfree(tempname);
170 hpfs_brelse4(&qbh); 174 hpfs_brelse4(&qbh);
171 } 175 }
@@ -320,7 +324,7 @@ const struct file_operations hpfs_dir_ops =
320{ 324{
321 .llseek = hpfs_dir_lseek, 325 .llseek = hpfs_dir_lseek,
322 .read = generic_read_dir, 326 .read = generic_read_dir,
323 .readdir = hpfs_readdir, 327 .iterate = hpfs_readdir,
324 .release = hpfs_dir_release, 328 .release = hpfs_dir_release,
325 .fsync = hpfs_file_fsync, 329 .fsync = hpfs_file_fsync,
326}; 330};
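
The hpfs conversion shows the general shape of the filldir-to-iterate
switch: the opaque cookie and callback pair becomes a struct dir_context,
and ctx->pos supersedes direct file->f_pos manipulation inside the
method. A minimal skeleton of the new interface, as a sketch (the
trivialfs names are invented; dir_emit(), dir_emit_dots() and the fop
names are the real API):

	#include <linux/fs.h>

	/* Invented example: one regular file "hello" with inode 42. */
	static int trivialfs_iterate(struct file *file, struct dir_context *ctx)
	{
		/* Emits "." and ".." for positions 0 and 1, bumping ctx->pos. */
		if (!dir_emit_dots(file, ctx))
			return 0;
		if (ctx->pos == 2) {
			/* dir_emit() returns false once the user buffer is
			 * full; ctx->pos is left alone so a later call
			 * resumes at this entry. */
			if (!dir_emit(ctx, "hello", 5, 42, DT_REG))
				return 0;
			ctx->pos++;
		}
		return 0;
	}

	static const struct file_operations trivialfs_dir_ops = {
		.llseek		= generic_file_llseek,
		.read		= generic_read_dir,
		.iterate	= trivialfs_iterate,
	};
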
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3027f4dbbab5..e4ba5fe4c3b5 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -109,10 +109,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
109{ 109{
110 struct inode *inode = mapping->host; 110 struct inode *inode = mapping->host;
111 111
112 hpfs_lock(inode->i_sb);
113
112 if (to > inode->i_size) { 114 if (to > inode->i_size) {
113 truncate_pagecache(inode, to, inode->i_size); 115 truncate_pagecache(inode, to, inode->i_size);
114 hpfs_truncate(inode); 116 hpfs_truncate(inode);
115 } 117 }
118
119 hpfs_unlock(inode->i_sb);
116} 120}
117 121
118static int hpfs_write_begin(struct file *file, struct address_space *mapping, 122static int hpfs_write_begin(struct file *file, struct address_space *mapping,
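
The locking added to hpfs_write_failed() matters because of its caller:
the write_begin error path runs without the per-superblock lock that
hpfs_truncate() expects. Abbreviated from the surrounding file, the
caller is assumed to look roughly like this (a sketch, not the full
function):

	static int hpfs_write_begin(struct file *file, struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned flags,
				    struct page **pagep, void **fsdata)
	{
		int ret;

		*pagep = NULL;
		ret = cont_write_begin(file, mapping, pos, len, flags, pagep,
				       fsdata, hpfs_get_block,
				       &hpfs_i(mapping->host)->mmu_private);
		if (unlikely(ret))
			hpfs_write_failed(mapping, pos + len); /* now takes hpfs_lock */

		return ret;
	}
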
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index cd3e38972c86..fc90ab11c340 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -542,8 +542,8 @@ static const struct file_operations hppfs_file_fops = {
542}; 542};
543 543
544struct hppfs_dirent { 544struct hppfs_dirent {
545 void *vfs_dirent; 545 struct dir_context ctx;
546 filldir_t filldir; 546 struct dir_context *caller;
547 struct dentry *dentry; 547 struct dentry *dentry;
548}; 548};
549 549
@@ -555,34 +555,29 @@ static int hppfs_filldir(void *d, const char *name, int size,
555 if (file_removed(dirent->dentry, name)) 555 if (file_removed(dirent->dentry, name))
556 return 0; 556 return 0;
557 557
558 return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, 558 dirent->caller->pos = dirent->ctx.pos;
559 inode, type); 559 return !dir_emit(dirent->caller, name, size, inode, type);
560} 560}
561 561
562static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) 562static int hppfs_readdir(struct file *file, struct dir_context *ctx)
563{ 563{
564 struct hppfs_private *data = file->private_data; 564 struct hppfs_private *data = file->private_data;
565 struct file *proc_file = data->proc_file; 565 struct file *proc_file = data->proc_file;
566 int (*readdir)(struct file *, void *, filldir_t); 566 struct hppfs_dirent d = {
567 struct hppfs_dirent dirent = ((struct hppfs_dirent) 567 .ctx.actor = hppfs_filldir,
568 { .vfs_dirent = ent, 568 .caller = ctx,
569 .filldir = filldir, 569 .dentry = file->f_path.dentry
570 .dentry = file->f_path.dentry 570 };
571 });
572 int err; 571 int err;
573 572 proc_file->f_pos = ctx->pos;
574 readdir = file_inode(proc_file)->i_fop->readdir; 573 err = iterate_dir(proc_file, &d.ctx);
575 574 ctx->pos = d.ctx.pos;
576 proc_file->f_pos = file->f_pos;
577 err = (*readdir)(proc_file, &dirent, hppfs_filldir);
578 file->f_pos = proc_file->f_pos;
579
580 return err; 575 return err;
581} 576}
582 577
583static const struct file_operations hppfs_dir_fops = { 578static const struct file_operations hppfs_dir_fops = {
584 .owner = NULL, 579 .owner = NULL,
585 .readdir = hppfs_readdir, 580 .iterate = hppfs_readdir,
586 .open = hppfs_dir_open, 581 .open = hppfs_dir_open,
587 .llseek = default_llseek, 582 .llseek = default_llseek,
588 .release = hppfs_release, 583 .release = hppfs_release,
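
hppfs illustrates how a stacking filesystem wraps the new interface: it
embeds a dir_context whose actor filters entries, forwards to
iterate_dir() on the underlying file, and relays the survivors into the
caller's context. The idiom generalizes; in the sketch below,
filter_out() and get_lower_file() are invented placeholders, everything
else mirrors the hppfs code above:

	#include <linux/fs.h>

	struct filter_dirent {
		struct dir_context ctx;	/* kept first: the actor receives &ctx */
		struct dir_context *caller;
	};

	static int filter_actor(void *d, const char *name, int len,
				loff_t offset, u64 ino, unsigned int type)
	{
		struct filter_dirent *f = d;

		if (filter_out(name, len))	/* invented predicate */
			return 0;		/* skip this entry, keep iterating */
		f->caller->pos = f->ctx.pos;
		return !dir_emit(f->caller, name, len, ino, type);
	}

	static int filtered_iterate(struct file *file, struct dir_context *ctx)
	{
		struct file *lower = get_lower_file(file);
		struct filter_dirent f = {
			.ctx.actor = filter_actor,
			.caller = ctx,
		};
		int err;

		lower->f_pos = ctx->pos;
		err = iterate_dir(lower, &f.ctx);
		ctx->pos = f.ctx.pos;
		return err;
	}
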
diff --git a/fs/inode.c b/fs/inode.c
index 00d5fc3b86e1..d6dfb09c8280 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -333,8 +333,10 @@ EXPORT_SYMBOL(set_nlink);
333 */ 333 */
334void inc_nlink(struct inode *inode) 334void inc_nlink(struct inode *inode)
335{ 335{
336 if (WARN_ON(inode->i_nlink == 0)) 336 if (unlikely(inode->i_nlink == 0)) {
337 WARN_ON(!(inode->i_state & I_LINKABLE));
337 atomic_long_dec(&inode->i_sb->s_remove_count); 338 atomic_long_dec(&inode->i_sb->s_remove_count);
339 }
338 340
339 inode->__i_nlink++; 341 inode->__i_nlink++;
340} 342}
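
The relaxed warning lets an I_LINKABLE inode (the O_TMPFILE work landing
in the same window) legitimately go from zero links back to one while
keeping the superblock's count of zero-linked inodes balanced. For
context, the counterpart that increments that count is assumed to look
like this:

	void drop_nlink(struct inode *inode)
	{
		WARN_ON(inode->i_nlink == 0);
		inode->__i_nlink--;
		if (!inode->i_nlink)
			atomic_long_inc(&inode->i_sb->s_remove_count);
	}
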
diff --git a/fs/internal.h b/fs/internal.h
index eaa75f75b625..7c5f01cf619d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -96,11 +96,12 @@ struct open_flags {
96 umode_t mode; 96 umode_t mode;
97 int acc_mode; 97 int acc_mode;
98 int intent; 98 int intent;
99 int lookup_flags;
99}; 100};
100extern struct file *do_filp_open(int dfd, struct filename *pathname, 101extern struct file *do_filp_open(int dfd, struct filename *pathname,
101 const struct open_flags *op, int flags); 102 const struct open_flags *op);
102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, 103extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
103 const char *, const struct open_flags *, int lookup_flags); 104 const char *, const struct open_flags *);
104 105
105extern long do_handle_open(int mountdirfd, 106extern long do_handle_open(int mountdirfd,
106 struct file_handle __user *ufh, int open_flag); 107 struct file_handle __user *ufh, int open_flag);
@@ -130,6 +131,13 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
130 * read_write.c 131 * read_write.c
131 */ 132 */
132extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); 133extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
134extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
135
136/*
137 * splice.c
138 */
139extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
140 loff_t *opos, size_t len, unsigned int flags);
133 141
134/* 142/*
135 * pipe.c 143 * pipe.c
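
rw_verify_area() and do_splice_direct() become visible throughout fs/ so
other VFS code can reuse them. A hypothetical caller, using only the
signatures declared above (the function name and error handling are
illustrative, not from this series):

	static long kernel_copy(struct file *in, struct file *out, size_t len)
	{
		loff_t in_pos = 0, out_pos = 0;
		int ret;

		ret = rw_verify_area(READ, in, &in_pos, len);
		if (ret < 0)
			return ret;

		return do_splice_direct(in, &in_pos, out, &out_pos, len, 0);
	}
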
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index a7d5c3c3d4e6..b943cbd963bb 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -78,8 +78,8 @@ int get_acorn_filename(struct iso_directory_record *de,
78/* 78/*
79 * This should _really_ be cleaned up some day.. 79 * This should _really_ be cleaned up some day..
80 */ 80 */
81static int do_isofs_readdir(struct inode *inode, struct file *filp, 81static int do_isofs_readdir(struct inode *inode, struct file *file,
82 void *dirent, filldir_t filldir, 82 struct dir_context *ctx,
83 char *tmpname, struct iso_directory_record *tmpde) 83 char *tmpname, struct iso_directory_record *tmpde)
84{ 84{
85 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 85 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -94,10 +94,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
94 struct iso_directory_record *de; 94 struct iso_directory_record *de;
95 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); 95 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
96 96
97 offset = filp->f_pos & (bufsize - 1); 97 offset = ctx->pos & (bufsize - 1);
98 block = filp->f_pos >> bufbits; 98 block = ctx->pos >> bufbits;
99 99
100 while (filp->f_pos < inode->i_size) { 100 while (ctx->pos < inode->i_size) {
101 int de_len; 101 int de_len;
102 102
103 if (!bh) { 103 if (!bh) {
@@ -108,7 +108,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
108 108
109 de = (struct iso_directory_record *) (bh->b_data + offset); 109 de = (struct iso_directory_record *) (bh->b_data + offset);
110 110
111 de_len = *(unsigned char *) de; 111 de_len = *(unsigned char *)de;
112 112
113 /* 113 /*
114 * If the length byte is zero, we should move on to the next 114 * If the length byte is zero, we should move on to the next
@@ -119,8 +119,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
119 if (de_len == 0) { 119 if (de_len == 0) {
120 brelse(bh); 120 brelse(bh);
121 bh = NULL; 121 bh = NULL;
122 filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); 122 ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
123 block = filp->f_pos >> bufbits; 123 block = ctx->pos >> bufbits;
124 offset = 0; 124 offset = 0;
125 continue; 125 continue;
126 } 126 }
@@ -164,16 +164,16 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
164 164
165 if (de->flags[-sbi->s_high_sierra] & 0x80) { 165 if (de->flags[-sbi->s_high_sierra] & 0x80) {
166 first_de = 0; 166 first_de = 0;
167 filp->f_pos += de_len; 167 ctx->pos += de_len;
168 continue; 168 continue;
169 } 169 }
170 first_de = 1; 170 first_de = 1;
171 171
172 /* Handle the case of the '.' directory */ 172 /* Handle the case of the '.' directory */
173 if (de->name_len[0] == 1 && de->name[0] == 0) { 173 if (de->name_len[0] == 1 && de->name[0] == 0) {
174 if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) 174 if (!dir_emit_dot(file, ctx))
175 break; 175 break;
176 filp->f_pos += de_len; 176 ctx->pos += de_len;
177 continue; 177 continue;
178 } 178 }
179 179
@@ -181,10 +181,9 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
181 181
182 /* Handle the case of the '..' directory */ 182 /* Handle the case of the '..' directory */
183 if (de->name_len[0] == 1 && de->name[0] == 1) { 183 if (de->name_len[0] == 1 && de->name[0] == 1) {
184 inode_number = parent_ino(filp->f_path.dentry); 184 if (!dir_emit_dotdot(file, ctx))
185 if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0)
186 break; 185 break;
187 filp->f_pos += de_len; 186 ctx->pos += de_len;
188 continue; 187 continue;
189 } 188 }
190 189
@@ -198,7 +197,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
198 if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || 197 if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
199 (!sbi->s_showassoc && 198 (!sbi->s_showassoc &&
200 (de->flags[-sbi->s_high_sierra] & 4))) { 199 (de->flags[-sbi->s_high_sierra] & 4))) {
201 filp->f_pos += de_len; 200 ctx->pos += de_len;
202 continue; 201 continue;
203 } 202 }
204 203
@@ -230,10 +229,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
230 } 229 }
231 } 230 }
232 if (len > 0) { 231 if (len > 0) {
233 if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0) 232 if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN))
234 break; 233 break;
235 } 234 }
236 filp->f_pos += de_len; 235 ctx->pos += de_len;
237 236
238 continue; 237 continue;
239 } 238 }
@@ -247,13 +246,12 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
247 * handling split directory entries.. The real work is done by 246 * handling split directory entries.. The real work is done by
248 * "do_isofs_readdir()". 247 * "do_isofs_readdir()".
249 */ 248 */
250static int isofs_readdir(struct file *filp, 249static int isofs_readdir(struct file *file, struct dir_context *ctx)
251 void *dirent, filldir_t filldir)
252{ 250{
253 int result; 251 int result;
254 char *tmpname; 252 char *tmpname;
255 struct iso_directory_record *tmpde; 253 struct iso_directory_record *tmpde;
256 struct inode *inode = file_inode(filp); 254 struct inode *inode = file_inode(file);
257 255
258 tmpname = (char *)__get_free_page(GFP_KERNEL); 256 tmpname = (char *)__get_free_page(GFP_KERNEL);
259 if (tmpname == NULL) 257 if (tmpname == NULL)
@@ -261,7 +259,7 @@ static int isofs_readdir(struct file *filp,
261 259
262 tmpde = (struct iso_directory_record *) (tmpname+1024); 260 tmpde = (struct iso_directory_record *) (tmpname+1024);
263 261
264 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); 262 result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde);
265 263
266 free_page((unsigned long) tmpname); 264 free_page((unsigned long) tmpname);
267 return result; 265 return result;
@@ -271,7 +269,7 @@ const struct file_operations isofs_dir_operations =
271{ 269{
272 .llseek = generic_file_llseek, 270 .llseek = generic_file_llseek,
273 .read = generic_read_dir, 271 .read = generic_read_dir,
274 .readdir = isofs_readdir, 272 .iterate = isofs_readdir,
275}; 273};
276 274
277/* 275/*
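
Note that the '..' case above no longer computes parent_ino() by hand;
dir_emit_dotdot() hides it. The dot helpers are assumed to be thin
inlines along these lines (reconstructed, not quoted):

	static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
	{
		return ctx->actor(ctx, ".", 1, ctx->pos,
				  file->f_path.dentry->d_inode->i_ino,
				  DT_DIR) == 0;
	}

	static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
	{
		return ctx->actor(ctx, "..", 2, ctx->pos,
				  parent_ino(file->f_path.dentry),
				  DT_DIR) == 0;
	}
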
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d9b8aebdeb22..c348d6d88624 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -28,31 +28,23 @@
28 28
29#define BEQUIET 29#define BEQUIET
30 30
31static int isofs_hashi(const struct dentry *parent, const struct inode *inode, 31static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
32 struct qstr *qstr); 32static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
33static int isofs_hash(const struct dentry *parent, const struct inode *inode,
34 struct qstr *qstr);
35static int isofs_dentry_cmpi(const struct dentry *parent, 33static int isofs_dentry_cmpi(const struct dentry *parent,
36 const struct inode *pinode, 34 const struct dentry *dentry,
37 const struct dentry *dentry, const struct inode *inode,
38 unsigned int len, const char *str, const struct qstr *name); 35 unsigned int len, const char *str, const struct qstr *name);
39static int isofs_dentry_cmp(const struct dentry *parent, 36static int isofs_dentry_cmp(const struct dentry *parent,
40 const struct inode *pinode, 37 const struct dentry *dentry,
41 const struct dentry *dentry, const struct inode *inode,
42 unsigned int len, const char *str, const struct qstr *name); 38 unsigned int len, const char *str, const struct qstr *name);
43 39
44#ifdef CONFIG_JOLIET 40#ifdef CONFIG_JOLIET
45static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, 41static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
46 struct qstr *qstr); 42static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
47static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
48 struct qstr *qstr);
49static int isofs_dentry_cmpi_ms(const struct dentry *parent, 43static int isofs_dentry_cmpi_ms(const struct dentry *parent,
50 const struct inode *pinode, 44 const struct dentry *dentry,
51 const struct dentry *dentry, const struct inode *inode,
52 unsigned int len, const char *str, const struct qstr *name); 45 unsigned int len, const char *str, const struct qstr *name);
53static int isofs_dentry_cmp_ms(const struct dentry *parent, 46static int isofs_dentry_cmp_ms(const struct dentry *parent,
54 const struct inode *pinode, 47 const struct dentry *dentry,
55 const struct dentry *dentry, const struct inode *inode,
56 unsigned int len, const char *str, const struct qstr *name); 48 unsigned int len, const char *str, const struct qstr *name);
57#endif 49#endif
58 50
@@ -265,30 +257,26 @@ static int isofs_dentry_cmp_common(
265} 257}
266 258
267static int 259static int
268isofs_hash(const struct dentry *dentry, const struct inode *inode, 260isofs_hash(const struct dentry *dentry, struct qstr *qstr)
269 struct qstr *qstr)
270{ 261{
271 return isofs_hash_common(dentry, qstr, 0); 262 return isofs_hash_common(dentry, qstr, 0);
272} 263}
273 264
274static int 265static int
275isofs_hashi(const struct dentry *dentry, const struct inode *inode, 266isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
276 struct qstr *qstr)
277{ 267{
278 return isofs_hashi_common(dentry, qstr, 0); 268 return isofs_hashi_common(dentry, qstr, 0);
279} 269}
280 270
281static int 271static int
282isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, 272isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
283 const struct dentry *dentry, const struct inode *inode,
284 unsigned int len, const char *str, const struct qstr *name) 273 unsigned int len, const char *str, const struct qstr *name)
285{ 274{
286 return isofs_dentry_cmp_common(len, str, name, 0, 0); 275 return isofs_dentry_cmp_common(len, str, name, 0, 0);
287} 276}
288 277
289static int 278static int
290isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, 279isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
291 const struct dentry *dentry, const struct inode *inode,
292 unsigned int len, const char *str, const struct qstr *name) 280 unsigned int len, const char *str, const struct qstr *name)
293{ 281{
294 return isofs_dentry_cmp_common(len, str, name, 0, 1); 282 return isofs_dentry_cmp_common(len, str, name, 0, 1);
@@ -296,30 +284,26 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
296 284
297#ifdef CONFIG_JOLIET 285#ifdef CONFIG_JOLIET
298static int 286static int
299isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, 287isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
300 struct qstr *qstr)
301{ 288{
302 return isofs_hash_common(dentry, qstr, 1); 289 return isofs_hash_common(dentry, qstr, 1);
303} 290}
304 291
305static int 292static int
306isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, 293isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
307 struct qstr *qstr)
308{ 294{
309 return isofs_hashi_common(dentry, qstr, 1); 295 return isofs_hashi_common(dentry, qstr, 1);
310} 296}
311 297
312static int 298static int
313isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, 299isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry,
314 const struct dentry *dentry, const struct inode *inode,
315 unsigned int len, const char *str, const struct qstr *name) 300 unsigned int len, const char *str, const struct qstr *name)
316{ 301{
317 return isofs_dentry_cmp_common(len, str, name, 1, 0); 302 return isofs_dentry_cmp_common(len, str, name, 1, 0);
318} 303}
319 304
320static int 305static int
321isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, 306isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry,
322 const struct dentry *dentry, const struct inode *inode,
323 unsigned int len, const char *str, const struct qstr *name) 307 unsigned int len, const char *str, const struct qstr *name)
324{ 308{
325 return isofs_dentry_cmp_common(len, str, name, 1, 1); 309 return isofs_dentry_cmp_common(len, str, name, 1, 1);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c167028844ed..95295640d9c8 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,8 +37,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
37 37
38 qstr.name = compare; 38 qstr.name = compare;
39 qstr.len = dlen; 39 qstr.len = dlen;
40 return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, 40 return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
41 dentry->d_name.len, dentry->d_name.name, &qstr);
42} 41}
43 42
44/* 43/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e3e255c0a509..be0c39b66fe0 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -2019,16 +2019,20 @@ zap_buffer_unlocked:
2019 * void journal_invalidatepage() - invalidate a journal page 2019 * void journal_invalidatepage() - invalidate a journal page
2020 * @journal: journal to use for flush 2020 * @journal: journal to use for flush
2021 * @page: page to flush 2021 * @page: page to flush
2022 * @offset: length of page to invalidate. 2022 * @offset: offset of the range to invalidate
2023 * @length: length of the range to invalidate
2023 * 2024 *
2024 * Reap page buffers containing data after offset in page. 2025 * Reap page buffers containing data in specified range in page.
2025 */ 2026 */
2026void journal_invalidatepage(journal_t *journal, 2027void journal_invalidatepage(journal_t *journal,
2027 struct page *page, 2028 struct page *page,
2028 unsigned long offset) 2029 unsigned int offset,
2030 unsigned int length)
2029{ 2031{
2030 struct buffer_head *head, *bh, *next; 2032 struct buffer_head *head, *bh, *next;
2033 unsigned int stop = offset + length;
2031 unsigned int curr_off = 0; 2034 unsigned int curr_off = 0;
2035 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2032 int may_free = 1; 2036 int may_free = 1;
2033 2037
2034 if (!PageLocked(page)) 2038 if (!PageLocked(page))
@@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
2036 if (!page_has_buffers(page)) 2040 if (!page_has_buffers(page))
2037 return; 2041 return;
2038 2042
2043 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
2044
2039 /* We will potentially be playing with lists other than just the 2045 /* We will potentially be playing with lists other than just the
2040 * data lists (especially for journaled data mode), so be 2046 * data lists (especially for journaled data mode), so be
2041 * cautious in our locking. */ 2047 * cautious in our locking. */
@@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
2045 unsigned int next_off = curr_off + bh->b_size; 2051 unsigned int next_off = curr_off + bh->b_size;
2046 next = bh->b_this_page; 2052 next = bh->b_this_page;
2047 2053
2054 if (next_off > stop)
2055 return;
2056
2048 if (offset <= curr_off) { 2057 if (offset <= curr_off) {
2049 /* This block is wholly outside the truncation point */ 2058 /* This block is wholly outside the truncation point */
2050 lock_buffer(bh); 2059 lock_buffer(bh);
2051 may_free &= journal_unmap_buffer(journal, bh, 2060 may_free &= journal_unmap_buffer(journal, bh,
2052 offset > 0); 2061 partial_page);
2053 unlock_buffer(bh); 2062 unlock_buffer(bh);
2054 } 2063 }
2055 curr_off = next_off; 2064 curr_off = next_off;
@@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
2057 2066
2058 } while (bh != head); 2067 } while (bh != head);
2059 2068
2060 if (!offset) { 2069 if (!partial_page) {
2061 if (may_free && try_to_free_buffers(page)) 2070 if (may_free && try_to_free_buffers(page))
2062 J_ASSERT(!page_has_buffers(page)); 2071 J_ASSERT(!page_has_buffers(page));
2063 } 2072 }
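
This tracks the tree-wide change of ->invalidatepage() from a single
offset to an (offset, length) range; buffers are now freed only when the
whole page is being invalidated (partial_page == 0). A filesystem hook
is assumed to forward the range verbatim, roughly:

	/* Sketch: EXT3_JOURNAL() is the existing per-inode accessor,
	 * the wrapper itself is illustrative. */
	static void example_invalidatepage(struct page *page, unsigned int offset,
					   unsigned int length)
	{
		journal_t *journal = EXT3_JOURNAL(page->mapping->host);

		journal_invalidatepage(journal, page, offset, length);
	}
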
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2944da..5a9f5534d57b 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -20,7 +20,7 @@ config JBD2
20 20
21config JBD2_DEBUG 21config JBD2_DEBUG
22 bool "JBD2 (ext4) debugging support" 22 bool "JBD2 (ext4) debugging support"
23 depends on JBD2 && DEBUG_FS 23 depends on JBD2
24 help 24 help
25 If you are using the ext4 journaled file system (or 25 If you are using the ext4 journaled file system (or
26 potentially any other filesystem/device using JBD2), this option 26 potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@ config JBD2_DEBUG
29 By default, the debugging output will be turned off. 29 By default, the debugging output will be turned off.
30 30
31 If you select Y here, then you will be able to turn on debugging 31 If you select Y here, then you will be able to turn on debugging
32 with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a 32 with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
33 number between 1 and 5. The higher the number, the more debugging 33 number between 1 and 5. The higher the number, the more debugging
34 output is generated. To turn debugging off again, do 34 output is generated. To turn debugging off again, do
35 "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". 35 "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841ee81cf..7f34f4716165 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
120 int nblocks, space_left; 120 int nblocks, space_left;
121 /* assert_spin_locked(&journal->j_state_lock); */ 121 /* assert_spin_locked(&journal->j_state_lock); */
122 122
123 nblocks = jbd_space_needed(journal); 123 nblocks = jbd2_space_needed(journal);
124 while (__jbd2_log_space_left(journal) < nblocks) { 124 while (jbd2_log_space_left(journal) < nblocks) {
125 if (journal->j_flags & JBD2_ABORT) 125 if (journal->j_flags & JBD2_ABORT)
126 return; 126 return;
127 write_unlock(&journal->j_state_lock); 127 write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
140 */ 140 */
141 write_lock(&journal->j_state_lock); 141 write_lock(&journal->j_state_lock);
142 spin_lock(&journal->j_list_lock); 142 spin_lock(&journal->j_list_lock);
143 nblocks = jbd_space_needed(journal); 143 nblocks = jbd2_space_needed(journal);
144 space_left = __jbd2_log_space_left(journal); 144 space_left = jbd2_log_space_left(journal);
145 if (space_left < nblocks) { 145 if (space_left < nblocks) {
146 int chkpt = journal->j_checkpoint_transactions != NULL; 146 int chkpt = journal->j_checkpoint_transactions != NULL;
147 tid_t tid = 0; 147 tid_t tid = 0;
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
156 /* We were able to recover space; yay! */ 156 /* We were able to recover space; yay! */
157 ; 157 ;
158 } else if (tid) { 158 } else if (tid) {
159 /*
160 * jbd2_journal_commit_transaction() may want
161 * to take the checkpoint_mutex if JBD2_FLUSHED
162 * is set. So we need to temporarily drop it.
163 */
164 mutex_unlock(&journal->j_checkpoint_mutex);
159 jbd2_log_wait_commit(journal, tid); 165 jbd2_log_wait_commit(journal, tid);
166 write_lock(&journal->j_state_lock);
167 continue;
160 } else { 168 } else {
161 printk(KERN_ERR "%s: needed %d blocks and " 169 printk(KERN_ERR "%s: needed %d blocks and "
162 "only had %d space available\n", 170 "only had %d space available\n",
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
625 633
626 __jbd2_journal_drop_transaction(journal, transaction); 634 __jbd2_journal_drop_transaction(journal, transaction);
627 jbd2_journal_free_transaction(transaction); 635 jbd2_journal_free_transaction(transaction);
628
629 /* Just in case anybody was waiting for more transactions to be
630 checkpointed... */
631 wake_up(&journal->j_wait_logspace);
632 ret = 1; 636 ret = 1;
633out: 637out:
634 return ret; 638 return ret;
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
690 J_ASSERT(transaction->t_state == T_FINISHED); 694 J_ASSERT(transaction->t_state == T_FINISHED);
691 J_ASSERT(transaction->t_buffers == NULL); 695 J_ASSERT(transaction->t_buffers == NULL);
692 J_ASSERT(transaction->t_forget == NULL); 696 J_ASSERT(transaction->t_forget == NULL);
693 J_ASSERT(transaction->t_iobuf_list == NULL);
694 J_ASSERT(transaction->t_shadow_list == NULL); 697 J_ASSERT(transaction->t_shadow_list == NULL);
695 J_ASSERT(transaction->t_log_list == NULL);
696 J_ASSERT(transaction->t_checkpoint_list == NULL); 698 J_ASSERT(transaction->t_checkpoint_list == NULL);
697 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 699 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
698 J_ASSERT(atomic_read(&transaction->t_updates) == 0); 700 J_ASSERT(atomic_read(&transaction->t_updates) == 0);
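
Both helpers used here replace __jbd2_log_space_left(), which is removed
from journal.c further down. Judging from the call sites,
jbd2_log_space_left() becomes a jbd2.h inline that subtracts a reserve
for descriptor blocks and the committing transaction from j_free; a
plausible reconstruction, not verbatim:

	static inline unsigned long jbd2_log_space_left(journal_t *journal)
	{
		/* Allow for rounding errors. */
		unsigned long free = journal->j_free - 32;

		if (journal->j_committing_transaction) {
			unsigned long committing = atomic_read(
				&journal->j_committing_transaction->t_outstanding_credits);

			/* The committing transaction plus its control blocks. */
			free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT);
		}
		return free;
	}
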
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0f53946f13c1..559bec1a37b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
30#include <trace/events/jbd2.h> 30#include <trace/events/jbd2.h>
31 31
32/* 32/*
33 * Default IO end handler for temporary BJ_IO buffer_heads. 33 * IO end handler for temporary buffer_heads handling writes to the journal.
34 */ 34 */
35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
36{ 36{
37 struct buffer_head *orig_bh = bh->b_private;
38
37 BUFFER_TRACE(bh, ""); 39 BUFFER_TRACE(bh, "");
38 if (uptodate) 40 if (uptodate)
39 set_buffer_uptodate(bh); 41 set_buffer_uptodate(bh);
40 else 42 else
41 clear_buffer_uptodate(bh); 43 clear_buffer_uptodate(bh);
44 if (orig_bh) {
45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 smp_mb__after_clear_bit();
47 wake_up_bit(&orig_bh->b_state, BH_Shadow);
48 }
42 unlock_buffer(bh); 49 unlock_buffer(bh);
43} 50}
44 51
@@ -85,8 +92,7 @@ nope:
85 __brelse(bh); 92 __brelse(bh);
86} 93}
87 94
88static void jbd2_commit_block_csum_set(journal_t *j, 95static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
89 struct journal_head *descriptor)
90{ 96{
91 struct commit_header *h; 97 struct commit_header *h;
92 __u32 csum; 98 __u32 csum;
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j,
94 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 100 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
95 return; 101 return;
96 102
97 h = (struct commit_header *)(jh2bh(descriptor)->b_data); 103 h = (struct commit_header *)(bh->b_data);
98 h->h_chksum_type = 0; 104 h->h_chksum_type = 0;
99 h->h_chksum_size = 0; 105 h->h_chksum_size = 0;
100 h->h_chksum[0] = 0; 106 h->h_chksum[0] = 0;
101 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 107 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
102 j->j_blocksize);
103 h->h_chksum[0] = cpu_to_be32(csum); 108 h->h_chksum[0] = cpu_to_be32(csum);
104} 109}
105 110
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
116 struct buffer_head **cbh, 121 struct buffer_head **cbh,
117 __u32 crc32_sum) 122 __u32 crc32_sum)
118{ 123{
119 struct journal_head *descriptor;
120 struct commit_header *tmp; 124 struct commit_header *tmp;
121 struct buffer_head *bh; 125 struct buffer_head *bh;
122 int ret; 126 int ret;
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
127 if (is_journal_aborted(journal)) 131 if (is_journal_aborted(journal))
128 return 0; 132 return 0;
129 133
130 descriptor = jbd2_journal_get_descriptor_buffer(journal); 134 bh = jbd2_journal_get_descriptor_buffer(journal);
131 if (!descriptor) 135 if (!bh)
132 return 1; 136 return 1;
133 137
134 bh = jh2bh(descriptor);
135
136 tmp = (struct commit_header *)bh->b_data; 138 tmp = (struct commit_header *)bh->b_data;
137 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
138 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
146 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 148 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
147 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 149 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
148 } 150 }
149 jbd2_commit_block_csum_set(journal, descriptor); 151 jbd2_commit_block_csum_set(journal, bh);
150 152
151 JBUFFER_TRACE(descriptor, "submit commit block"); 153 BUFFER_TRACE(bh, "submit commit block");
152 lock_buffer(bh); 154 lock_buffer(bh);
153 clear_buffer_dirty(bh); 155 clear_buffer_dirty(bh);
154 set_buffer_uptodate(bh); 156 set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
180 if (unlikely(!buffer_uptodate(bh))) 182 if (unlikely(!buffer_uptodate(bh)))
181 ret = -EIO; 183 ret = -EIO;
182 put_bh(bh); /* One for getblk() */ 184 put_bh(bh); /* One for getblk() */
183 jbd2_journal_put_journal_head(bh2jh(bh));
184 185
185 return ret; 186 return ret;
186} 187}
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
321} 322}
322 323
323static void jbd2_descr_block_csum_set(journal_t *j, 324static void jbd2_descr_block_csum_set(journal_t *j,
324 struct journal_head *descriptor) 325 struct buffer_head *bh)
325{ 326{
326 struct jbd2_journal_block_tail *tail; 327 struct jbd2_journal_block_tail *tail;
327 __u32 csum; 328 __u32 csum;
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j,
329 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 330 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
330 return; 331 return;
331 332
332 tail = (struct jbd2_journal_block_tail *) 333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
333 (jh2bh(descriptor)->b_data + j->j_blocksize -
334 sizeof(struct jbd2_journal_block_tail)); 334 sizeof(struct jbd2_journal_block_tail));
335 tail->t_checksum = 0; 335 tail->t_checksum = 0;
336 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 336 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
337 j->j_blocksize);
338 tail->t_checksum = cpu_to_be32(csum); 337 tail->t_checksum = cpu_to_be32(csum);
339} 338}
340 339
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
343{ 342{
344 struct page *page = bh->b_page; 343 struct page *page = bh->b_page;
345 __u8 *addr; 344 __u8 *addr;
346 __u32 csum; 345 __u32 csum32;
347 346
348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 347 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
349 return; 348 return;
350 349
351 sequence = cpu_to_be32(sequence); 350 sequence = cpu_to_be32(sequence);
352 addr = kmap_atomic(page); 351 addr = kmap_atomic(page);
353 csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
354 sizeof(sequence)); 353 sizeof(sequence));
355 csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), 354 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
356 bh->b_size); 355 bh->b_size);
357 kunmap_atomic(addr); 356 kunmap_atomic(addr);
358 357
359 tag->t_checksum = cpu_to_be32(csum); 358 /* We only have space to store the lower 16 bits of the crc32c. */
359 tag->t_checksum = cpu_to_be16(csum32);
360} 360}
361/* 361/*
362 * jbd2_journal_commit_transaction 362 * jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
368{ 368{
369 struct transaction_stats_s stats; 369 struct transaction_stats_s stats;
370 transaction_t *commit_transaction; 370 transaction_t *commit_transaction;
371 struct journal_head *jh, *new_jh, *descriptor; 371 struct journal_head *jh;
372 struct buffer_head *descriptor;
372 struct buffer_head **wbuf = journal->j_wbuf; 373 struct buffer_head **wbuf = journal->j_wbuf;
373 int bufs; 374 int bufs;
374 int flags; 375 int flags;
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
392 tid_t first_tid; 393 tid_t first_tid;
393 int update_tail; 394 int update_tail;
394 int csum_size = 0; 395 int csum_size = 0;
396 LIST_HEAD(io_bufs);
397 LIST_HEAD(log_bufs);
395 398
396 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 399 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
397 csum_size = sizeof(struct jbd2_journal_block_tail); 400 csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
424 J_ASSERT(journal->j_committing_transaction == NULL); 427 J_ASSERT(journal->j_committing_transaction == NULL);
425 428
426 commit_transaction = journal->j_running_transaction; 429 commit_transaction = journal->j_running_transaction;
427 J_ASSERT(commit_transaction->t_state == T_RUNNING);
428 430
429 trace_jbd2_start_commit(journal, commit_transaction); 431 trace_jbd2_start_commit(journal, commit_transaction);
430 jbd_debug(1, "JBD2: starting commit of transaction %d\n", 432 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
431 commit_transaction->t_tid); 433 commit_transaction->t_tid);
432 434
433 write_lock(&journal->j_state_lock); 435 write_lock(&journal->j_state_lock);
436 J_ASSERT(commit_transaction->t_state == T_RUNNING);
434 commit_transaction->t_state = T_LOCKED; 437 commit_transaction->t_state = T_LOCKED;
435 438
436 trace_jbd2_commit_locking(journal, commit_transaction); 439 trace_jbd2_commit_locking(journal, commit_transaction);
@@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
520 */ 523 */
521 jbd2_journal_switch_revoke_table(journal); 524 jbd2_journal_switch_revoke_table(journal);
522 525
526 /*
527 * Reserved credits cannot be claimed anymore, free them
528 */
529 atomic_sub(atomic_read(&journal->j_reserved_credits),
530 &commit_transaction->t_outstanding_credits);
531
523 trace_jbd2_commit_flushing(journal, commit_transaction); 532 trace_jbd2_commit_flushing(journal, commit_transaction);
524 stats.run.rs_flushing = jiffies; 533 stats.run.rs_flushing = jiffies;
525 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, 534 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
533 wake_up(&journal->j_wait_transaction_locked); 542 wake_up(&journal->j_wait_transaction_locked);
534 write_unlock(&journal->j_state_lock); 543 write_unlock(&journal->j_state_lock);
535 544
536 jbd_debug(3, "JBD2: commit phase 2\n"); 545 jbd_debug(3, "JBD2: commit phase 2a\n");
537 546
538 /* 547 /*
539 * Now start flushing things to disk, in the order they appear 548 * Now start flushing things to disk, in the order they appear
@@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
545 554
546 blk_start_plug(&plug); 555 blk_start_plug(&plug);
547 jbd2_journal_write_revoke_records(journal, commit_transaction, 556 jbd2_journal_write_revoke_records(journal, commit_transaction,
548 WRITE_SYNC); 557 &log_bufs, WRITE_SYNC);
549 blk_finish_plug(&plug); 558 blk_finish_plug(&plug);
550 559
551 jbd_debug(3, "JBD2: commit phase 2\n"); 560 jbd_debug(3, "JBD2: commit phase 2b\n");
552 561
553 /* 562 /*
554 * Way to go: we have now written out all of the data for a 563 * Way to go: we have now written out all of the data for a
@@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
571 atomic_read(&commit_transaction->t_outstanding_credits)); 580 atomic_read(&commit_transaction->t_outstanding_credits));
572 581
573 err = 0; 582 err = 0;
574 descriptor = NULL;
575 bufs = 0; 583 bufs = 0;
584 descriptor = NULL;
576 blk_start_plug(&plug); 585 blk_start_plug(&plug);
577 while (commit_transaction->t_buffers) { 586 while (commit_transaction->t_buffers) {
578 587
@@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
604 record the metadata buffer. */ 613 record the metadata buffer. */
605 614
606 if (!descriptor) { 615 if (!descriptor) {
607 struct buffer_head *bh;
608
609 J_ASSERT (bufs == 0); 616 J_ASSERT (bufs == 0);
610 617
611 jbd_debug(4, "JBD2: get descriptor\n"); 618 jbd_debug(4, "JBD2: get descriptor\n");
@@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 continue; 623 continue;
617 } 624 }
618 625
619 bh = jh2bh(descriptor);
620 jbd_debug(4, "JBD2: got buffer %llu (%p)\n", 626 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
621 (unsigned long long)bh->b_blocknr, bh->b_data); 627 (unsigned long long)descriptor->b_blocknr,
622 header = (journal_header_t *)&bh->b_data[0]; 628 descriptor->b_data);
629 header = (journal_header_t *)descriptor->b_data;
623 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 630 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
624 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); 631 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
625 header->h_sequence = cpu_to_be32(commit_transaction->t_tid); 632 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
626 633
627 tagp = &bh->b_data[sizeof(journal_header_t)]; 634 tagp = &descriptor->b_data[sizeof(journal_header_t)];
628 space_left = bh->b_size - sizeof(journal_header_t); 635 space_left = descriptor->b_size -
636 sizeof(journal_header_t);
629 first_tag = 1; 637 first_tag = 1;
630 set_buffer_jwrite(bh); 638 set_buffer_jwrite(descriptor);
631 set_buffer_dirty(bh); 639 set_buffer_dirty(descriptor);
632 wbuf[bufs++] = bh; 640 wbuf[bufs++] = descriptor;
633 641
634 /* Record it so that we can wait for IO 642 /* Record it so that we can wait for IO
635 completion later */ 643 completion later */
636 BUFFER_TRACE(bh, "ph3: file as descriptor"); 644 BUFFER_TRACE(descriptor, "ph3: file as descriptor");
637 jbd2_journal_file_buffer(descriptor, commit_transaction, 645 jbd2_file_log_bh(&log_bufs, descriptor);
638 BJ_LogCtl);
639 } 646 }
640 647
641 /* Where is the buffer to be written? */ 648 /* Where is the buffer to be written? */
@@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
658 665
659 /* Bump b_count to prevent truncate from stumbling over 666 /* Bump b_count to prevent truncate from stumbling over
660 the shadowed buffer! @@@ This can go if we ever get 667 the shadowed buffer! @@@ This can go if we ever get
661 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 668 rid of the shadow pairing of buffers. */
662 atomic_inc(&jh2bh(jh)->b_count); 669 atomic_inc(&jh2bh(jh)->b_count);
663 670
664 /* Make a temporary IO buffer with which to write it out
665 (this will requeue both the metadata buffer and the
666 temporary IO buffer). new_bh goes on BJ_IO*/
667
668 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
669 /* 671 /*
670 * akpm: jbd2_journal_write_metadata_buffer() sets 672 * Make a temporary IO buffer with which to write it out
671 * new_bh->b_transaction to commit_transaction. 673 * (this will requeue the metadata buffer to BJ_Shadow).
672 * We need to clean this up before we release new_bh
673 * (which is of type BJ_IO)
674 */ 674 */
675 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
675 JBUFFER_TRACE(jh, "ph3: write metadata"); 676 JBUFFER_TRACE(jh, "ph3: write metadata");
676 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 677 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
677 jh, &new_jh, blocknr); 678 jh, &wbuf[bufs], blocknr);
678 if (flags < 0) { 679 if (flags < 0) {
679 jbd2_journal_abort(journal, flags); 680 jbd2_journal_abort(journal, flags);
680 continue; 681 continue;
681 } 682 }
682 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 683 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
683 wbuf[bufs++] = jh2bh(new_jh);
684 684
685 /* Record the new block's tag in the current descriptor 685 /* Record the new block's tag in the current descriptor
686 buffer */ 686 buffer */
@@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
694 tag = (journal_block_tag_t *) tagp; 694 tag = (journal_block_tag_t *) tagp;
695 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 695 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
696 tag->t_flags = cpu_to_be16(tag_flag); 696 tag->t_flags = cpu_to_be16(tag_flag);
697 jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), 697 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
698 commit_transaction->t_tid); 698 commit_transaction->t_tid);
699 tagp += tag_bytes; 699 tagp += tag_bytes;
700 space_left -= tag_bytes; 700 space_left -= tag_bytes;
701 bufs++;
701 702
702 if (first_tag) { 703 if (first_tag) {
703 memcpy (tagp, journal->j_uuid, 16); 704 memcpy (tagp, journal->j_uuid, 16);
@@ -809,7 +810,7 @@ start_journal_io:
809 the log. Before we can commit it, wait for the IO so far to 810 the log. Before we can commit it, wait for the IO so far to
810 complete. Control buffers being written are on the 811 complete. Control buffers being written are on the
811 transaction's t_log_list queue, and metadata buffers are on 812 transaction's t_log_list queue, and metadata buffers are on
812 the t_iobuf_list queue. 813 the io_bufs list.
813 814
814 Wait for the buffers in reverse order. That way we are 815 Wait for the buffers in reverse order. That way we are
815 less likely to be woken up until all IOs have completed, and 816 less likely to be woken up until all IOs have completed, and
@@ -818,47 +819,33 @@ start_journal_io:
818 819
819 jbd_debug(3, "JBD2: commit phase 3\n"); 820 jbd_debug(3, "JBD2: commit phase 3\n");
820 821
821 /* 822 while (!list_empty(&io_bufs)) {
822 * akpm: these are BJ_IO, and j_list_lock is not needed. 823 struct buffer_head *bh = list_entry(io_bufs.prev,
823 * See __journal_try_to_free_buffer. 824 struct buffer_head,
824 */ 825 b_assoc_buffers);
825wait_for_iobuf:
826 while (commit_transaction->t_iobuf_list != NULL) {
827 struct buffer_head *bh;
828 826
829 jh = commit_transaction->t_iobuf_list->b_tprev; 827 wait_on_buffer(bh);
830 bh = jh2bh(jh); 828 cond_resched();
831 if (buffer_locked(bh)) {
832 wait_on_buffer(bh);
833 goto wait_for_iobuf;
834 }
835 if (cond_resched())
836 goto wait_for_iobuf;
837 829
838 if (unlikely(!buffer_uptodate(bh))) 830 if (unlikely(!buffer_uptodate(bh)))
839 err = -EIO; 831 err = -EIO;
840 832 jbd2_unfile_log_bh(bh);
841 clear_buffer_jwrite(bh);
842
843 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
844 jbd2_journal_unfile_buffer(journal, jh);
845 833
846 /* 834 /*
847 * ->t_iobuf_list should contain only dummy buffer_heads 835 * The list contains temporary buffer heads created by
848 * which were created by jbd2_journal_write_metadata_buffer(). 836 * jbd2_journal_write_metadata_buffer().
849 */ 837 */
850 BUFFER_TRACE(bh, "dumping temporary bh"); 838 BUFFER_TRACE(bh, "dumping temporary bh");
851 jbd2_journal_put_journal_head(jh);
852 __brelse(bh); 839 __brelse(bh);
853 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 840 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
854 free_buffer_head(bh); 841 free_buffer_head(bh);
855 842
856 /* We also have to unlock and free the corresponding 843 /* We also have to refile the corresponding shadowed buffer */
857 shadowed buffer */
858 jh = commit_transaction->t_shadow_list->b_tprev; 844 jh = commit_transaction->t_shadow_list->b_tprev;
859 bh = jh2bh(jh); 845 bh = jh2bh(jh);
860 clear_bit(BH_JWrite, &bh->b_state); 846 clear_buffer_jwrite(bh);
861 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 847 J_ASSERT_BH(bh, buffer_jbddirty(bh));
848 J_ASSERT_BH(bh, !buffer_shadow(bh));
862 849
863 /* The metadata is now released for reuse, but we need 850 /* The metadata is now released for reuse, but we need
864 to remember it against this transaction so that when 851 to remember it against this transaction so that when
@@ -866,14 +853,6 @@ wait_for_iobuf:
866 required. */ 853 required. */
867 JBUFFER_TRACE(jh, "file as BJ_Forget"); 854 JBUFFER_TRACE(jh, "file as BJ_Forget");
868 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 855 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
869 /*
870 * Wake up any transactions which were waiting for this IO to
871 * complete. The barrier must be here so that changes by
872 * jbd2_journal_file_buffer() take effect before wake_up_bit()
873 * does the waitqueue check.
874 */
875 smp_mb();
876 wake_up_bit(&bh->b_state, BH_Unshadow);
877 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 856 JBUFFER_TRACE(jh, "brelse shadowed buffer");
878 __brelse(bh); 857 __brelse(bh);
879 } 858 }
@@ -883,26 +862,19 @@ wait_for_iobuf:
883 jbd_debug(3, "JBD2: commit phase 4\n"); 862 jbd_debug(3, "JBD2: commit phase 4\n");
884 863
885 /* Here we wait for the revoke record and descriptor record buffers */ 864 /* Here we wait for the revoke record and descriptor record buffers */
886 wait_for_ctlbuf: 865 while (!list_empty(&log_bufs)) {
887 while (commit_transaction->t_log_list != NULL) {
888 struct buffer_head *bh; 866 struct buffer_head *bh;
889 867
890 jh = commit_transaction->t_log_list->b_tprev; 868 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
891 bh = jh2bh(jh); 869 wait_on_buffer(bh);
892 if (buffer_locked(bh)) { 870 cond_resched();
893 wait_on_buffer(bh);
894 goto wait_for_ctlbuf;
895 }
896 if (cond_resched())
897 goto wait_for_ctlbuf;
898 871
899 if (unlikely(!buffer_uptodate(bh))) 872 if (unlikely(!buffer_uptodate(bh)))
900 err = -EIO; 873 err = -EIO;
901 874
902 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 875 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
903 clear_buffer_jwrite(bh); 876 clear_buffer_jwrite(bh);
904 jbd2_journal_unfile_buffer(journal, jh); 877 jbd2_unfile_log_bh(bh);
905 jbd2_journal_put_journal_head(jh);
906 __brelse(bh); /* One for getblk */ 878 __brelse(bh); /* One for getblk */
907 /* AKPM: bforget here */ 879 /* AKPM: bforget here */
908 } 880 }
@@ -952,9 +924,7 @@ wait_for_iobuf:
952 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 924 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
953 J_ASSERT(commit_transaction->t_buffers == NULL); 925 J_ASSERT(commit_transaction->t_buffers == NULL);
954 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 926 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
955 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
956 J_ASSERT(commit_transaction->t_shadow_list == NULL); 927 J_ASSERT(commit_transaction->t_shadow_list == NULL);
957 J_ASSERT(commit_transaction->t_log_list == NULL);
958 928
959restart_loop: 929restart_loop:
960 /* 930 /*
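
The commit path above stops threading control and IO buffers through
journal_head lists (BJ_IO and BJ_LogCtl are gone) and keeps them on the
local io_bufs/log_bufs lists instead; the shadow state moves into a
buffer_head flag, cleared in journal_end_buffer_io_sync() through the
b_private backpointer that jbd2_journal_write_metadata_buffer() now
sets. The two list helpers are assumed to be trivial wrappers over the
otherwise unused b_assoc_buffers link:

	static inline void jbd2_file_log_bh(struct list_head *head,
					    struct buffer_head *bh)
	{
		list_add_tail(&bh->b_assoc_buffers, head);
	}

	static inline void jbd2_unfile_log_bh(struct buffer_head *bh)
	{
		list_del_init(&bh->b_assoc_buffers);
	}
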
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 95457576e434..02c7ad9d7a41 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache);
103static void __journal_abort_soft (journal_t *journal, int errno); 103static void __journal_abort_soft (journal_t *journal, int errno);
104static int jbd2_journal_create_slab(size_t slab_size); 104static int jbd2_journal_create_slab(size_t slab_size);
105 105
106#ifdef CONFIG_JBD2_DEBUG
107void __jbd2_debug(int level, const char *file, const char *func,
108 unsigned int line, const char *fmt, ...)
109{
110 struct va_format vaf;
111 va_list args;
112
113 if (level > jbd2_journal_enable_debug)
114 return;
115 va_start(args, fmt);
116 vaf.fmt = fmt;
117 vaf.va = &args;
118 printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
119 va_end(args);
120}
121EXPORT_SYMBOL(__jbd2_debug);
122#endif
123
106/* Checksumming functions */ 124/* Checksumming functions */
107int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
108{ 126{
@@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal)
310 * 328 *
311 * If the source buffer has already been modified by a new transaction 329 * If the source buffer has already been modified by a new transaction
312 * since we took the last commit snapshot, we use the frozen copy of 330 * since we took the last commit snapshot, we use the frozen copy of
313 * that data for IO. If we end up using the existing buffer_head's data 331 * that data for IO. If we end up using the existing buffer_head's data
314 * for the write, then we *have* to lock the buffer to prevent anyone 332 * for the write, then we have to make sure nobody modifies it while the
315 * else from using and possibly modifying it while the IO is in 333 * IO is in progress. do_get_write_access() handles this.
316 * progress.
317 * 334 *
318 * The function returns a pointer to the buffer_heads to be used for IO. 335 * The function returns a pointer to the buffer_head to be used for IO.
319 * 336 *
320 * We assume that the journal has already been locked in this function.
321 * 337 *
322 * Return value: 338 * Return value:
323 * <0: Error 339 * <0: Error
@@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal)
330 346
331int jbd2_journal_write_metadata_buffer(transaction_t *transaction, 347int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
332 struct journal_head *jh_in, 348 struct journal_head *jh_in,
333 struct journal_head **jh_out, 349 struct buffer_head **bh_out,
334 unsigned long long blocknr) 350 sector_t blocknr)
335{ 351{
336 int need_copy_out = 0; 352 int need_copy_out = 0;
337 int done_copy_out = 0; 353 int done_copy_out = 0;
338 int do_escape = 0; 354 int do_escape = 0;
339 char *mapped_data; 355 char *mapped_data;
340 struct buffer_head *new_bh; 356 struct buffer_head *new_bh;
341 struct journal_head *new_jh;
342 struct page *new_page; 357 struct page *new_page;
343 unsigned int new_offset; 358 unsigned int new_offset;
344 struct buffer_head *bh_in = jh2bh(jh_in); 359 struct buffer_head *bh_in = jh2bh(jh_in);
@@ -368,14 +383,13 @@ retry_alloc:
368 383
369 /* keep subsequent assertions sane */ 384 /* keep subsequent assertions sane */
370 atomic_set(&new_bh->b_count, 1); 385 atomic_set(&new_bh->b_count, 1);
371 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
372 386
387 jbd_lock_bh_state(bh_in);
388repeat:
373 /* 389 /*
374 * If a new transaction has already done a buffer copy-out, then 390 * If a new transaction has already done a buffer copy-out, then
375 * we use that version of the data for the commit. 391 * we use that version of the data for the commit.
376 */ 392 */
377 jbd_lock_bh_state(bh_in);
378repeat:
379 if (jh_in->b_frozen_data) { 393 if (jh_in->b_frozen_data) {
380 done_copy_out = 1; 394 done_copy_out = 1;
381 new_page = virt_to_page(jh_in->b_frozen_data); 395 new_page = virt_to_page(jh_in->b_frozen_data);
@@ -415,7 +429,7 @@ repeat:
415 jbd_unlock_bh_state(bh_in); 429 jbd_unlock_bh_state(bh_in);
416 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 430 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
417 if (!tmp) { 431 if (!tmp) {
418 jbd2_journal_put_journal_head(new_jh); 432 brelse(new_bh);
419 return -ENOMEM; 433 return -ENOMEM;
420 } 434 }
421 jbd_lock_bh_state(bh_in); 435 jbd_lock_bh_state(bh_in);
@@ -426,7 +440,7 @@ repeat:
426 440
427 jh_in->b_frozen_data = tmp; 441 jh_in->b_frozen_data = tmp;
428 mapped_data = kmap_atomic(new_page); 442 mapped_data = kmap_atomic(new_page);
429 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); 443 memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
430 kunmap_atomic(mapped_data); 444 kunmap_atomic(mapped_data);
431 445
432 new_page = virt_to_page(tmp); 446 new_page = virt_to_page(tmp);
@@ -452,14 +466,14 @@ repeat:
452 } 466 }
453 467
454 set_bh_page(new_bh, new_page, new_offset); 468 set_bh_page(new_bh, new_page, new_offset);
455 new_jh->b_transaction = NULL; 469 new_bh->b_size = bh_in->b_size;
456 new_bh->b_size = jh2bh(jh_in)->b_size; 470 new_bh->b_bdev = journal->j_dev;
457 new_bh->b_bdev = transaction->t_journal->j_dev;
458 new_bh->b_blocknr = blocknr; 471 new_bh->b_blocknr = blocknr;
472 new_bh->b_private = bh_in;
459 set_buffer_mapped(new_bh); 473 set_buffer_mapped(new_bh);
460 set_buffer_dirty(new_bh); 474 set_buffer_dirty(new_bh);
461 475
462 *jh_out = new_jh; 476 *bh_out = new_bh;
463 477
464 /* 478 /*
465 * The to-be-written buffer needs to get moved to the io queue, 479 * The to-be-written buffer needs to get moved to the io queue,
@@ -470,11 +484,9 @@ repeat:
470 spin_lock(&journal->j_list_lock); 484 spin_lock(&journal->j_list_lock);
471 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 485 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
472 spin_unlock(&journal->j_list_lock); 486 spin_unlock(&journal->j_list_lock);
487 set_buffer_shadow(bh_in);
473 jbd_unlock_bh_state(bh_in); 488 jbd_unlock_bh_state(bh_in);
474 489
475 JBUFFER_TRACE(new_jh, "file as BJ_IO");
476 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
477
478 return do_escape | (done_copy_out << 1); 490 return do_escape | (done_copy_out << 1);
479} 491}
480 492
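For reference, the two status bits packed into that return value decode independently; a minimal caller-side sketch (the helper names are illustrative, not part of jbd2):

        /*
         * Hypothetical decoders for the jbd2_journal_write_metadata_buffer()
         * return value: bit 0 is do_escape, bit 1 is done_copy_out.
         */
        static inline int jbd2_wmb_escaped(int flags)
        {
                return flags & 1;
        }

        static inline int jbd2_wmb_copied_out(int flags)
        {
                return (flags >> 1) & 1;
        }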
@@ -484,35 +496,6 @@ repeat:
484 */ 496 */
485 497
486/* 498/*
487 * __jbd2_log_space_left: Return the number of free blocks left in the journal.
488 *
489 * Called with the journal already locked.
490 *
491 * Called under j_state_lock
492 */
493
494int __jbd2_log_space_left(journal_t *journal)
495{
496 int left = journal->j_free;
497
498 /* assert_spin_locked(&journal->j_state_lock); */
499
500 /*
501 * Be pessimistic here about the number of those free blocks which
502 * might be required for log descriptor control blocks.
503 */
504
505#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
506
507 left -= MIN_LOG_RESERVED_BLOCKS;
508
509 if (left <= 0)
510 return 0;
511 left -= (left >> 3);
512 return left;
513}
514
515/*
516 * Called with j_state_lock locked for writing. 499 * Called with j_state_lock locked for writing.
517 * Returns true if a transaction commit was started. 500 * Returns true if a transaction commit was started.
518 */ 501 */
@@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
564} 547}
565 548
566/* 549/*
567 * Force and wait upon a commit if the calling process is not within 550 * Force and wait on any uncommitted transactions. We can only force the running
568 * transaction. This is used for forcing out undo-protected data which contains 551 * transaction if we don't have an active handle; otherwise we will deadlock.
569 * bitmaps, when the fs is running out of space. 552 * Returns: <0 in case of error,
570 * 553 * 0 if nothing to commit,
571 * We can only force the running transaction if we don't have an active handle; 554 * 1 if transaction was successfully committed.
572 * otherwise, we will deadlock.
573 *
574 * Returns true if a transaction was started.
575 */ 555 */
576int jbd2_journal_force_commit_nested(journal_t *journal) 556static int __jbd2_journal_force_commit(journal_t *journal)
577{ 557{
578 transaction_t *transaction = NULL; 558 transaction_t *transaction = NULL;
579 tid_t tid; 559 tid_t tid;
580 int need_to_start = 0; 560 int need_to_start = 0, ret = 0;
581 561
582 read_lock(&journal->j_state_lock); 562 read_lock(&journal->j_state_lock);
583 if (journal->j_running_transaction && !current->journal_info) { 563 if (journal->j_running_transaction && !current->journal_info) {
@@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
588 transaction = journal->j_committing_transaction; 568 transaction = journal->j_committing_transaction;
589 569
590 if (!transaction) { 570 if (!transaction) {
571 /* Nothing to commit */
591 read_unlock(&journal->j_state_lock); 572 read_unlock(&journal->j_state_lock);
592 return 0; /* Nothing to retry */ 573 return 0;
593 } 574 }
594
595 tid = transaction->t_tid; 575 tid = transaction->t_tid;
596 read_unlock(&journal->j_state_lock); 576 read_unlock(&journal->j_state_lock);
597 if (need_to_start) 577 if (need_to_start)
598 jbd2_log_start_commit(journal, tid); 578 jbd2_log_start_commit(journal, tid);
599 jbd2_log_wait_commit(journal, tid); 579 ret = jbd2_log_wait_commit(journal, tid);
600 return 1; 580 if (!ret)
581 ret = 1;
582
583 return ret;
584}
585
586/**
587 * Force and wait upon a commit if the calling process is not within a
588 * transaction. This is used for forcing out undo-protected data which contains
589 * bitmaps, when the fs is running out of space.
590 *
591 * @journal: journal to force
592 * Returns true if progress was made.
593 */
594int jbd2_journal_force_commit_nested(journal_t *journal)
595{
596 int ret;
597
598 ret = __jbd2_journal_force_commit(journal);
599 return ret > 0;
600}
601
602/**
603 * int jbd2_journal_force_commit() - force any uncommitted transactions
604 * @journal: journal to force
605 *
606 * Caller wants an unconditional commit. We can only force the running transaction
607 * if we don't have an active handle, otherwise, we will deadlock.
608 */
609int jbd2_journal_force_commit(journal_t *journal)
610{
611 int ret;
612
613 J_ASSERT(!current->journal_info);
614 ret = __jbd2_journal_force_commit(journal);
615 if (ret > 0)
616 ret = 0;
617 return ret;
601} 618}
602 619
603/* 620/*
@@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
798 * But we don't bother doing that, so there will be coherency problems with 815 * But we don't bother doing that, so there will be coherency problems with
799 * mmaps of blockdevs which hold live JBD-controlled filesystems. 816 * mmaps of blockdevs which hold live JBD-controlled filesystems.
800 */ 817 */
801struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 818struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
802{ 819{
803 struct buffer_head *bh; 820 struct buffer_head *bh;
804 unsigned long long blocknr; 821 unsigned long long blocknr;
@@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
817 set_buffer_uptodate(bh); 834 set_buffer_uptodate(bh);
818 unlock_buffer(bh); 835 unlock_buffer(bh);
819 BUFFER_TRACE(bh, "return this buffer"); 836 BUFFER_TRACE(bh, "return this buffer");
820 return jbd2_journal_add_journal_head(bh); 837 return bh;
821} 838}
822 839
823/* 840/*
@@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void)
1062 return NULL; 1079 return NULL;
1063 1080
1064 init_waitqueue_head(&journal->j_wait_transaction_locked); 1081 init_waitqueue_head(&journal->j_wait_transaction_locked);
1065 init_waitqueue_head(&journal->j_wait_logspace);
1066 init_waitqueue_head(&journal->j_wait_done_commit); 1082 init_waitqueue_head(&journal->j_wait_done_commit);
1067 init_waitqueue_head(&journal->j_wait_checkpoint);
1068 init_waitqueue_head(&journal->j_wait_commit); 1083 init_waitqueue_head(&journal->j_wait_commit);
1069 init_waitqueue_head(&journal->j_wait_updates); 1084 init_waitqueue_head(&journal->j_wait_updates);
1085 init_waitqueue_head(&journal->j_wait_reserved);
1070 mutex_init(&journal->j_barrier); 1086 mutex_init(&journal->j_barrier);
1071 mutex_init(&journal->j_checkpoint_mutex); 1087 mutex_init(&journal->j_checkpoint_mutex);
1072 spin_lock_init(&journal->j_revoke_lock); 1088 spin_lock_init(&journal->j_revoke_lock);
@@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void)
1076 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 1092 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
1077 journal->j_min_batch_time = 0; 1093 journal->j_min_batch_time = 0;
1078 journal->j_max_batch_time = 15000; /* 15ms */ 1094 journal->j_max_batch_time = 15000; /* 15ms */
1095 atomic_set(&journal->j_reserved_credits, 0);
1079 1096
1080 /* The journal is marked for error until we succeed with recovery! */ 1097 /* The journal is marked for error until we succeed with recovery! */
1081 journal->j_flags = JBD2_ABORT; 1098 journal->j_flags = JBD2_ABORT;
@@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal)
1318static void jbd2_write_superblock(journal_t *journal, int write_op) 1335static void jbd2_write_superblock(journal_t *journal, int write_op)
1319{ 1336{
1320 struct buffer_head *bh = journal->j_sb_buffer; 1337 struct buffer_head *bh = journal->j_sb_buffer;
1338 journal_superblock_t *sb = journal->j_superblock;
1321 int ret; 1339 int ret;
1322 1340
1323 trace_jbd2_write_superblock(journal, write_op); 1341 trace_jbd2_write_superblock(journal, write_op);
@@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
1339 clear_buffer_write_io_error(bh); 1357 clear_buffer_write_io_error(bh);
1340 set_buffer_uptodate(bh); 1358 set_buffer_uptodate(bh);
1341 } 1359 }
1360 jbd2_superblock_csum_set(journal, sb);
1342 get_bh(bh); 1361 get_bh(bh);
1343 bh->b_end_io = end_buffer_write_sync; 1362 bh->b_end_io = end_buffer_write_sync;
1344 ret = submit_bh(write_op, bh); 1363 ret = submit_bh(write_op, bh);
@@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
1435 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", 1454 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1436 journal->j_errno); 1455 journal->j_errno);
1437 sb->s_errno = cpu_to_be32(journal->j_errno); 1456 sb->s_errno = cpu_to_be32(journal->j_errno);
1438 jbd2_superblock_csum_set(journal, sb);
1439 read_unlock(&journal->j_state_lock); 1457 read_unlock(&journal->j_state_lock);
1440 1458
1441 jbd2_write_superblock(journal, WRITE_SYNC); 1459 jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void)
2325#ifdef CONFIG_JBD2_DEBUG 2343#ifdef CONFIG_JBD2_DEBUG
2326 atomic_inc(&nr_journal_heads); 2344 atomic_inc(&nr_journal_heads);
2327#endif 2345#endif
2328 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2346 ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2329 if (!ret) { 2347 if (!ret) {
2330 jbd_debug(1, "out of memory for journal_head\n"); 2348 jbd_debug(1, "out of memory for journal_head\n");
2331 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); 2349 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
2332 while (!ret) { 2350 while (!ret) {
2333 yield(); 2351 yield();
2334 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2352 ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2335 } 2353 }
2336 } 2354 }
2337 return ret; 2355 return ret;
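The switch to kmem_cache_zalloc() folds the zeroing into the allocation, which is why the explicit memset() disappears in the next hunk; as far as the slab API goes, the two spellings below should be equivalent:

        /* old: allocate, then zero by hand */
        jh = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
        if (jh)
                memset(jh, 0, sizeof(*jh));

        /* new: ask the allocator for zeroed memory up front */
        jh = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);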
@@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
2393 struct journal_head *new_jh = NULL; 2411 struct journal_head *new_jh = NULL;
2394 2412
2395repeat: 2413repeat:
2396 if (!buffer_jbd(bh)) { 2414 if (!buffer_jbd(bh))
2397 new_jh = journal_alloc_journal_head(); 2415 new_jh = journal_alloc_journal_head();
2398 memset(new_jh, 0, sizeof(*new_jh));
2399 }
2400 2416
2401 jbd_lock_bh_journal_head(bh); 2417 jbd_lock_bh_journal_head(bh);
2402 if (buffer_jbd(bh)) { 2418 if (buffer_jbd(bh)) {
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846bac32f..d4851464b57e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
400 void *buf, __u32 sequence) 400 void *buf, __u32 sequence)
401{ 401{
402 __u32 provided, calculated; 402 __u32 csum32;
403 403
404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
405 return 1; 405 return 1;
406 406
407 sequence = cpu_to_be32(sequence); 407 sequence = cpu_to_be32(sequence);
408 calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 408 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
409 sizeof(sequence)); 409 sizeof(sequence));
410 calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); 410 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
411 provided = be32_to_cpu(tag->t_checksum);
412 411
413 return provided == cpu_to_be32(calculated); 412 return tag->t_checksum == cpu_to_be16(csum32);
414} 413}
415 414
416static int do_one_pass(journal_t *journal, 415static int do_one_pass(journal_t *journal,
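Note the width change in the tag comparison: t_checksum is a 16-bit on-disk field, so the crc32c result is truncated to its low 16 bits and compared in big-endian form. A standalone model of that check (plain C, assuming a little-endian host):

        #include <stdint.h>

        static int tag_csum_matches(uint16_t t_checksum_be, uint32_t csum32)
        {
                uint16_t lo16 = (uint16_t)csum32;       /* truncate to 16 bits */
                uint16_t be16 = (uint16_t)((lo16 >> 8) | (lo16 << 8));

                return t_checksum_be == be16;           /* cpu_to_be16() on LE */
        }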
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b4ce8b..198c9c10276d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
122 122
123#ifdef __KERNEL__ 123#ifdef __KERNEL__
124static void write_one_revoke_record(journal_t *, transaction_t *, 124static void write_one_revoke_record(journal_t *, transaction_t *,
125 struct journal_head **, int *, 125 struct list_head *,
126 struct buffer_head **, int *,
126 struct jbd2_revoke_record_s *, int); 127 struct jbd2_revoke_record_s *, int);
127static void flush_descriptor(journal_t *, struct journal_head *, int, int); 128static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
128#endif 129#endif
129 130
130/* Utility functions to maintain the revoke table */ 131/* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
531 */ 532 */
532void jbd2_journal_write_revoke_records(journal_t *journal, 533void jbd2_journal_write_revoke_records(journal_t *journal,
533 transaction_t *transaction, 534 transaction_t *transaction,
535 struct list_head *log_bufs,
534 int write_op) 536 int write_op)
535{ 537{
536 struct journal_head *descriptor; 538 struct buffer_head *descriptor;
537 struct jbd2_revoke_record_s *record; 539 struct jbd2_revoke_record_s *record;
538 struct jbd2_revoke_table_s *revoke; 540 struct jbd2_revoke_table_s *revoke;
539 struct list_head *hash_list; 541 struct list_head *hash_list;
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
553 while (!list_empty(hash_list)) { 555 while (!list_empty(hash_list)) {
554 record = (struct jbd2_revoke_record_s *) 556 record = (struct jbd2_revoke_record_s *)
555 hash_list->next; 557 hash_list->next;
556 write_one_revoke_record(journal, transaction, 558 write_one_revoke_record(journal, transaction, log_bufs,
557 &descriptor, &offset, 559 &descriptor, &offset,
558 record, write_op); 560 record, write_op);
559 count++; 561 count++;
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
573 575
574static void write_one_revoke_record(journal_t *journal, 576static void write_one_revoke_record(journal_t *journal,
575 transaction_t *transaction, 577 transaction_t *transaction,
576 struct journal_head **descriptorp, 578 struct list_head *log_bufs,
579 struct buffer_head **descriptorp,
577 int *offsetp, 580 int *offsetp,
578 struct jbd2_revoke_record_s *record, 581 struct jbd2_revoke_record_s *record,
579 int write_op) 582 int write_op)
580{ 583{
581 int csum_size = 0; 584 int csum_size = 0;
582 struct journal_head *descriptor; 585 struct buffer_head *descriptor;
583 int offset; 586 int offset;
584 journal_header_t *header; 587 journal_header_t *header;
585 588
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
609 descriptor = jbd2_journal_get_descriptor_buffer(journal); 612 descriptor = jbd2_journal_get_descriptor_buffer(journal);
610 if (!descriptor) 613 if (!descriptor)
611 return; 614 return;
612 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; 615 header = (journal_header_t *)descriptor->b_data;
613 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 616 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
614 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); 617 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
615 header->h_sequence = cpu_to_be32(transaction->t_tid); 618 header->h_sequence = cpu_to_be32(transaction->t_tid);
616 619
617 /* Record it so that we can wait for IO completion later */ 620 /* Record it so that we can wait for IO completion later */
618 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); 621 BUFFER_TRACE(descriptor, "file in log_bufs");
619 jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); 622 jbd2_file_log_bh(log_bufs, descriptor);
620 623
621 offset = sizeof(jbd2_journal_revoke_header_t); 624 offset = sizeof(jbd2_journal_revoke_header_t);
622 *descriptorp = descriptor; 625 *descriptorp = descriptor;
623 } 626 }
624 627
625 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { 628 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
626 * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = 629 * ((__be64 *)(&descriptor->b_data[offset])) =
627 cpu_to_be64(record->blocknr); 630 cpu_to_be64(record->blocknr);
628 offset += 8; 631 offset += 8;
629 632
630 } else { 633 } else {
631 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = 634 * ((__be32 *)(&descriptor->b_data[offset])) =
632 cpu_to_be32(record->blocknr); 635 cpu_to_be32(record->blocknr);
633 offset += 4; 636 offset += 4;
634 } 637 }
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal,
636 *offsetp = offset; 639 *offsetp = offset;
637} 640}
638 641
639static void jbd2_revoke_csum_set(journal_t *j, 642static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
640 struct journal_head *descriptor)
641{ 643{
642 struct jbd2_journal_revoke_tail *tail; 644 struct jbd2_journal_revoke_tail *tail;
643 __u32 csum; 645 __u32 csum;
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j,
645 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 647 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
646 return; 648 return;
647 649
648 tail = (struct jbd2_journal_revoke_tail *) 650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
649 (jh2bh(descriptor)->b_data + j->j_blocksize -
650 sizeof(struct jbd2_journal_revoke_tail)); 651 sizeof(struct jbd2_journal_revoke_tail));
651 tail->r_checksum = 0; 652 tail->r_checksum = 0;
652 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 653 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
653 j->j_blocksize);
654 tail->r_checksum = cpu_to_be32(csum); 654 tail->r_checksum = cpu_to_be32(csum);
655} 655}
656 656
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j,
662 */ 662 */
663 663
664static void flush_descriptor(journal_t *journal, 664static void flush_descriptor(journal_t *journal,
665 struct journal_head *descriptor, 665 struct buffer_head *descriptor,
666 int offset, int write_op) 666 int offset, int write_op)
667{ 667{
668 jbd2_journal_revoke_header_t *header; 668 jbd2_journal_revoke_header_t *header;
669 struct buffer_head *bh = jh2bh(descriptor);
670 669
671 if (is_journal_aborted(journal)) { 670 if (is_journal_aborted(journal)) {
672 put_bh(bh); 671 put_bh(descriptor);
673 return; 672 return;
674 } 673 }
675 674
676 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; 675 header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
677 header->r_count = cpu_to_be32(offset); 676 header->r_count = cpu_to_be32(offset);
678 jbd2_revoke_csum_set(journal, descriptor); 677 jbd2_revoke_csum_set(journal, descriptor);
679 678
680 set_buffer_jwrite(bh); 679 set_buffer_jwrite(descriptor);
681 BUFFER_TRACE(bh, "write"); 680 BUFFER_TRACE(descriptor, "write");
682 set_buffer_dirty(bh); 681 set_buffer_dirty(descriptor);
683 write_dirty_buffer(bh, write_op); 682 write_dirty_buffer(descriptor, write_op);
684} 683}
685#endif 684#endif
686 685
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c59ea8..7aa9a32573bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
89 transaction->t_expires = jiffies + journal->j_commit_interval; 89 transaction->t_expires = jiffies + journal->j_commit_interval;
90 spin_lock_init(&transaction->t_handle_lock); 90 spin_lock_init(&transaction->t_handle_lock);
91 atomic_set(&transaction->t_updates, 0); 91 atomic_set(&transaction->t_updates, 0);
92 atomic_set(&transaction->t_outstanding_credits, 0); 92 atomic_set(&transaction->t_outstanding_credits,
93 atomic_read(&journal->j_reserved_credits));
93 atomic_set(&transaction->t_handle_count, 0); 94 atomic_set(&transaction->t_handle_count, 0);
94 INIT_LIST_HEAD(&transaction->t_inode_list); 95 INIT_LIST_HEAD(&transaction->t_inode_list);
95 INIT_LIST_HEAD(&transaction->t_private_list); 96 INIT_LIST_HEAD(&transaction->t_private_list);
@@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
141} 142}
142 143
143/* 144/*
145 * Wait until the running transaction passes the T_LOCKED state. Also starts
146 * the commit if needed. The function expects the running transaction to exist
147 * and releases j_state_lock.
148 */
149static void wait_transaction_locked(journal_t *journal)
150 __releases(journal->j_state_lock)
151{
152 DEFINE_WAIT(wait);
153 int need_to_start;
154 tid_t tid = journal->j_running_transaction->t_tid;
155
156 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
157 TASK_UNINTERRUPTIBLE);
158 need_to_start = !tid_geq(journal->j_commit_request, tid);
159 read_unlock(&journal->j_state_lock);
160 if (need_to_start)
161 jbd2_log_start_commit(journal, tid);
162 schedule();
163 finish_wait(&journal->j_wait_transaction_locked, &wait);
164}
165
166static void sub_reserved_credits(journal_t *journal, int blocks)
167{
168 atomic_sub(blocks, &journal->j_reserved_credits);
169 wake_up(&journal->j_wait_reserved);
170}
171
172/*
173 * Wait until we can add credits for handle to the running transaction. Called
174 * with j_state_lock held for reading. Returns 0 if handle joined the running
175 * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
176 * caller must retry.
177 */
178static int add_transaction_credits(journal_t *journal, int blocks,
179 int rsv_blocks)
180{
181 transaction_t *t = journal->j_running_transaction;
182 int needed;
183 int total = blocks + rsv_blocks;
184
185 /*
186 * If the current transaction is locked down for commit, wait
187 * for the lock to be released.
188 */
189 if (t->t_state == T_LOCKED) {
190 wait_transaction_locked(journal);
191 return 1;
192 }
193
194 /*
195 * If there is not enough space left in the log to write all
196 * potential buffers requested by this operation, we need to
197 * stall pending a log checkpoint to free some more log space.
198 */
199 needed = atomic_add_return(total, &t->t_outstanding_credits);
200 if (needed > journal->j_max_transaction_buffers) {
201 /*
202 * If the current transaction is already too large,
203 * then start to commit it: we can then go back and
204 * attach this handle to a new transaction.
205 */
206 atomic_sub(total, &t->t_outstanding_credits);
207 wait_transaction_locked(journal);
208 return 1;
209 }
210
211 /*
212 * The commit code assumes that it can get enough log space
213 * without forcing a checkpoint. This is *critical* for
214 * correctness: a checkpoint of a buffer which is also
215 * associated with a committing transaction creates a deadlock,
216 * so commit simply cannot force through checkpoints.
217 *
218 * We must therefore ensure the necessary space in the journal
219 * *before* starting to dirty potentially checkpointed buffers
220 * in the new transaction.
221 */
222 if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
223 atomic_sub(total, &t->t_outstanding_credits);
224 read_unlock(&journal->j_state_lock);
225 write_lock(&journal->j_state_lock);
226 if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
227 __jbd2_log_wait_for_space(journal);
228 write_unlock(&journal->j_state_lock);
229 return 1;
230 }
231
232 /* No reservation? We are done... */
233 if (!rsv_blocks)
234 return 0;
235
236 needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
237 /* We allow at most half of a transaction to be reserved */
238 if (needed > journal->j_max_transaction_buffers / 2) {
239 sub_reserved_credits(journal, rsv_blocks);
240 atomic_sub(total, &t->t_outstanding_credits);
241 read_unlock(&journal->j_state_lock);
242 wait_event(journal->j_wait_reserved,
243 atomic_read(&journal->j_reserved_credits) + rsv_blocks
244 <= journal->j_max_transaction_buffers / 2);
245 return 1;
246 }
247 return 0;
248}
249
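The checks above are plain integer arithmetic; a standalone model of the two credit limits (it leaves out the log-space and T_LOCKED cases, and the numbers are made up):

        #include <stdio.h>

        /* Can a handle with (blocks + rsv_blocks) credits join now? */
        static int can_join(int outstanding, int reserved, int blocks,
                            int rsv_blocks, int max_buffers)
        {
                if (outstanding + blocks + rsv_blocks > max_buffers)
                        return 0;       /* transaction too large, commit it */
                if (reserved + rsv_blocks > max_buffers / 2)
                        return 0;       /* at most half may be reservations */
                return 1;
        }

        int main(void)
        {
                printf("%d\n", can_join(8, 0, 4, 2, 32));   /* 1: fits */
                printf("%d\n", can_join(8, 15, 4, 2, 32));  /* 0: over the 1/2 cap */
                return 0;
        }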
250/*
144 * start_this_handle: Given a handle, deal with any locking or stalling 251 * start_this_handle: Given a handle, deal with any locking or stalling
145 * needed to make sure that there is enough journal space for the handle 252 * needed to make sure that there is enough journal space for the handle
146 * to begin. Attach the handle to a transaction and set up the 253 * to begin. Attach the handle to a transaction and set up the
@@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
151 gfp_t gfp_mask) 258 gfp_t gfp_mask)
152{ 259{
153 transaction_t *transaction, *new_transaction = NULL; 260 transaction_t *transaction, *new_transaction = NULL;
154 tid_t tid; 261 int blocks = handle->h_buffer_credits;
155 int needed, need_to_start; 262 int rsv_blocks = 0;
156 int nblocks = handle->h_buffer_credits;
157 unsigned long ts = jiffies; 263 unsigned long ts = jiffies;
158 264
159 if (nblocks > journal->j_max_transaction_buffers) { 265 /*
266 * 1/2 of a transaction can be reserved, so in practice a single
267 * operation can use at most 1/2 of the maximum transaction size
268 */
269 if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
160 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", 270 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
161 current->comm, nblocks, 271 current->comm, blocks,
162 journal->j_max_transaction_buffers); 272 journal->j_max_transaction_buffers / 2);
163 return -ENOSPC; 273 return -ENOSPC;
164 } 274 }
165 275
276 if (handle->h_rsv_handle)
277 rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
278
166alloc_transaction: 279alloc_transaction:
167 if (!journal->j_running_transaction) { 280 if (!journal->j_running_transaction) {
168 new_transaction = kmem_cache_zalloc(transaction_cache, 281 new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -199,8 +312,12 @@ repeat:
199 return -EROFS; 312 return -EROFS;
200 } 313 }
201 314
202 /* Wait on the journal's transaction barrier if necessary */ 315 /*
203 if (journal->j_barrier_count) { 316 * Wait on the journal's transaction barrier if necessary. Specifically,
317 * we allow reserved handles to proceed because otherwise commit could
318 * deadlock on page writeback not being able to complete.
319 */
320 if (!handle->h_reserved && journal->j_barrier_count) {
204 read_unlock(&journal->j_state_lock); 321 read_unlock(&journal->j_state_lock);
205 wait_event(journal->j_wait_transaction_locked, 322 wait_event(journal->j_wait_transaction_locked,
206 journal->j_barrier_count == 0); 323 journal->j_barrier_count == 0);
@@ -213,7 +330,7 @@ repeat:
213 goto alloc_transaction; 330 goto alloc_transaction;
214 write_lock(&journal->j_state_lock); 331 write_lock(&journal->j_state_lock);
215 if (!journal->j_running_transaction && 332 if (!journal->j_running_transaction &&
216 !journal->j_barrier_count) { 333 (handle->h_reserved || !journal->j_barrier_count)) {
217 jbd2_get_transaction(journal, new_transaction); 334 jbd2_get_transaction(journal, new_transaction);
218 new_transaction = NULL; 335 new_transaction = NULL;
219 } 336 }
@@ -223,85 +340,18 @@ repeat:
223 340
224 transaction = journal->j_running_transaction; 341 transaction = journal->j_running_transaction;
225 342
226 /* 343 if (!handle->h_reserved) {
227 * If the current transaction is locked down for commit, wait for the 344 /* We may have dropped j_state_lock - restart in that case */
228 * lock to be released. 345 if (add_transaction_credits(journal, blocks, rsv_blocks))
229 */ 346 goto repeat;
230 if (transaction->t_state == T_LOCKED) { 347 } else {
231 DEFINE_WAIT(wait);
232
233 prepare_to_wait(&journal->j_wait_transaction_locked,
234 &wait, TASK_UNINTERRUPTIBLE);
235 read_unlock(&journal->j_state_lock);
236 schedule();
237 finish_wait(&journal->j_wait_transaction_locked, &wait);
238 goto repeat;
239 }
240
241 /*
242 * If there is not enough space left in the log to write all potential
243 * buffers requested by this operation, we need to stall pending a log
244 * checkpoint to free some more log space.
245 */
246 needed = atomic_add_return(nblocks,
247 &transaction->t_outstanding_credits);
248
249 if (needed > journal->j_max_transaction_buffers) {
250 /* 348 /*
251 * If the current transaction is already too large, then start 349 * We have the handle reserved, so we are allowed to join a T_LOCKED
252 * to commit it: we can then go back and attach this handle to 350 * transaction and we don't have to check for transaction size
253 * a new transaction. 351 * or journal space.
254 */ 352 */
255 DEFINE_WAIT(wait); 353 sub_reserved_credits(journal, blocks);
256 354 handle->h_reserved = 0;
257 jbd_debug(2, "Handle %p starting new commit...\n", handle);
258 atomic_sub(nblocks, &transaction->t_outstanding_credits);
259 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
260 TASK_UNINTERRUPTIBLE);
261 tid = transaction->t_tid;
262 need_to_start = !tid_geq(journal->j_commit_request, tid);
263 read_unlock(&journal->j_state_lock);
264 if (need_to_start)
265 jbd2_log_start_commit(journal, tid);
266 schedule();
267 finish_wait(&journal->j_wait_transaction_locked, &wait);
268 goto repeat;
269 }
270
271 /*
272 * The commit code assumes that it can get enough log space
273 * without forcing a checkpoint. This is *critical* for
274 * correctness: a checkpoint of a buffer which is also
275 * associated with a committing transaction creates a deadlock,
276 * so commit simply cannot force through checkpoints.
277 *
278 * We must therefore ensure the necessary space in the journal
279 * *before* starting to dirty potentially checkpointed buffers
280 * in the new transaction.
281 *
282 * The worst part is, any transaction currently committing can
283 * reduce the free space arbitrarily. Be careful to account for
284 * those buffers when checkpointing.
285 */
286
287 /*
288 * @@@ AKPM: This seems rather over-defensive. We're giving commit
289 * a _lot_ of headroom: 1/4 of the journal plus the size of
290 * the committing transaction. Really, we only need to give it
291 * committing_transaction->t_outstanding_credits plus "enough" for
292 * the log control blocks.
293 * Also, this test is inconsistent with the matching one in
294 * jbd2_journal_extend().
295 */
296 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
297 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
298 atomic_sub(nblocks, &transaction->t_outstanding_credits);
299 read_unlock(&journal->j_state_lock);
300 write_lock(&journal->j_state_lock);
301 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
302 __jbd2_log_wait_for_space(journal);
303 write_unlock(&journal->j_state_lock);
304 goto repeat;
305 } 355 }
306 356
307 /* OK, account for the buffers that this operation expects to 357 /* OK, account for the buffers that this operation expects to
@@ -309,15 +359,16 @@ repeat:
309 */ 359 */
310 update_t_max_wait(transaction, ts); 360 update_t_max_wait(transaction, ts);
311 handle->h_transaction = transaction; 361 handle->h_transaction = transaction;
312 handle->h_requested_credits = nblocks; 362 handle->h_requested_credits = blocks;
313 handle->h_start_jiffies = jiffies; 363 handle->h_start_jiffies = jiffies;
314 atomic_inc(&transaction->t_updates); 364 atomic_inc(&transaction->t_updates);
315 atomic_inc(&transaction->t_handle_count); 365 atomic_inc(&transaction->t_handle_count);
316 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", 366 jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
317 handle, nblocks, 367 handle, blocks,
318 atomic_read(&transaction->t_outstanding_credits), 368 atomic_read(&transaction->t_outstanding_credits),
319 __jbd2_log_space_left(journal)); 369 jbd2_log_space_left(journal));
320 read_unlock(&journal->j_state_lock); 370 read_unlock(&journal->j_state_lock);
371 current->journal_info = handle;
321 372
322 lock_map_acquire(&handle->h_lockdep_map); 373 lock_map_acquire(&handle->h_lockdep_map);
323 jbd2_journal_free_transaction(new_transaction); 374 jbd2_journal_free_transaction(new_transaction);
@@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks)
348 * 399 *
349 * We make sure that the transaction can guarantee at least nblocks of 400 * We make sure that the transaction can guarantee at least nblocks of
350 * modified buffers in the log. We block until the log can guarantee 401 * modified buffers in the log. We block until the log can guarantee
351 * that much space. 402 * that much space. Additionally, if rsv_blocks > 0, we also create another
352 * 403 * handle with rsv_blocks reserved blocks in the journal. This handle is
353 * This function is visible to journal users (like ext3fs), so is not 404 * is stored in h_rsv_handle. It is not attached to any particular transaction
354 * called with the journal already locked. 405 * and thus doesn't block transaction commit. If the caller uses this reserved
406 * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
407 * on the parent handle will dispose of the reserved one. A reserved handle has
408 * to be converted to a normal handle using jbd2_journal_start_reserved() before
409 * it can be used.
355 * 410 *
356 * Return a pointer to a newly allocated handle, or an ERR_PTR() value 411 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
357 * on failure. 412 * on failure.
358 */ 413 */
359handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, 414handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
360 unsigned int type, unsigned int line_no) 415 gfp_t gfp_mask, unsigned int type,
416 unsigned int line_no)
361{ 417{
362 handle_t *handle = journal_current_handle(); 418 handle_t *handle = journal_current_handle();
363 int err; 419 int err;
@@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
374 handle = new_handle(nblocks); 430 handle = new_handle(nblocks);
375 if (!handle) 431 if (!handle)
376 return ERR_PTR(-ENOMEM); 432 return ERR_PTR(-ENOMEM);
433 if (rsv_blocks) {
434 handle_t *rsv_handle;
377 435
378 current->journal_info = handle; 436 rsv_handle = new_handle(rsv_blocks);
437 if (!rsv_handle) {
438 jbd2_free_handle(handle);
439 return ERR_PTR(-ENOMEM);
440 }
441 rsv_handle->h_reserved = 1;
442 rsv_handle->h_journal = journal;
443 handle->h_rsv_handle = rsv_handle;
444 }
379 445
380 err = start_this_handle(journal, handle, gfp_mask); 446 err = start_this_handle(journal, handle, gfp_mask);
381 if (err < 0) { 447 if (err < 0) {
448 if (handle->h_rsv_handle)
449 jbd2_free_handle(handle->h_rsv_handle);
382 jbd2_free_handle(handle); 450 jbd2_free_handle(handle);
383 current->journal_info = NULL;
384 return ERR_PTR(err); 451 return ERR_PTR(err);
385 } 452 }
386 handle->h_type = type; 453 handle->h_type = type;
@@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start);
395 462
396handle_t *jbd2_journal_start(journal_t *journal, int nblocks) 463handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
397{ 464{
398 return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); 465 return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
399} 466}
400EXPORT_SYMBOL(jbd2_journal_start); 467EXPORT_SYMBOL(jbd2_journal_start);
401 468
469void jbd2_journal_free_reserved(handle_t *handle)
470{
471 journal_t *journal = handle->h_journal;
472
473 WARN_ON(!handle->h_reserved);
474 sub_reserved_credits(journal, handle->h_buffer_credits);
475 jbd2_free_handle(handle);
476}
477EXPORT_SYMBOL(jbd2_journal_free_reserved);
478
479/**
480 * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
481 * @handle: handle to start
482 *
483 * Start handle that has been previously reserved with jbd2_journal_reserve().
484 * This attaches @handle to the running transaction (or creates one if no
485 * transaction is running). Unlike jbd2_journal_start() this function cannot
486 * block on journal commit, checkpointing, or similar stuff. It can block on
487 * memory allocation or a frozen journal, though.
488 *
489 * Return 0 on success, non-zero on error; the handle is freed in that case.
490 */
491int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
492 unsigned int line_no)
493{
494 journal_t *journal = handle->h_journal;
495 int ret = -EIO;
496
497 if (WARN_ON(!handle->h_reserved)) {
498 /* Someone passed in normal handle? Just stop it. */
499 jbd2_journal_stop(handle);
500 return ret;
501 }
502 /*
503 * The usefulness of mixing reserved and unreserved handles is
504 * questionable, and so far nobody seems to need it, so just error out.
505 */
506 if (WARN_ON(current->journal_info)) {
507 jbd2_journal_free_reserved(handle);
508 return ret;
509 }
510
511 handle->h_journal = NULL;
512 /*
513 * GFP_NOFS is here because callers are likely from writeback or
514 * similarly constrained call sites.
515 */
516 ret = start_this_handle(journal, handle, GFP_NOFS);
517 if (ret < 0)
518 jbd2_journal_free_reserved(handle);
519 handle->h_type = type;
520 handle->h_line_no = line_no;
521 return ret;
522}
523EXPORT_SYMBOL(jbd2_journal_start_reserved);
402 524
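Putting the new calls together, a hypothetical user of the reservation API might look like this (a sketch based on the comments above; the credit counts are arbitrary):

        handle_t *handle, *rsv;

        /* start a 4-credit handle and reserve 2 more credits for later */
        handle = jbd2__journal_start(journal, 4, 2, GFP_NOFS, 0, 0);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        /* detach the reservation, else jbd2_journal_stop() frees it */
        rsv = handle->h_rsv_handle;
        handle->h_rsv_handle = NULL;
        jbd2_journal_stop(handle);

        /* later, e.g. from writeback: convert and use the reservation */
        if (jbd2_journal_start_reserved(rsv, 0, 0) == 0)
                jbd2_journal_stop(rsv);
        /* on error the reserved handle has already been freed */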
403/** 525/**
404 * int jbd2_journal_extend() - extend buffer credits. 526 * int jbd2_journal_extend() - extend buffer credits.
@@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
423int jbd2_journal_extend(handle_t *handle, int nblocks) 545int jbd2_journal_extend(handle_t *handle, int nblocks)
424{ 546{
425 transaction_t *transaction = handle->h_transaction; 547 transaction_t *transaction = handle->h_transaction;
426 journal_t *journal = transaction->t_journal; 548 journal_t *journal;
427 int result; 549 int result;
428 int wanted; 550 int wanted;
429 551
430 result = -EIO; 552 WARN_ON(!transaction);
431 if (is_handle_aborted(handle)) 553 if (is_handle_aborted(handle))
432 goto out; 554 return -EROFS;
555 journal = transaction->t_journal;
433 556
434 result = 1; 557 result = 1;
435 558
436 read_lock(&journal->j_state_lock); 559 read_lock(&journal->j_state_lock);
437 560
438 /* Don't extend a locked-down transaction! */ 561 /* Don't extend a locked-down transaction! */
439 if (handle->h_transaction->t_state != T_RUNNING) { 562 if (transaction->t_state != T_RUNNING) {
440 jbd_debug(3, "denied handle %p %d blocks: " 563 jbd_debug(3, "denied handle %p %d blocks: "
441 "transaction not running\n", handle, nblocks); 564 "transaction not running\n", handle, nblocks);
442 goto error_out; 565 goto error_out;
443 } 566 }
444 567
445 spin_lock(&transaction->t_handle_lock); 568 spin_lock(&transaction->t_handle_lock);
446 wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; 569 wanted = atomic_add_return(nblocks,
570 &transaction->t_outstanding_credits);
447 571
448 if (wanted > journal->j_max_transaction_buffers) { 572 if (wanted > journal->j_max_transaction_buffers) {
449 jbd_debug(3, "denied handle %p %d blocks: " 573 jbd_debug(3, "denied handle %p %d blocks: "
450 "transaction too large\n", handle, nblocks); 574 "transaction too large\n", handle, nblocks);
575 atomic_sub(nblocks, &transaction->t_outstanding_credits);
451 goto unlock; 576 goto unlock;
452 } 577 }
453 578
454 if (wanted > __jbd2_log_space_left(journal)) { 579 if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
580 jbd2_log_space_left(journal)) {
455 jbd_debug(3, "denied handle %p %d blocks: " 581 jbd_debug(3, "denied handle %p %d blocks: "
456 "insufficient log space\n", handle, nblocks); 582 "insufficient log space\n", handle, nblocks);
583 atomic_sub(nblocks, &transaction->t_outstanding_credits);
457 goto unlock; 584 goto unlock;
458 } 585 }
459 586
460 trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, 587 trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
461 handle->h_transaction->t_tid, 588 transaction->t_tid,
462 handle->h_type, handle->h_line_no, 589 handle->h_type, handle->h_line_no,
463 handle->h_buffer_credits, 590 handle->h_buffer_credits,
464 nblocks); 591 nblocks);
465 592
466 handle->h_buffer_credits += nblocks; 593 handle->h_buffer_credits += nblocks;
467 handle->h_requested_credits += nblocks; 594 handle->h_requested_credits += nblocks;
468 atomic_add(nblocks, &transaction->t_outstanding_credits);
469 result = 0; 595 result = 0;
470 596
471 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); 597 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -473,7 +599,6 @@ unlock:
473 spin_unlock(&transaction->t_handle_lock); 599 spin_unlock(&transaction->t_handle_lock);
474error_out: 600error_out:
475 read_unlock(&journal->j_state_lock); 601 read_unlock(&journal->j_state_lock);
476out:
477 return result; 602 return result;
478} 603}
479 604
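The new space test in jbd2_journal_extend() charges an estimate for descriptor/control blocks on top of the requested data blocks. Assuming JBD2_CONTROL_BLOCKS_SHIFT is 5 (the constant lives in the jbd2 headers, not in this hunk), the check reduces to:

        /* illustrative only; the shift value is an assumption */
        #define JBD2_CONTROL_BLOCKS_SHIFT 5

        static int would_overcommit(int wanted, unsigned long space_left)
        {
                /* data blocks plus ~1/32 overhead for control blocks */
                return wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
                       space_left;
        }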
@@ -490,19 +615,22 @@ out:
490 * to a running handle, a call to jbd2_journal_restart will commit the 615 * to a running handle, a call to jbd2_journal_restart will commit the
491 * handle's transaction so far and reattach the handle to a new 616 * transaction capable of guaranteeing the requested number of
492 * transaction capable of guaranteeing the requested number of 617 * credits. We preserve the reserved handle if one is attached to the
493 * credits. 618 * passed-in handle.
619 * passed in handle.
494 */ 620 */
495int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) 621int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
496{ 622{
497 transaction_t *transaction = handle->h_transaction; 623 transaction_t *transaction = handle->h_transaction;
498 journal_t *journal = transaction->t_journal; 624 journal_t *journal;
499 tid_t tid; 625 tid_t tid;
500 int need_to_start, ret; 626 int need_to_start, ret;
501 627
628 WARN_ON(!transaction);
502 /* If we've had an abort of any type, don't even think about 629 /* If we've had an abort of any type, don't even think about
503 * actually doing the restart! */ 630 * actually doing the restart! */
504 if (is_handle_aborted(handle)) 631 if (is_handle_aborted(handle))
505 return 0; 632 return 0;
633 journal = transaction->t_journal;
506 634
507 /* 635 /*
508 * First unlink the handle from its current transaction, and start the 636 * First unlink the handle from its current transaction, and start the
@@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
515 spin_lock(&transaction->t_handle_lock); 643 spin_lock(&transaction->t_handle_lock);
516 atomic_sub(handle->h_buffer_credits, 644 atomic_sub(handle->h_buffer_credits,
517 &transaction->t_outstanding_credits); 645 &transaction->t_outstanding_credits);
646 if (handle->h_rsv_handle) {
647 sub_reserved_credits(journal,
648 handle->h_rsv_handle->h_buffer_credits);
649 }
518 if (atomic_dec_and_test(&transaction->t_updates)) 650 if (atomic_dec_and_test(&transaction->t_updates))
519 wake_up(&journal->j_wait_updates); 651 wake_up(&journal->j_wait_updates);
652 tid = transaction->t_tid;
520 spin_unlock(&transaction->t_handle_lock); 653 spin_unlock(&transaction->t_handle_lock);
654 handle->h_transaction = NULL;
655 current->journal_info = NULL;
521 656
522 jbd_debug(2, "restarting handle %p\n", handle); 657 jbd_debug(2, "restarting handle %p\n", handle);
523 tid = transaction->t_tid;
524 need_to_start = !tid_geq(journal->j_commit_request, tid); 658 need_to_start = !tid_geq(journal->j_commit_request, tid);
525 read_unlock(&journal->j_state_lock); 659 read_unlock(&journal->j_state_lock);
526 if (need_to_start) 660 if (need_to_start)
@@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
557 write_lock(&journal->j_state_lock); 691 write_lock(&journal->j_state_lock);
558 ++journal->j_barrier_count; 692 ++journal->j_barrier_count;
559 693
694 /* Wait until there are no reserved handles */
695 if (atomic_read(&journal->j_reserved_credits)) {
696 write_unlock(&journal->j_state_lock);
697 wait_event(journal->j_wait_reserved,
698 atomic_read(&journal->j_reserved_credits) == 0);
699 write_lock(&journal->j_state_lock);
700 }
701
560 /* Wait until there are no running updates */ 702 /* Wait until there are no running updates */
561 while (1) { 703 while (1) {
562 transaction_t *transaction = journal->j_running_transaction; 704 transaction_t *transaction = journal->j_running_transaction;
@@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
619 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 761 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
620} 762}
621 763
764static int sleep_on_shadow_bh(void *word)
765{
766 io_schedule();
767 return 0;
768}
769
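sleep_on_shadow_bh() is the action callback wait_on_bit() runs once it has decided to sleep; the waiter parks in io_schedule() until BH_Shadow is cleared and the bit waitqueue is woken. The wake side is not in this hunk, but presumably the commit path pairs with it roughly like so:

        /* sketch of the waker, once journal IO on the shadow buffer is done */
        clear_buffer_shadow(bh);
        wake_up_bit(&bh->b_state, BH_Shadow);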
622/* 770/*
623 * If the buffer is already part of the current transaction, then there 771 * If the buffer is already part of the current transaction, then there
624 * is nothing we need to do. If it is already part of a prior 772 * is nothing we need to do. If it is already part of a prior
@@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
634 int force_copy) 782 int force_copy)
635{ 783{
636 struct buffer_head *bh; 784 struct buffer_head *bh;
637 transaction_t *transaction; 785 transaction_t *transaction = handle->h_transaction;
638 journal_t *journal; 786 journal_t *journal;
639 int error; 787 int error;
640 char *frozen_buffer = NULL; 788 char *frozen_buffer = NULL;
641 int need_copy = 0; 789 int need_copy = 0;
642 unsigned long start_lock, time_lock; 790 unsigned long start_lock, time_lock;
643 791
792 WARN_ON(!transaction);
644 if (is_handle_aborted(handle)) 793 if (is_handle_aborted(handle))
645 return -EROFS; 794 return -EROFS;
646
647 transaction = handle->h_transaction;
648 journal = transaction->t_journal; 795 journal = transaction->t_journal;
649 796
650 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); 797 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -754,41 +901,29 @@ repeat:
754 * journaled. If the primary copy is already going to 901 * journaled. If the primary copy is already going to
755 * disk then we cannot do copy-out here. */ 902 * disk then we cannot do copy-out here. */
756 903
757 if (jh->b_jlist == BJ_Shadow) { 904 if (buffer_shadow(bh)) {
758 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
759 wait_queue_head_t *wqh;
760
761 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
762
763 JBUFFER_TRACE(jh, "on shadow: sleep"); 905 JBUFFER_TRACE(jh, "on shadow: sleep");
764 jbd_unlock_bh_state(bh); 906 jbd_unlock_bh_state(bh);
765 /* commit wakes up all shadow buffers after IO */ 907 wait_on_bit(&bh->b_state, BH_Shadow,
766 for ( ; ; ) { 908 sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
767 prepare_to_wait(wqh, &wait.wait,
768 TASK_UNINTERRUPTIBLE);
769 if (jh->b_jlist != BJ_Shadow)
770 break;
771 schedule();
772 }
773 finish_wait(wqh, &wait.wait);
774 goto repeat; 909 goto repeat;
775 } 910 }
776 911
777 /* Only do the copy if the currently-owning transaction 912 /*
778 * still needs it. If it is on the Forget list, the 913 * Only do the copy if the currently-owning transaction still
779 * committing transaction is past that stage. The 914 * needs it. If buffer isn't on BJ_Metadata list, the
780 * buffer had better remain locked during the kmalloc, 915 * committing transaction is past that stage (here we use the
781 * but that should be true --- we hold the journal lock 916 * fact that BH_Shadow is set under bh_state lock together with
782 * still and the buffer is already on the BUF_JOURNAL 917 * refiling to BJ_Shadow list and at this point we know the
783 * list so won't be flushed. 918 * buffer doesn't have BH_Shadow set).
784 * 919 *
785 * Subtle point, though: if this is a get_undo_access, 920 * Subtle point, though: if this is a get_undo_access,
786 * then we will be relying on the frozen_data to contain 921 * then we will be relying on the frozen_data to contain
787 * the new value of the committed_data record after the 922 * the new value of the committed_data record after the
788 * transaction, so we HAVE to force the frozen_data copy 923 * transaction, so we HAVE to force the frozen_data copy
789 * in that case. */ 924 * in that case.
790 925 */
791 if (jh->b_jlist != BJ_Forget || force_copy) { 926 if (jh->b_jlist == BJ_Metadata || force_copy) {
792 JBUFFER_TRACE(jh, "generate frozen data"); 927 JBUFFER_TRACE(jh, "generate frozen data");
793 if (!frozen_buffer) { 928 if (!frozen_buffer) {
794 JBUFFER_TRACE(jh, "allocate memory for buffer"); 929 JBUFFER_TRACE(jh, "allocate memory for buffer");
@@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
915int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) 1050int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
916{ 1051{
917 transaction_t *transaction = handle->h_transaction; 1052 transaction_t *transaction = handle->h_transaction;
918 journal_t *journal = transaction->t_journal; 1053 journal_t *journal;
919 struct journal_head *jh = jbd2_journal_add_journal_head(bh); 1054 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
920 int err; 1055 int err;
921 1056
922 jbd_debug(5, "journal_head %p\n", jh); 1057 jbd_debug(5, "journal_head %p\n", jh);
1058 WARN_ON(!transaction);
923 err = -EROFS; 1059 err = -EROFS;
924 if (is_handle_aborted(handle)) 1060 if (is_handle_aborted(handle))
925 goto out; 1061 goto out;
1062 journal = transaction->t_journal;
926 err = 0; 1063 err = 0;
927 1064
928 JBUFFER_TRACE(jh, "entry"); 1065 JBUFFER_TRACE(jh, "entry");
@@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
1128int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) 1265int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1129{ 1266{
1130 transaction_t *transaction = handle->h_transaction; 1267 transaction_t *transaction = handle->h_transaction;
1131 journal_t *journal = transaction->t_journal; 1268 journal_t *journal;
1132 struct journal_head *jh; 1269 struct journal_head *jh;
1133 int ret = 0; 1270 int ret = 0;
1134 1271
1272 WARN_ON(!transaction);
1135 if (is_handle_aborted(handle)) 1273 if (is_handle_aborted(handle))
1136 goto out; 1274 return -EROFS;
1275 journal = transaction->t_journal;
1137 jh = jbd2_journal_grab_journal_head(bh); 1276 jh = jbd2_journal_grab_journal_head(bh);
1138 if (!jh) { 1277 if (!jh) {
1139 ret = -EUCLEAN; 1278 ret = -EUCLEAN;
@@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1227 1366
1228 JBUFFER_TRACE(jh, "file as BJ_Metadata"); 1367 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1229 spin_lock(&journal->j_list_lock); 1368 spin_lock(&journal->j_list_lock);
1230 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); 1369 __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
1231 spin_unlock(&journal->j_list_lock); 1370 spin_unlock(&journal->j_list_lock);
1232out_unlock_bh: 1371out_unlock_bh:
1233 jbd_unlock_bh_state(bh); 1372 jbd_unlock_bh_state(bh);
@@ -1258,12 +1397,17 @@ out:
1258int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) 1397int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1259{ 1398{
1260 transaction_t *transaction = handle->h_transaction; 1399 transaction_t *transaction = handle->h_transaction;
1261 journal_t *journal = transaction->t_journal; 1400 journal_t *journal;
1262 struct journal_head *jh; 1401 struct journal_head *jh;
1263 int drop_reserve = 0; 1402 int drop_reserve = 0;
1264 int err = 0; 1403 int err = 0;
1265 int was_modified = 0; 1404 int was_modified = 0;
1266 1405
1406 WARN_ON(!transaction);
1407 if (is_handle_aborted(handle))
1408 return -EROFS;
1409 journal = transaction->t_journal;
1410
1267 BUFFER_TRACE(bh, "entry"); 1411 BUFFER_TRACE(bh, "entry");
1268 1412
1269 jbd_lock_bh_state(bh); 1413 jbd_lock_bh_state(bh);
@@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1290 */ 1434 */
1291 jh->b_modified = 0; 1435 jh->b_modified = 0;
1292 1436
1293 if (jh->b_transaction == handle->h_transaction) { 1437 if (jh->b_transaction == transaction) {
1294 J_ASSERT_JH(jh, !jh->b_frozen_data); 1438 J_ASSERT_JH(jh, !jh->b_frozen_data);
1295 1439
1296 /* If we are forgetting a buffer which is already part 1440 /* If we are forgetting a buffer which is already part
@@ -1385,19 +1529,21 @@ drop:
1385int jbd2_journal_stop(handle_t *handle) 1529int jbd2_journal_stop(handle_t *handle)
1386{ 1530{
1387 transaction_t *transaction = handle->h_transaction; 1531 transaction_t *transaction = handle->h_transaction;
1388 journal_t *journal = transaction->t_journal; 1532 journal_t *journal;
1389 int err, wait_for_commit = 0; 1533 int err = 0, wait_for_commit = 0;
1390 tid_t tid; 1534 tid_t tid;
1391 pid_t pid; 1535 pid_t pid;
1392 1536
1537 if (!transaction)
1538 goto free_and_exit;
1539 journal = transaction->t_journal;
1540
1393 J_ASSERT(journal_current_handle() == handle); 1541 J_ASSERT(journal_current_handle() == handle);
1394 1542
1395 if (is_handle_aborted(handle)) 1543 if (is_handle_aborted(handle))
1396 err = -EIO; 1544 err = -EIO;
1397 else { 1545 else
1398 J_ASSERT(atomic_read(&transaction->t_updates) > 0); 1546 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
1399 err = 0;
1400 }
1401 1547
1402 if (--handle->h_ref > 0) { 1548 if (--handle->h_ref > 0) {
1403 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, 1549 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle)
1407 1553
1408 jbd_debug(4, "Handle %p going down\n", handle); 1554 jbd_debug(4, "Handle %p going down\n", handle);
1409 trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, 1555 trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
1410 handle->h_transaction->t_tid, 1556 transaction->t_tid,
1411 handle->h_type, handle->h_line_no, 1557 handle->h_type, handle->h_line_no,
1412 jiffies - handle->h_start_jiffies, 1558 jiffies - handle->h_start_jiffies,
1413 handle->h_sync, handle->h_requested_credits, 1559 handle->h_sync, handle->h_requested_credits,
@@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle)
1518 1664
1519 lock_map_release(&handle->h_lockdep_map); 1665 lock_map_release(&handle->h_lockdep_map);
1520 1666
1667 if (handle->h_rsv_handle)
1668 jbd2_journal_free_reserved(handle->h_rsv_handle);
1669free_and_exit:
1521 jbd2_free_handle(handle); 1670 jbd2_free_handle(handle);
1522 return err; 1671 return err;
1523} 1672}
1524 1673
1525/**
1526 * int jbd2_journal_force_commit() - force any uncommitted transactions
1527 * @journal: journal to force
1528 *
1529 * For synchronous operations: force any uncommitted transactions
1530 * to disk. May seem kludgy, but it reuses all the handle batching
1531 * code in a very simple manner.
1532 */
1533int jbd2_journal_force_commit(journal_t *journal)
1534{
1535 handle_t *handle;
1536 int ret;
1537
1538 handle = jbd2_journal_start(journal, 1);
1539 if (IS_ERR(handle)) {
1540 ret = PTR_ERR(handle);
1541 } else {
1542 handle->h_sync = 1;
1543 ret = jbd2_journal_stop(handle);
1544 }
1545 return ret;
1546}
1547
1548/* 1674/*
1549 * 1675 *
1550 * List management code snippets: various functions for manipulating the 1676 * List management code snippets: various functions for manipulating the
@@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1601 * Remove a buffer from the appropriate transaction list. 1727 * Remove a buffer from the appropriate transaction list.
1602 * 1728 *
1603 * Note that this function can *change* the value of 1729 * Note that this function can *change* the value of
1604 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, 1730 * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
1605 * t_log_list or t_reserved_list. If the caller is holding onto a copy of one 1731 * t_reserved_list. If the caller is holding onto a copy of one of these
1606 * of these pointers, it could go bad. Generally the caller needs to re-read 1732 * pointers, it could go bad. Generally the caller needs to re-read the
1607 * the pointer from the transaction_t. 1733 * pointer from the transaction_t.
1608 * 1734 *
1609 * Called under j_list_lock. 1735 * Called under j_list_lock.
1610 */ 1736 */
@@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 	case BJ_Forget:
 		list = &transaction->t_forget;
 		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
 	case BJ_Shadow:
 		list = &transaction->t_shadow_list;
 		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
-		break;
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
@@ -2034,18 +2154,23 @@ zap_buffer_unlocked:
  * void jbd2_journal_invalidatepage()
  * @journal: journal to use for flush...
  * @page:    page to flush
- * @offset:  length of page to invalidate.
+ * @offset:  start of the range to invalidate
+ * @length:  length of the range to invalidate
  *
- * Reap page buffers containing data after offset in page. Can return -EBUSY
- * if buffers are part of the committing transaction and the page is straddling
- * i_size. Caller then has to wait for current commit and try again.
+ * Reap page buffers containing data after in the specified range in page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
  */
 int jbd2_journal_invalidatepage(journal_t *journal,
 				struct page *page,
-				unsigned long offset)
+				unsigned int offset,
+				unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
+	unsigned int stop = offset + length;
 	unsigned int curr_off = 0;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int may_free = 1;
 	int ret = 0;
 
@@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
 	if (!page_has_buffers(page))
 		return 0;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	/* We will potentially be playing with lists other than just the
 	 * data lists (especially for journaled data mode), so be
 	 * cautious in our locking. */
@@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal,
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;
 
+		if (next_off > stop)
+			return 0;
+
 		if (offset <= curr_off) {
 			/* This block is wholly outside the truncation point */
 			lock_buffer(bh);
-			ret = journal_unmap_buffer(journal, bh, offset > 0);
+			ret = journal_unmap_buffer(journal, bh, partial_page);
 			unlock_buffer(bh);
 			if (ret < 0)
 				return ret;
@@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
 
 	} while (bh != head);
 
-	if (!offset) {
+	if (!partial_page) {
 		if (may_free && try_to_free_buffers(page))
 			J_ASSERT(!page_has_buffers(page));
 	}
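With the new (offset, length) signature only buffers inside the given range are reaped, and the partial_page flag replaces the old `offset > 0` test. A hedged sketch of how a filesystem's ->invalidatepage hook might forward both values under this API; example_journal() is a made-up accessor, not a real helper:

static void sketch_invalidatepage(struct page *page, unsigned int offset,
				  unsigned int length)
{
	journal_t *journal = example_journal(page->mapping->host);

	/* Can return -EBUSY if a committing transaction still owns a
	 * buffer and the page straddles i_size; callers wait and retry. */
	jbd2_journal_invalidatepage(journal, page, offset, length);
}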
@@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 	case BJ_Forget:
 		list = &transaction->t_forget;
 		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
 	case BJ_Shadow:
 		list = &transaction->t_shadow_list;
 		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
-		break;
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
@@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		return -EIO;
+		return -EROFS;
+	journal = transaction->t_journal;
 
 	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
 			transaction->t_tid);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index acd46a4160cb..e3aac222472e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -22,7 +22,7 @@
 #include <linux/time.h>
 #include "nodelist.h"
 
-static int jffs2_readdir (struct file *, void *, filldir_t);
+static int jffs2_readdir (struct file *, struct dir_context *);
 
 static int jffs2_create (struct inode *,struct dentry *,umode_t,
 			 bool);
@@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *,
 const struct file_operations jffs2_dir_operations =
 {
 	.read =		generic_read_dir,
-	.readdir =	jffs2_readdir,
+	.iterate =	jffs2_readdir,
 	.unlocked_ioctl=jffs2_ioctl,
 	.fsync =	jffs2_fsync,
 	.llseek =	generic_file_llseek,
@@ -114,60 +114,40 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
 /***********************************************************************/
 
 
-static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int jffs2_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct jffs2_inode_info *f;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct jffs2_full_dirent *fd;
-	unsigned long offset, curofs;
+	unsigned long curofs = 1;
 
-	jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n",
-		  file_inode(filp)->i_ino);
+	jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino);
 
-	f = JFFS2_INODE_INFO(inode);
-
-	offset = filp->f_pos;
-
-	if (offset == 0) {
-		jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino);
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
-			goto out;
-		offset++;
-	}
-	if (offset == 1) {
-		unsigned long pino = parent_ino(filp->f_path.dentry);
-		jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino);
-		if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0)
-			goto out;
-		offset++;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
 
-	curofs=1;
 	mutex_lock(&f->sem);
 	for (fd = f->dents; fd; fd = fd->next) {
-
 		curofs++;
-		/* First loop: curofs = 2; offset = 2 */
-		if (curofs < offset) {
+		/* First loop: curofs = 2; pos = 2 */
+		if (curofs < ctx->pos) {
 			jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
-				  fd->name, fd->ino, fd->type, curofs, offset);
+				  fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos);
 			continue;
 		}
 		if (!fd->ino) {
 			jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n",
 				  fd->name);
-			offset++;
+			ctx->pos++;
 			continue;
 		}
 		jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n",
-			  offset, fd->name, fd->ino, fd->type);
-		if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0)
+			  (unsigned long)ctx->pos, fd->name, fd->ino, fd->type);
+		if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type))
 			break;
-		offset++;
+		ctx->pos++;
 	}
 	mutex_unlock(&f->sem);
- out:
-	filp->f_pos = offset;
 	return 0;
 }
 
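The conversion above is the pattern every readdir in this series follows: emit the dot entries, call dir_emit() per entry, advance ctx->pos only after a successful emit, and return 0 when the callback's buffer fills so the next call resumes at ctx->pos. A minimal sketch of the shape, with hypothetical example_* helpers standing in for a real directory walk:

/* Sketch of the new ->iterate contract; example_* names are made up. */
static int example_readdir(struct file *file, struct dir_context *ctx)
{
	struct example_entry *e;

	if (!dir_emit_dots(file, ctx))
		return 0;

	for (e = example_first(file, ctx->pos); e; e = example_next(e)) {
		if (!dir_emit(ctx, e->name, strlen(e->name),
			      e->ino, e->type))
			return 0;	/* buffer full: resume at ctx->pos */
		ctx->pos++;
	}
	return 0;
}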
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0ddbeceafc62..9f4ed13d9f15 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -3002,9 +3002,9 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
  * return: offset = (pn, index) of start entry
  *	of next jfs_readdir()/dtRead()
  */
-int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int jfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *ip = file_inode(filp);
+	struct inode *ip = file_inode(file);
 	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
 	int rc = 0;
 	loff_t dtpos;	/* legacy OS/2 style position */
@@ -3033,7 +3033,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int overflow, fix_page, page_fixed = 0;
 	static int unique_pos = 2;	/* If we can't fix broken index */
 
-	if (filp->f_pos == DIREND)
+	if (ctx->pos == DIREND)
 		return 0;
 
 	if (DO_INDEX(ip)) {
@@ -3045,7 +3045,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		 */
 		do_index = 1;
 
-		dir_index = (u32) filp->f_pos;
+		dir_index = (u32) ctx->pos;
 
 		if (dir_index > 1) {
 			struct dir_table_slot dirtab_slot;
@@ -3053,25 +3053,25 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (dtEmpty(ip) ||
 			    (dir_index >= JFS_IP(ip)->next_index)) {
 				/* Stale position. Directory has shrunk */
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return 0;
 			}
 	      repeat:
 			rc = read_index(ip, dir_index, &dirtab_slot);
 			if (rc) {
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return rc;
 			}
 			if (dirtab_slot.flag == DIR_INDEX_FREE) {
 				if (loop_count++ > JFS_IP(ip)->next_index) {
 					jfs_err("jfs_readdir detected "
 						"infinite loop!");
-					filp->f_pos = DIREND;
+					ctx->pos = DIREND;
 					return 0;
 				}
 				dir_index = le32_to_cpu(dirtab_slot.addr2);
 				if (dir_index == -1) {
-					filp->f_pos = DIREND;
+					ctx->pos = DIREND;
 					return 0;
 				}
 				goto repeat;
@@ -3080,13 +3080,13 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			index = dirtab_slot.slot;
 			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
 			if (rc) {
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return 0;
 			}
 			if (p->header.flag & BT_INTERNAL) {
 				jfs_err("jfs_readdir: bad index table");
 				DT_PUTPAGE(mp);
-				filp->f_pos = -1;
+				ctx->pos = -1;
 				return 0;
 			}
 		} else {
@@ -3094,23 +3094,22 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			/*
 			 * self "."
 			 */
-			filp->f_pos = 0;
-			if (filldir(dirent, ".", 1, 0, ip->i_ino,
-				    DT_DIR))
+			ctx->pos = 0;
+			if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
 				return 0;
 		}
 		/*
 		 * parent ".."
 		 */
-		filp->f_pos = 1;
-		if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
+		ctx->pos = 1;
+		if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
 			return 0;
 
 		/*
 		 * Find first entry of left-most leaf
 		 */
 		if (dtEmpty(ip)) {
-			filp->f_pos = DIREND;
+			ctx->pos = DIREND;
 			return 0;
 		}
 
@@ -3128,23 +3127,19 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	 * pn > 0: Real entries, pn=1 -> leftmost page
 	 * pn = index = -1: No more entries
 	 */
-	dtpos = filp->f_pos;
+	dtpos = ctx->pos;
 	if (dtpos == 0) {
 		/* build "." entry */
-
-		if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
-			    DT_DIR))
+		if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
 			return 0;
 		dtoffset->index = 1;
-		filp->f_pos = dtpos;
+		ctx->pos = dtpos;
 	}
 
 	if (dtoffset->pn == 0) {
 		if (dtoffset->index == 1) {
 			/* build ".." entry */
-
-			if (filldir(dirent, "..", 2, filp->f_pos,
-				    PARENT(ip), DT_DIR))
+			if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
 				return 0;
 		} else {
 			jfs_err("jfs_readdir called with "
@@ -3152,18 +3147,18 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		dtoffset->pn = 1;
 		dtoffset->index = 0;
-		filp->f_pos = dtpos;
+		ctx->pos = dtpos;
 	}
 
 	if (dtEmpty(ip)) {
-		filp->f_pos = DIREND;
+		ctx->pos = DIREND;
 		return 0;
 	}
 
-	if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
+	if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
 		jfs_err("jfs_readdir: unexpected rc = %d "
 			"from dtReadNext", rc);
-		filp->f_pos = DIREND;
+		ctx->pos = DIREND;
 		return 0;
 	}
 	/* get start leaf page and index */
@@ -3171,7 +3166,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	/* offset beyond directory eof ? */
 	if (bn < 0) {
-		filp->f_pos = DIREND;
+		ctx->pos = DIREND;
 		return 0;
 	}
 }
@@ -3180,7 +3175,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (dirent_buf == 0) {
 		DT_PUTPAGE(mp);
 		jfs_warn("jfs_readdir: __get_free_page failed!");
-		filp->f_pos = DIREND;
+		ctx->pos = DIREND;
 		return -ENOMEM;
 	}
 
@@ -3295,9 +3290,9 @@ skip_one:
 
 		jfs_dirent = (struct jfs_dirent *) dirent_buf;
 		while (jfs_dirents--) {
-			filp->f_pos = jfs_dirent->position;
-			if (filldir(dirent, jfs_dirent->name,
-				    jfs_dirent->name_len, filp->f_pos,
+			ctx->pos = jfs_dirent->position;
+			if (!dir_emit(ctx, jfs_dirent->name,
+				      jfs_dirent->name_len,
 				    jfs_dirent->ino, DT_UNKNOWN))
 				goto out;
 			jfs_dirent = next_jfs_dirent(jfs_dirent);
@@ -3309,7 +3304,7 @@ skip_one:
 	}
 
 	if (!overflow && (bn == 0)) {
-		filp->f_pos = DIREND;
+		ctx->pos = DIREND;
 		break;
 	}
 
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 2545bb317235..fd4169e6e698 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -265,5 +265,5 @@ extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key,
 extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
 		    ino_t * orig_ino, ino_t new_ino, int flag);
 
-extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
+extern int jfs_readdir(struct file *file, struct dir_context *ctx);
 #endif	/* !_H_JFS_DTREE */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c57499dca89c..360d27c48887 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2009,7 +2009,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
-	submit_bio(READ_SYNC, bio);
+	/*check if journaling to disk has been disabled*/
+	if (log->no_integrity) {
+		bio->bi_size = 0;
+		lbmIODone(bio, 0);
+	} else {
+		submit_bio(READ_SYNC, bio);
+	}
 
 	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82b..9e3aaff11f89 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 	return ret;
 }
 
-static void metapage_invalidatepage(struct page *page, unsigned long offset)
+static void metapage_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
 {
-	BUG_ON(offset);
+	BUG_ON(offset || length < PAGE_CACHE_SIZE);
 
 	BUG_ON(PageWriteback(page));
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 3b91a7ad6086..8b19027291d6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1529,7 +1529,7 @@ const struct inode_operations jfs_dir_inode_operations = {
 
 const struct file_operations jfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= jfs_readdir,
+	.iterate	= jfs_readdir,
 	.fsync		= jfs_fsync,
 	.unlocked_ioctl = jfs_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1538,8 +1538,7 @@ const struct file_operations jfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
-		struct qstr *this)
+static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
 {
 	unsigned long hash;
 	int i;
@@ -1552,9 +1551,7 @@ static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
 	return 0;
 }
 
-static int jfs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i, result = 1;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2003e830ed1c..788e0a9c1fb0 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -611,11 +611,28 @@ static int jfs_freeze(struct super_block *sb)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(sb);
 	struct jfs_log *log = sbi->log;
+	int rc = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		txQuiesce(sb);
-		lmLogShutdown(log);
-		updateSuper(sb, FM_CLEAN);
+		rc = lmLogShutdown(log);
+		if (rc) {
+			jfs_error(sb, "jfs_freeze: lmLogShutdown failed");
+
+			/* let operations fail rather than hang */
+			txResume(sb);
+
+			return rc;
+		}
+		rc = updateSuper(sb, FM_CLEAN);
+		if (rc) {
+			jfs_err("jfs_freeze: updateSuper failed\n");
+			/*
+			 * Don't fail here. Everything succeeded except
+			 * marking the superblock clean, so there's really
+			 * no harm in leaving it frozen for now.
+			 */
+		}
 	}
 	return 0;
 }
@@ -627,13 +644,18 @@ static int jfs_unfreeze(struct super_block *sb)
 	int rc = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		updateSuper(sb, FM_MOUNT);
-		if ((rc = lmLogInit(log)))
-			jfs_err("jfs_unlock failed with return code %d", rc);
-		else
-			txResume(sb);
+		rc = updateSuper(sb, FM_MOUNT);
+		if (rc) {
+			jfs_error(sb, "jfs_unfreeze: updateSuper failed");
+			goto out;
+		}
+		rc = lmLogInit(log);
+		if (rc)
+			jfs_error(sb, "jfs_unfreeze: lmLogInit failed");
+out:
+		txResume(sb);
 	}
-	return 0;
+	return rc;
 }
 
 static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
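jfs_freeze()/jfs_unfreeze() sit behind the generic VFS freeze path; from userspace the usual trigger is the FIFREEZE/FITHAW ioctls on the mount point. A small userspace sketch of that round trip:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDONLY);	/* argv[1]: mount point */

	if (fd < 0 || ioctl(fd, FIFREEZE, 0) < 0) {
		perror("freeze");
		return 1;
	}
	/* ... take a snapshot, back up the block device, etc. ... */
	if (ioctl(fd, FITHAW, 0) < 0) {
		perror("thaw");
		return 1;
	}
	return 0;
}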
diff --git a/fs/libfs.c b/fs/libfs.c
index 916da8c4158b..c3a0837fb861 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -135,60 +135,40 @@ static inline unsigned char dt_type(struct inode *inode)
  * both impossible due to the lock on directory.
  */
 
-int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int dcache_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct dentry *cursor = filp->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct dentry *cursor = file->private_data;
 	struct list_head *p, *q = &cursor->d_u.d_child;
-	ino_t ino;
-	int i = filp->f_pos;
 
-	switch (i) {
-		case 0:
-			ino = dentry->d_inode->i_ino;
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		default:
-			spin_lock(&dentry->d_lock);
-			if (filp->f_pos == 2)
-				list_move(q, &dentry->d_subdirs);
-
-			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
-				struct dentry *next;
-				next = list_entry(p, struct dentry, d_u.d_child);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-				if (!simple_positive(next)) {
-					spin_unlock(&next->d_lock);
-					continue;
-				}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	spin_lock(&dentry->d_lock);
+	if (ctx->pos == 2)
+		list_move(q, &dentry->d_subdirs);
+
+	for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
+		struct dentry *next = list_entry(p, struct dentry, d_u.d_child);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		if (!simple_positive(next)) {
+			spin_unlock(&next->d_lock);
+			continue;
+		}
 
-			spin_unlock(&next->d_lock);
-			spin_unlock(&dentry->d_lock);
-			if (filldir(dirent, next->d_name.name,
-				    next->d_name.len, filp->f_pos,
-				    next->d_inode->i_ino,
-				    dt_type(next->d_inode)) < 0)
-				return 0;
-			spin_lock(&dentry->d_lock);
-			spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-			/* next is still alive */
-			list_move(q, p);
-			spin_unlock(&next->d_lock);
-			p = q;
-			filp->f_pos++;
-		}
-		spin_unlock(&dentry->d_lock);
+		spin_unlock(&next->d_lock);
+		spin_unlock(&dentry->d_lock);
+		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+			      next->d_inode->i_ino, dt_type(next->d_inode)))
+			return 0;
+		spin_lock(&dentry->d_lock);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		/* next is still alive */
+		list_move(q, p);
+		spin_unlock(&next->d_lock);
+		p = q;
+		ctx->pos++;
 	}
+	spin_unlock(&dentry->d_lock);
 	return 0;
 }
 
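The cursor dentry in dcache_readdir() is a placeholder moved through d_subdirs so the scan can drop d_lock around dir_emit() and still resume where it left off. The same idiom in isolation, sketched on a plain list_head:

#include <linux/list.h>

/* Sketch of the cursor idiom: a placeholder node is walked through the
 * list so iteration can resume after locks are dropped mid-scan. */
static void cursor_scan_sketch(struct list_head *head)
{
	struct list_head cursor, *p;

	list_add(&cursor, head);		/* start at the front */
	while ((p = cursor.next) != head) {
		/* ... emit entry at p; locks may be dropped here ... */
		list_move(&cursor, p);		/* resume point: past p */
	}
	list_del(&cursor);
}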
@@ -202,7 +182,7 @@ const struct file_operations simple_dir_operations = {
 	.release	= dcache_dir_close,
 	.llseek		= dcache_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.fsync		= noop_fsync,
 };
 
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e703318c41df..067778b0ccc9 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block)
 	dprintk("lockd: unlinking block %p...\n", block);
 
 	/* Remove block from list */
-	status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl);
+	status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
 	nlmsvc_remove_block(block);
 	return status;
 }
@@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
 }
 
+/*
+ * Since NLM uses two "keys" for tracking locks, we need to hash them down
+ * to one for the blocked_hash. Here, we're just xor'ing the host address
+ * with the pid in order to create a key value for picking a hash bucket.
+ */
+static unsigned long
+nlmsvc_owner_key(struct file_lock *fl)
+{
+	return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid;
+}
+
 const struct lock_manager_operations nlmsvc_lock_operations = {
 	.lm_compare_owner = nlmsvc_same_owner,
+	.lm_owner_key = nlmsvc_owner_key,
 	.lm_notify = nlmsvc_notify_blocked,
 	.lm_grant = nlmsvc_grant_deferred,
 };
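lm_owner_key lets a lock manager supply its own bucket key for the new blocked_hash instead of the default fl_owner pointer. A hedged sketch of wiring a private owner cookie the same way; struct my_owner and its fields are hypothetical:

/* Sketch only: a lock manager hashing on a private owner cookie. */
static unsigned long example_owner_key(struct file_lock *fl)
{
	struct my_owner *o = fl->fl_owner;	/* hypothetical cookie */

	return (unsigned long)o->host ^ (unsigned long)o->pid;
}

static const struct lock_manager_operations example_lock_ops = {
	.lm_owner_key = example_owner_key,
};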
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 97e87415b145..dc5c75930f0f 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -169,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 
 again:
 	file->f_locks = 0;
-	lock_flocks(); /* protects i_flock list */
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
@@ -181,7 +181,7 @@ again:
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			lock.fl_type = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end = OFFSET_MAX;
@@ -193,7 +193,7 @@ again:
 			goto again;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return 0;
 }
@@ -228,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file)
 	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
 		return 1;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops == &nlmsvc_lock_operations) {
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			return 1;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	file->f_locks = 0;
 	return 0;
 }
diff --git a/fs/locks.c b/fs/locks.c
index cb424a4fed71..04e2c1fdb157 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -126,6 +126,7 @@
 #include <linux/time.h>
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
+#include <linux/hashtable.h>
 
 #include <asm/uaccess.h>
 
@@ -153,30 +154,51 @@ int lease_break_time = 45;
 #define for_each_lock(inode, lockp) \
 	for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
 
-static LIST_HEAD(file_lock_list);
-static LIST_HEAD(blocked_list);
+/*
+ * The global file_lock_list is only used for displaying /proc/locks. Protected
+ * by the file_lock_lock.
+ */
+static HLIST_HEAD(file_lock_list);
 static DEFINE_SPINLOCK(file_lock_lock);
 
 /*
- * Protects the two list heads above, plus the inode->i_flock list
+ * The blocked_hash is used to find POSIX lock loops for deadlock detection.
+ * It is protected by blocked_lock_lock.
+ *
+ * We hash locks by lockowner in order to optimize searching for the lock a
+ * particular lockowner is waiting on.
+ *
+ * FIXME: make this value scale via some heuristic? We generally will want more
+ * buckets when we have more lockowners holding locks, but that's a little
+ * difficult to determine without knowing what the workload will look like.
  */
-void lock_flocks(void)
-{
-	spin_lock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(lock_flocks);
+#define BLOCKED_HASH_BITS	7
+static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
 
-void unlock_flocks(void)
-{
-	spin_unlock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(unlock_flocks);
+/*
+ * This lock protects the blocked_hash. Generally, if you're accessing it, you
+ * want to be holding this lock.
+ *
+ * In addition, it also protects the fl->fl_block list, and the fl->fl_next
+ * pointer for file_lock structures that are acting as lock requests (in
+ * contrast to those that are acting as records of acquired locks).
+ *
+ * Note that when we acquire this lock in order to change the above fields,
+ * we often hold the i_lock as well. In certain cases, when reading the fields
+ * protected by this lock, we can skip acquiring it iff we already hold the
+ * i_lock.
+ *
+ * In particular, adding an entry to the fl_block list requires that you hold
+ * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting
+ * an entry from the list however only requires the file_lock_lock.
+ */
+static DEFINE_SPINLOCK(blocked_lock_lock);
 
 static struct kmem_cache *filelock_cache __read_mostly;
 
 static void locks_init_lock_heads(struct file_lock *fl)
 {
-	INIT_LIST_HEAD(&fl->fl_link);
+	INIT_HLIST_NODE(&fl->fl_link);
 	INIT_LIST_HEAD(&fl->fl_block);
 	init_waitqueue_head(&fl->fl_wait);
 }
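blocked_hash is built on the generic <linux/hashtable.h> API. A self-contained sketch of the same calls, separate from the locking rules discussed above:

#include <linux/hashtable.h>

struct item {
	unsigned long key;
	struct hlist_node node;
};

static DEFINE_HASHTABLE(example_hash, 7);	/* 2^7 buckets */

static void hashtable_sketch(struct item *it)
{
	struct item *cur;

	hash_add(example_hash, &it->node, it->key);

	/* walk only the bucket the key maps to */
	hash_for_each_possible(example_hash, cur, node, it->key)
		if (cur->key == it->key)
			break;

	hash_del(&it->node);
}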
@@ -210,7 +232,7 @@ void locks_free_lock(struct file_lock *fl)
 {
 	BUG_ON(waitqueue_active(&fl->fl_wait));
 	BUG_ON(!list_empty(&fl->fl_block));
-	BUG_ON(!list_empty(&fl->fl_link));
+	BUG_ON(!hlist_unhashed(&fl->fl_link));
 
 	locks_release_private(fl);
 	kmem_cache_free(filelock_cache, fl);
@@ -484,47 +506,108 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner;
 }
 
+static inline void
+locks_insert_global_locks(struct file_lock *fl)
+{
+	spin_lock(&file_lock_lock);
+	hlist_add_head(&fl->fl_link, &file_lock_list);
+	spin_unlock(&file_lock_lock);
+}
+
+static inline void
+locks_delete_global_locks(struct file_lock *fl)
+{
+	spin_lock(&file_lock_lock);
+	hlist_del_init(&fl->fl_link);
+	spin_unlock(&file_lock_lock);
+}
+
+static unsigned long
+posix_owner_key(struct file_lock *fl)
+{
+	if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
+		return fl->fl_lmops->lm_owner_key(fl);
+	return (unsigned long)fl->fl_owner;
+}
+
+static inline void
+locks_insert_global_blocked(struct file_lock *waiter)
+{
+	hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
+}
+
+static inline void
+locks_delete_global_blocked(struct file_lock *waiter)
+{
+	hash_del(&waiter->fl_link);
+}
+
 /* Remove waiter from blocker's block list.
  * When blocker ends up pointing to itself then the list is empty.
+ *
+ * Must be called with blocked_lock_lock held.
  */
 static void __locks_delete_block(struct file_lock *waiter)
 {
+	locks_delete_global_blocked(waiter);
 	list_del_init(&waiter->fl_block);
-	list_del_init(&waiter->fl_link);
 	waiter->fl_next = NULL;
 }
 
-/*
- */
-void locks_delete_block(struct file_lock *waiter)
+static void locks_delete_block(struct file_lock *waiter)
 {
-	lock_flocks();
+	spin_lock(&blocked_lock_lock);
 	__locks_delete_block(waiter);
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
 }
-EXPORT_SYMBOL(locks_delete_block);
 
 /* Insert waiter into blocker's block list.
  * We use a circular list so that processes can be easily woken up in
  * the order they blocked. The documentation doesn't require this but
  * it seems like the reasonable thing to do.
+ *
+ * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
+ * list itself is protected by the file_lock_list, but by ensuring that the
+ * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
+ * in some cases when we see that the fl_block list is empty.
  */
-static void locks_insert_block(struct file_lock *blocker,
-			       struct file_lock *waiter)
+static void __locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
 {
 	BUG_ON(!list_empty(&waiter->fl_block));
-	list_add_tail(&waiter->fl_block, &blocker->fl_block);
 	waiter->fl_next = blocker;
+	list_add_tail(&waiter->fl_block, &blocker->fl_block);
 	if (IS_POSIX(blocker))
-		list_add(&waiter->fl_link, &blocked_list);
+		locks_insert_global_blocked(waiter);
+}
+
+/* Must be called with i_lock held. */
+static void locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
+{
+	spin_lock(&blocked_lock_lock);
+	__locks_insert_block(blocker, waiter);
+	spin_unlock(&blocked_lock_lock);
 }
 
-/* Wake up processes blocked waiting for blocker.
- * If told to wait then schedule the processes until the block list
- * is empty, otherwise empty the block list ourselves.
+/*
+ * Wake up processes blocked waiting for blocker.
+ *
+ * Must be called with the inode->i_lock held!
  */
 static void locks_wake_up_blocks(struct file_lock *blocker)
 {
+	/*
+	 * Avoid taking global lock if list is empty. This is safe since new
+	 * blocked requests are only added to the list under the i_lock, and
+	 * the i_lock is always held here. Note that removal from the fl_block
+	 * list does not require the i_lock, so we must recheck list_empty()
+	 * after acquiring the blocked_lock_lock.
+	 */
+	if (list_empty(&blocker->fl_block))
+		return;
+
+	spin_lock(&blocked_lock_lock);
 	while (!list_empty(&blocker->fl_block)) {
 		struct file_lock *waiter;
 
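The comments above pin down the ordering: the i_lock is taken first and blocked_lock_lock nests inside it. A sketch of that discipline around __locks_insert_block(), shown only to make the acquire order explicit:

/* Sketch: i_lock first, blocked_lock_lock nested inside. */
static void ordered_block_sketch(struct inode *inode,
				 struct file_lock *blocker,
				 struct file_lock *waiter)
{
	spin_lock(&inode->i_lock);
	spin_lock(&blocked_lock_lock);
	__locks_insert_block(blocker, waiter);	/* needs both locks */
	spin_unlock(&blocked_lock_lock);
	spin_unlock(&inode->i_lock);
}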
@@ -536,20 +619,23 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
 		else
 			wake_up(&waiter->fl_wait);
 	}
+	spin_unlock(&blocked_lock_lock);
 }
 
 /* Insert file lock fl into an inode's lock list at the position indicated
  * by pos. At the same time add the lock to the global file lock list.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
 {
-	list_add(&fl->fl_link, &file_lock_list);
-
 	fl->fl_nspid = get_pid(task_tgid(current));
 
 	/* insert into file's list */
 	fl->fl_next = *pos;
 	*pos = fl;
+
+	locks_insert_global_locks(fl);
 }
 
 /*
@@ -557,14 +643,17 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
  * Wake up processes that are blocked waiting for this lock,
  * notify the FS that the lock has been cleared and
  * finally free the lock.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_delete_lock(struct file_lock **thisfl_p)
 {
 	struct file_lock *fl = *thisfl_p;
 
+	locks_delete_global_locks(fl);
+
 	*thisfl_p = fl->fl_next;
 	fl->fl_next = NULL;
-	list_del_init(&fl->fl_link);
 
 	if (fl->fl_nspid) {
 		put_pid(fl->fl_nspid);
@@ -625,8 +714,9 @@ void
 posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
+	struct inode *inode = file_inode(filp);
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
 			continue;
@@ -639,7 +729,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 			fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
 		fl->fl_type = F_UNLCK;
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return;
 }
 EXPORT_SYMBOL(posix_test_lock);
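posix_test_lock() is what services fcntl(F_GETLK). A userspace sketch of the query it answers:

#include <fcntl.h>
#include <stdio.h>

/* Ask which lock, if any, would block a whole-file read lock. */
static int who_holds_it(int fd)
{
	struct flock fl = {
		.l_type   = F_RDLCK,
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,		/* 0 means "to EOF" */
	};

	if (fcntl(fd, F_GETLK, &fl) < 0)
		return -1;
	if (fl.l_type == F_UNLCK)
		printf("no conflicting lock\n");
	else
		printf("conflicting lock held by pid %d\n", (int)fl.l_pid);
	return 0;
}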
@@ -676,13 +766,14 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
 {
 	struct file_lock *fl;
 
-	list_for_each_entry(fl, &blocked_list, fl_link) {
+	hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
 		if (posix_same_owner(fl, block_fl))
 			return fl->fl_next;
 	}
 	return NULL;
 }
 
+/* Must be called with the blocked_lock_lock held! */
 static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
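posix_locks_deadlock() follows the wait-for chain under blocked_lock_lock, declaring deadlock if the chain cycles back to the caller. A hedged sketch of that walk using the helpers shown above; the iteration cap mirrors the bound the real code uses, though the exact limit here is illustrative:

/* Sketch of the wait-for walk; must hold blocked_lock_lock. */
static int deadlock_sketch(struct file_lock *caller, struct file_lock *block)
{
	int i = 0;

	while ((block = what_owner_is_waiting_for(block))) {
		if (i++ > 10)
			return 0;	/* give up rather than loop forever */
		if (posix_same_owner(caller, block))
			return 1;	/* cycle back to caller: deadlock */
	}
	return 0;
}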
@@ -718,7 +809,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 		return -ENOMEM;
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
 
@@ -748,9 +839,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * give it the opportunity to lock the file.
 	 */
 	if (found) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		cond_resched();
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 	}
 
 find_conflict:
@@ -777,7 +868,7 @@ find_conflict:
 	error = 0;
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	if (new_fl)
 		locks_free_lock(new_fl);
 	return error;
@@ -791,7 +882,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	struct file_lock *left = NULL;
 	struct file_lock *right = NULL;
 	struct file_lock **before;
-	int error, added = 0;
+	int error;
+	bool added = false;
 
 	/*
 	 * We may need two file_lock structures for this operation,
@@ -806,7 +898,12 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		new_fl2 = locks_alloc_lock();
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
+	/*
+	 * New lock request. Walk all POSIX locks and look for conflicts. If
+	 * there are any, either return error or put the request on the
+	 * blocker's list of waiters and the global blocked_hash.
+	 */
 	if (request->fl_type != F_UNLCK) {
 		for_each_lock(inode, before) {
 			fl = *before;
@@ -819,11 +916,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			error = -EAGAIN;
 			if (!(request->fl_flags & FL_SLEEP))
 				goto out;
+			/*
+			 * Deadlock detection and insertion into the blocked
+			 * locks list must be done while holding the same lock!
+			 */
 			error = -EDEADLK;
-			if (posix_locks_deadlock(request, fl))
-				goto out;
-			error = FILE_LOCK_DEFERRED;
-			locks_insert_block(fl, request);
+			spin_lock(&blocked_lock_lock);
+			if (likely(!posix_locks_deadlock(request, fl))) {
+				error = FILE_LOCK_DEFERRED;
+				__locks_insert_block(fl, request);
+			}
+			spin_unlock(&blocked_lock_lock);
 			goto out;
 		}
 	}
@@ -845,7 +948,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		before = &fl->fl_next;
 	}
 
 	/* Process locks with this owner. */
 	while ((fl = *before) && posix_same_owner(request, fl)) {
 		/* Detect adjacent or overlapping regions (if same lock type)
 		 */
@@ -880,7 +983,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 				continue;
 			}
 			request = fl;
-			added = 1;
+			added = true;
 		}
 		else {
 			/* Processing for different lock types is a bit
@@ -891,7 +994,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			if (fl->fl_start > request->fl_end)
 				break;
 			if (request->fl_type == F_UNLCK)
-				added = 1;
+				added = true;
 			if (fl->fl_start < request->fl_start)
 				left = fl;
 			/* If the next lock in the list has a higher end
@@ -921,7 +1024,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 				locks_release_private(fl);
 				locks_copy_private(fl, request);
 				request = fl;
-				added = 1;
+				added = true;
 			}
 		}
 		/* Go on to next lock.
@@ -931,10 +1034,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	}
 
 	/*
-	 * The above code only modifies existing locks in case of
-	 * merging or replacing. If new lock(s) need to be inserted
-	 * all modifications are done bellow this, so it's safe yet to
-	 * bail out.
+	 * The above code only modifies existing locks in case of merging or
+	 * replacing. If new lock(s) need to be inserted all modifications are
+	 * done below this, so it's safe yet to bail out.
 	 */
 	error = -ENOLCK; /* "no luck" */
 	if (right && left == right && !new_fl2)
@@ -974,7 +1076,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		locks_wake_up_blocks(left);
 	}
  out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	/*
 	 * Free any unused locks.
 	 */
@@ -1049,14 +1151,14 @@ int locks_mandatory_locked(struct inode *inode)
 	/*
 	 * Search the lock list for this inode for any POSIX locks.
 	 */
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
 		if (fl->fl_owner != owner)
 			break;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return fl ? -EAGAIN : 0;
 }
 
@@ -1199,7 +1301,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	if (IS_ERR(new_fl))
 		return PTR_ERR(new_fl);
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	time_out_leases(inode);
 
@@ -1249,11 +1351,11 @@ restart:
 			break_time++;
 	}
 	locks_insert_block(flock, new_fl);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
-	lock_flocks();
-	__locks_delete_block(new_fl);
+	spin_lock(&inode->i_lock);
+	locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
 			time_out_leases(inode);
@@ -1270,7 +1372,7 @@ restart:
 	}
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	locks_free_lock(new_fl);
 	return error;
 }
@@ -1323,9 +1425,10 @@ EXPORT_SYMBOL(lease_get_mtime);
 int fcntl_getlease(struct file *filp)
 {
 	struct file_lock *fl;
+	struct inode *inode = file_inode(filp);
 	int type = F_UNLCK;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	time_out_leases(file_inode(filp));
 	for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
@@ -1334,11 +1437,11 @@ int fcntl_getlease(struct file *filp)
 			break;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return type;
 }
 
-int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
+static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1403,7 +1506,7 @@ out:
 	return error;
 }
 
-int generic_delete_lease(struct file *filp, struct file_lock **flp)
+static int generic_delete_lease(struct file *filp, struct file_lock **flp)
 {
 	struct file_lock *fl, **before;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1428,7 +1531,7 @@ int generic_delete_lease(struct file *filp, struct file_lock **flp)
  * The (input) flp->fl_lmops->lm_break function is required
  * by break_lease().
  *
- * Called with file_lock_lock held.
+ * Called with inode->i_lock held.
  */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
@@ -1497,11 +1600,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 
 int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
+	struct inode *inode = file_inode(filp);
 	int error;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, lease);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return error;
 }
@@ -1519,6 +1623,7 @@ static int do_fcntl_delete_lease(struct file *filp)
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
 	struct file_lock *fl, *ret;
+	struct inode *inode = file_inode(filp);
 	struct fasync_struct *new;
 	int error;
 
@@ -1532,10 +1637,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		return -ENOMEM;
 	}
 	ret = fl;
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, &ret);
 	if (error) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		locks_free_lock(fl);
 		goto out_free_fasync;
 	}
@@ -1552,7 +1657,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 	new = NULL;
 
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 out_free_fasync:
 	if (new)
@@ -2076,7 +2181,7 @@ void locks_remove_flock(struct file *filp)
 		fl.fl_ops->fl_release_private(&fl);
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	before = &inode->i_flock;
 
 	while ((fl = *before) != NULL) {
@@ -2094,30 +2199,28 @@ void locks_remove_flock(struct file *filp)
 		}
 		before = &fl->fl_next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 }
 
 /**
  * posix_unblock_lock - stop waiting for a file lock
- * @filp: how the file was opened
  * @waiter: the lock which was waiting
  *
  * lockd needs to block waiting for locks.
  */
 int
-posix_unblock_lock(struct file *filp, struct file_lock *waiter)
+posix_unblock_lock(struct file_lock *waiter)
 {
 	int status = 0;
 
-	lock_flocks();
+	spin_lock(&blocked_lock_lock);
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
 	else
 		status = -ENOENT;
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
 	return status;
 }
-
 EXPORT_SYMBOL(posix_unblock_lock);
 
 /**
@@ -2215,7 +2318,7 @@ static int locks_show(struct seq_file *f, void *v)
 {
 	struct file_lock *fl, *bfl;
 
-	fl = list_entry(v, struct file_lock, fl_link);
+	fl = hlist_entry(v, struct file_lock, fl_link);
 
 	lock_get_status(f, fl, *((loff_t *)f->private), "");
 
@@ -2229,21 +2332,23 @@ static void *locks_start(struct seq_file *f, loff_t *pos)
 {
 	loff_t *p = f->private;
 
-	lock_flocks();
+	spin_lock(&file_lock_lock);
+	spin_lock(&blocked_lock_lock);
 	*p = (*pos + 1);
-	return seq_list_start(&file_lock_list, *pos);
+	return seq_hlist_start(&file_lock_list, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
 	loff_t *p = f->private;
 	++*p;
-	return seq_list_next(v, &file_lock_list, pos);
+	return seq_hlist_next(v, &file_lock_list, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 {
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
+	spin_unlock(&file_lock_lock);
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2290,7 +2395,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2290{ 2395{
2291 struct file_lock *fl; 2396 struct file_lock *fl;
2292 int result = 1; 2397 int result = 1;
2293 lock_flocks(); 2398
2399 spin_lock(&inode->i_lock);
2294 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2400 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2295 if (IS_POSIX(fl)) { 2401 if (IS_POSIX(fl)) {
2296 if (fl->fl_type == F_RDLCK) 2402 if (fl->fl_type == F_RDLCK)
@@ -2307,7 +2413,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2307 result = 0; 2413 result = 0;
2308 break; 2414 break;
2309 } 2415 }
2310 unlock_flocks(); 2416 spin_unlock(&inode->i_lock);
2311 return result; 2417 return result;
2312} 2418}
2313 2419
@@ -2330,7 +2436,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2330{ 2436{
2331 struct file_lock *fl; 2437 struct file_lock *fl;
2332 int result = 1; 2438 int result = 1;
2333 lock_flocks(); 2439
2440 spin_lock(&inode->i_lock);
2334 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2441 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2335 if (IS_POSIX(fl)) { 2442 if (IS_POSIX(fl)) {
2336 if ((fl->fl_end < start) || (fl->fl_start > (start + len))) 2443 if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2345,7 +2452,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2345 result = 0; 2452 result = 0;
2346 break; 2453 break;
2347 } 2454 }
2348 unlock_flocks(); 2455 spin_unlock(&inode->i_lock);
2349 return result; 2456 return result;
2350} 2457}
2351 2458
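
The fs/locks.c hunks above split the old global lock_flocks() spinlock in two: each inode's i_flock list is now guarded by that inode's i_lock, while the blocked-waiter list moves under a new global blocked_lock_lock (see posix_unblock_lock), and the /proc/locks walker takes file_lock_lock together with blocked_lock_lock. A minimal sketch of the resulting per-inode pattern, where inode_has_posix_lock() is a hypothetical helper written for illustration rather than anything in the patch:

static bool inode_has_posix_lock(struct inode *inode)
{
	struct file_lock *fl;
	bool found = false;

	/* i_lock now covers this inode's i_flock list */
	spin_lock(&inode->i_lock);
	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
		if (IS_POSIX(fl)) {
			found = true;
			break;
		}
	}
	spin_unlock(&inode->i_lock);
	return found;
}

The payoff is that lock traffic on one inode no longer serializes against every other inode in the system; only the blocked-waiter bookkeeping still funnels through a global lock.
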
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b82751082112..6bdc347008f5 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -281,17 +281,23 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
281 281
282/* FIXME: readdir currently has its own dir_walk code. I don't see a good 282/* FIXME: readdir currently has its own dir_walk code. I don't see a good
283 * way to combine the two copies */ 283 * way to combine the two copies */
284#define IMPLICIT_NODES 2 284static int logfs_readdir(struct file *file, struct dir_context *ctx)
285static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
286{ 285{
287 struct inode *dir = file_inode(file); 286 struct inode *dir = file_inode(file);
288 loff_t pos = file->f_pos - IMPLICIT_NODES; 287 loff_t pos;
289 struct page *page; 288 struct page *page;
290 struct logfs_disk_dentry *dd; 289 struct logfs_disk_dentry *dd;
291 int full;
292 290
291 if (ctx->pos < 0)
292 return -EINVAL;
293
294 if (!dir_emit_dots(file, ctx))
295 return 0;
296
297 pos = ctx->pos - 2;
293 BUG_ON(pos < 0); 298 BUG_ON(pos < 0);
294 for (;; pos++) { 299 for (;; pos++, ctx->pos++) {
300 bool full;
295 if (beyond_eof(dir, pos)) 301 if (beyond_eof(dir, pos))
296 break; 302 break;
297 if (!logfs_exist_block(dir, pos)) { 303 if (!logfs_exist_block(dir, pos)) {
@@ -306,42 +312,17 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
306 dd = kmap(page); 312 dd = kmap(page);
307 BUG_ON(dd->namelen == 0); 313 BUG_ON(dd->namelen == 0);
308 314
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), 315 full = !dir_emit(ctx, (char *)dd->name,
310 pos, be64_to_cpu(dd->ino), dd->type); 316 be16_to_cpu(dd->namelen),
317 be64_to_cpu(dd->ino), dd->type);
311 kunmap(page); 318 kunmap(page);
312 page_cache_release(page); 319 page_cache_release(page);
313 if (full) 320 if (full)
314 break; 321 break;
315 } 322 }
316
317 file->f_pos = pos + IMPLICIT_NODES;
318 return 0; 323 return 0;
319} 324}
320 325
321static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
322{
323 struct inode *inode = file_inode(file);
324 ino_t pino = parent_ino(file->f_dentry);
325 int err;
326
327 if (file->f_pos < 0)
328 return -EINVAL;
329
330 if (file->f_pos == 0) {
331 if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
332 return 0;
333 file->f_pos++;
334 }
335 if (file->f_pos == 1) {
336 if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
337 return 0;
338 file->f_pos++;
339 }
340
341 err = __logfs_readdir(file, buf, filldir);
342 return err;
343}
344
345static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) 326static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
346{ 327{
347 dd->namelen = cpu_to_be16(name->len); 328 dd->namelen = cpu_to_be16(name->len);
@@ -814,7 +795,7 @@ const struct inode_operations logfs_dir_iops = {
814const struct file_operations logfs_dir_fops = { 795const struct file_operations logfs_dir_fops = {
815 .fsync = logfs_fsync, 796 .fsync = logfs_fsync,
816 .unlocked_ioctl = logfs_ioctl, 797 .unlocked_ioctl = logfs_ioctl,
817 .readdir = logfs_readdir, 798 .iterate = logfs_readdir,
818 .read = generic_read_dir, 799 .read = generic_read_dir,
819 .llseek = default_llseek, 800 .llseek = default_llseek,
820}; 801};
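
fs/logfs/dir.c above is one instance of the tree-wide conversion from ->readdir()/filldir to ->iterate() with a struct dir_context: the position lives in ctx->pos, dir_emit_dots() emits "." and ".." at positions 0 and 1, and dir_emit() returns false once the user buffer is full. A generic skeleton of the new shape, using hypothetical names (example_readdir(), example_entry()):

static int example_readdir(struct file *file, struct dir_context *ctx)
{
	if (!dir_emit_dots(file, ctx))
		return 0;			/* "." or ".." filled the buffer */
	for (;; ctx->pos++) {
		/* example_entry(): stand-in for the fs's on-disk lookup */
		struct example_entry *de = example_entry(file, ctx->pos - 2);

		if (!de)
			break;			/* end of directory */
		if (!dir_emit(ctx, de->name, de->namelen, de->ino, de->type))
			break;			/* buffer full */
	}
	return 0;
}

Because ctx->pos only advances after a successful dir_emit(), a getdents() call that fills up mid-directory resumes at exactly the entry that did not fit.
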
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2219a6dd3c8..57914fc32b62 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
159 return __logfs_writepage(page); 159 return __logfs_writepage(page);
160} 160}
161 161
162static void logfs_invalidatepage(struct page *page, unsigned long offset) 162static void logfs_invalidatepage(struct page *page, unsigned int offset,
163 unsigned int length)
163{ 164{
164 struct logfs_block *block = logfs_block(page); 165 struct logfs_block *block = logfs_block(page);
165 166
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da0991794..d448a777166b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb)
884 return area; 884 return area;
885} 885}
886 886
887static void map_invalidatepage(struct page *page, unsigned long l) 887static void map_invalidatepage(struct page *page, unsigned int o,
888 unsigned int l)
888{ 889{
889 return; 890 return;
890} 891}
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index a9ed6f36e6ea..dfaf6fa9b7b5 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -16,12 +16,12 @@
16typedef struct minix_dir_entry minix_dirent; 16typedef struct minix_dir_entry minix_dirent;
17typedef struct minix3_dir_entry minix3_dirent; 17typedef struct minix3_dir_entry minix3_dirent;
18 18
19static int minix_readdir(struct file *, void *, filldir_t); 19static int minix_readdir(struct file *, struct dir_context *);
20 20
21const struct file_operations minix_dir_operations = { 21const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .iterate = minix_readdir,
25 .fsync = generic_file_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
@@ -82,22 +82,23 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
82 return (void*)((char*)de + sbi->s_dirsize); 82 return (void*)((char*)de + sbi->s_dirsize);
83} 83}
84 84
85static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) 85static int minix_readdir(struct file *file, struct dir_context *ctx)
86{ 86{
87 unsigned long pos = filp->f_pos; 87 struct inode *inode = file_inode(file);
88 struct inode *inode = file_inode(filp);
89 struct super_block *sb = inode->i_sb; 88 struct super_block *sb = inode->i_sb;
90 unsigned offset = pos & ~PAGE_CACHE_MASK;
91 unsigned long n = pos >> PAGE_CACHE_SHIFT;
92 unsigned long npages = dir_pages(inode);
93 struct minix_sb_info *sbi = minix_sb(sb); 89 struct minix_sb_info *sbi = minix_sb(sb);
94 unsigned chunk_size = sbi->s_dirsize; 90 unsigned chunk_size = sbi->s_dirsize;
95 char *name; 91 unsigned long npages = dir_pages(inode);
96 __u32 inumber; 92 unsigned long pos = ctx->pos;
93 unsigned offset;
94 unsigned long n;
97 95
98 pos = (pos + chunk_size-1) & ~(chunk_size-1); 96 ctx->pos = pos = ALIGN(pos, chunk_size);
99 if (pos >= inode->i_size) 97 if (pos >= inode->i_size)
100 goto done; 98 return 0;
99
100 offset = pos & ~PAGE_CACHE_MASK;
101 n = pos >> PAGE_CACHE_SHIFT;
101 102
102 for ( ; n < npages; n++, offset = 0) { 103 for ( ; n < npages; n++, offset = 0) {
103 char *p, *kaddr, *limit; 104 char *p, *kaddr, *limit;
@@ -109,6 +110,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
109 p = kaddr+offset; 110 p = kaddr+offset;
110 limit = kaddr + minix_last_byte(inode, n) - chunk_size; 111 limit = kaddr + minix_last_byte(inode, n) - chunk_size;
111 for ( ; p <= limit; p = minix_next_entry(p, sbi)) { 112 for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
113 const char *name;
114 __u32 inumber;
112 if (sbi->s_version == MINIX_V3) { 115 if (sbi->s_version == MINIX_V3) {
113 minix3_dirent *de3 = (minix3_dirent *)p; 116 minix3_dirent *de3 = (minix3_dirent *)p;
114 name = de3->name; 117 name = de3->name;
@@ -119,24 +122,17 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
119 inumber = de->inode; 122 inumber = de->inode;
120 } 123 }
121 if (inumber) { 124 if (inumber) {
122 int over;
123
124 unsigned l = strnlen(name, sbi->s_namelen); 125 unsigned l = strnlen(name, sbi->s_namelen);
125 offset = p - kaddr; 126 if (!dir_emit(ctx, name, l,
126 over = filldir(dirent, name, l, 127 inumber, DT_UNKNOWN)) {
127 (n << PAGE_CACHE_SHIFT) | offset,
128 inumber, DT_UNKNOWN);
129 if (over) {
130 dir_put_page(page); 128 dir_put_page(page);
131 goto done; 129 return 0;
132 } 130 }
133 } 131 }
132 ctx->pos += chunk_size;
134 } 133 }
135 dir_put_page(page); 134 dir_put_page(page);
136 } 135 }
137
138done:
139 filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
140 return 0; 136 return 0;
141} 137}
142 138
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 0db73d9dd668..cd950e2331b6 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -54,6 +54,18 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
54 return error; 54 return error;
55} 55}
56 56
57static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
58{
59 int error;
60 struct inode *inode = minix_new_inode(dir, mode, &error);
61 if (inode) {
62 minix_set_inode(inode, 0);
63 mark_inode_dirty(inode);
64 d_tmpfile(dentry, inode);
65 }
66 return error;
67}
68
57static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, 69static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
58 bool excl) 70 bool excl)
59{ 71{
@@ -254,4 +266,5 @@ const struct inode_operations minix_dir_inode_operations = {
254 .mknod = minix_mknod, 266 .mknod = minix_mknod,
255 .rename = minix_rename, 267 .rename = minix_rename,
256 .getattr = minix_getattr, 268 .getattr = minix_getattr,
269 .tmpfile = minix_tmpfile,
257}; 270};
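
minix_tmpfile() above is about the smallest possible form of the new ->tmpfile() inode operation, which the O_TMPFILE support added to fs/namei.c below calls into: allocate and initialize an inode exactly as ->create() would, then attach it with d_tmpfile() instead of linking a name into the parent directory. Roughly, for a hypothetical filesystem whose allocator returns ERR_PTR codes (myfs_new_inode() is assumed, not real):

static int myfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct inode *inode = myfs_new_inode(dir, mode);

	if (IS_ERR(inode))
		return PTR_ERR(inode);
	mark_inode_dirty(inode);
	/* drops the link count to zero and instantiates the unhashed dentry */
	d_tmpfile(dentry, inode);
	return 0;
}
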
diff --git a/fs/namei.c b/fs/namei.c
index 85e40d1c0a8f..b2beee7a733f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1352,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd,
1352 */ 1352 */
1353 if (nd->flags & LOOKUP_RCU) { 1353 if (nd->flags & LOOKUP_RCU) {
1354 unsigned seq; 1354 unsigned seq;
1355 dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode); 1355 dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1356 if (!dentry) 1356 if (!dentry)
1357 goto unlazy; 1357 goto unlazy;
1358 1358
@@ -1787,8 +1787,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1787 struct dentry *parent = nd->path.dentry; 1787 struct dentry *parent = nd->path.dentry;
1788 nd->flags &= ~LOOKUP_JUMPED; 1788 nd->flags &= ~LOOKUP_JUMPED;
1789 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1789 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1790 err = parent->d_op->d_hash(parent, nd->inode, 1790 err = parent->d_op->d_hash(parent, &this);
1791 &this);
1792 if (err < 0) 1791 if (err < 0)
1793 break; 1792 break;
1794 } 1793 }
@@ -1976,7 +1975,7 @@ static int path_lookupat(int dfd, const char *name,
1976 err = complete_walk(nd); 1975 err = complete_walk(nd);
1977 1976
1978 if (!err && nd->flags & LOOKUP_DIRECTORY) { 1977 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1979 if (!nd->inode->i_op->lookup) { 1978 if (!can_lookup(nd->inode)) {
1980 path_put(&nd->path); 1979 path_put(&nd->path);
1981 err = -ENOTDIR; 1980 err = -ENOTDIR;
1982 } 1981 }
@@ -2121,7 +2120,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2121 * to use its own hash.. 2120 * to use its own hash..
2122 */ 2121 */
2123 if (base->d_flags & DCACHE_OP_HASH) { 2122 if (base->d_flags & DCACHE_OP_HASH) {
2124 int err = base->d_op->d_hash(base, base->d_inode, &this); 2123 int err = base->d_op->d_hash(base, &this);
2125 if (err < 0) 2124 if (err < 0)
2126 return ERR_PTR(err); 2125 return ERR_PTR(err);
2127 } 2126 }
@@ -2690,28 +2689,10 @@ static int do_last(struct nameidata *nd, struct path *path,
2690 nd->flags &= ~LOOKUP_PARENT; 2689 nd->flags &= ~LOOKUP_PARENT;
2691 nd->flags |= op->intent; 2690 nd->flags |= op->intent;
2692 2691
2693 switch (nd->last_type) { 2692 if (nd->last_type != LAST_NORM) {
2694 case LAST_DOTDOT:
2695 case LAST_DOT:
2696 error = handle_dots(nd, nd->last_type); 2693 error = handle_dots(nd, nd->last_type);
2697 if (error) 2694 if (error)
2698 return error; 2695 return error;
2699 /* fallthrough */
2700 case LAST_ROOT:
2701 error = complete_walk(nd);
2702 if (error)
2703 return error;
2704 audit_inode(name, nd->path.dentry, 0);
2705 if (open_flag & O_CREAT) {
2706 error = -EISDIR;
2707 goto out;
2708 }
2709 goto finish_open;
2710 case LAST_BIND:
2711 error = complete_walk(nd);
2712 if (error)
2713 return error;
2714 audit_inode(name, dir, 0);
2715 goto finish_open; 2696 goto finish_open;
2716 } 2697 }
2717 2698
@@ -2841,19 +2822,19 @@ finish_lookup:
2841 } 2822 }
2842 nd->inode = inode; 2823 nd->inode = inode;
2843 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ 2824 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2825finish_open:
2844 error = complete_walk(nd); 2826 error = complete_walk(nd);
2845 if (error) { 2827 if (error) {
2846 path_put(&save_parent); 2828 path_put(&save_parent);
2847 return error; 2829 return error;
2848 } 2830 }
2831 audit_inode(name, nd->path.dentry, 0);
2849 error = -EISDIR; 2832 error = -EISDIR;
2850 if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) 2833 if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
2851 goto out; 2834 goto out;
2852 error = -ENOTDIR; 2835 error = -ENOTDIR;
2853 if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) 2836 if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
2854 goto out; 2837 goto out;
2855 audit_inode(name, nd->path.dentry, 0);
2856finish_open:
2857 if (!S_ISREG(nd->inode->i_mode)) 2838 if (!S_ISREG(nd->inode->i_mode))
2858 will_truncate = false; 2839 will_truncate = false;
2859 2840
@@ -2920,6 +2901,67 @@ stale_open:
2920 goto retry_lookup; 2901 goto retry_lookup;
2921} 2902}
2922 2903
2904static int do_tmpfile(int dfd, struct filename *pathname,
2905 struct nameidata *nd, int flags,
2906 const struct open_flags *op,
2907 struct file *file, int *opened)
2908{
2909 static const struct qstr name = QSTR_INIT("/", 1);
2910 struct dentry *dentry, *child;
2911 struct inode *dir;
2912 int error = path_lookupat(dfd, pathname->name,
2913 flags | LOOKUP_DIRECTORY, nd);
2914 if (unlikely(error))
2915 return error;
2916 error = mnt_want_write(nd->path.mnt);
2917 if (unlikely(error))
2918 goto out;
2919 /* we want directory to be writable */
2920 error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
2921 if (error)
2922 goto out2;
2923 dentry = nd->path.dentry;
2924 dir = dentry->d_inode;
2925 if (!dir->i_op->tmpfile) {
2926 error = -EOPNOTSUPP;
2927 goto out2;
2928 }
2929 child = d_alloc(dentry, &name);
2930 if (unlikely(!child)) {
2931 error = -ENOMEM;
2932 goto out2;
2933 }
2934 nd->flags &= ~LOOKUP_DIRECTORY;
2935 nd->flags |= op->intent;
2936 dput(nd->path.dentry);
2937 nd->path.dentry = child;
2938 error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
2939 if (error)
2940 goto out2;
2941 audit_inode(pathname, nd->path.dentry, 0);
2942 error = may_open(&nd->path, op->acc_mode, op->open_flag);
2943 if (error)
2944 goto out2;
2945 file->f_path.mnt = nd->path.mnt;
2946 error = finish_open(file, nd->path.dentry, NULL, opened);
2947 if (error)
2948 goto out2;
2949 error = open_check_o_direct(file);
2950 if (error) {
2951 fput(file);
2952 } else if (!(op->open_flag & O_EXCL)) {
2953 struct inode *inode = file_inode(file);
2954 spin_lock(&inode->i_lock);
2955 inode->i_state |= I_LINKABLE;
2956 spin_unlock(&inode->i_lock);
2957 }
2958out2:
2959 mnt_drop_write(nd->path.mnt);
2960out:
2961 path_put(&nd->path);
2962 return error;
2963}
2964
2923static struct file *path_openat(int dfd, struct filename *pathname, 2965static struct file *path_openat(int dfd, struct filename *pathname,
2924 struct nameidata *nd, const struct open_flags *op, int flags) 2966 struct nameidata *nd, const struct open_flags *op, int flags)
2925{ 2967{
@@ -2935,6 +2977,11 @@ static struct file *path_openat(int dfd, struct filename *pathname,
2935 2977
2936 file->f_flags = op->open_flag; 2978 file->f_flags = op->open_flag;
2937 2979
2980 if (unlikely(file->f_flags & O_TMPFILE)) {
2981 error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
2982 goto out;
2983 }
2984
2938 error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); 2985 error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
2939 if (unlikely(error)) 2986 if (unlikely(error))
2940 goto out; 2987 goto out;
@@ -2987,9 +3034,10 @@ out:
2987} 3034}
2988 3035
2989struct file *do_filp_open(int dfd, struct filename *pathname, 3036struct file *do_filp_open(int dfd, struct filename *pathname,
2990 const struct open_flags *op, int flags) 3037 const struct open_flags *op)
2991{ 3038{
2992 struct nameidata nd; 3039 struct nameidata nd;
3040 int flags = op->lookup_flags;
2993 struct file *filp; 3041 struct file *filp;
2994 3042
2995 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); 3043 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
@@ -3001,17 +3049,16 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
3001} 3049}
3002 3050
3003struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, 3051struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3004 const char *name, const struct open_flags *op, int flags) 3052 const char *name, const struct open_flags *op)
3005{ 3053{
3006 struct nameidata nd; 3054 struct nameidata nd;
3007 struct file *file; 3055 struct file *file;
3008 struct filename filename = { .name = name }; 3056 struct filename filename = { .name = name };
3057 int flags = op->lookup_flags | LOOKUP_ROOT;
3009 3058
3010 nd.root.mnt = mnt; 3059 nd.root.mnt = mnt;
3011 nd.root.dentry = dentry; 3060 nd.root.dentry = dentry;
3012 3061
3013 flags |= LOOKUP_ROOT;
3014
3015 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) 3062 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
3016 return ERR_PTR(-ELOOP); 3063 return ERR_PTR(-ELOOP);
3017 3064
@@ -3586,12 +3633,18 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3586 3633
3587 mutex_lock(&inode->i_mutex); 3634 mutex_lock(&inode->i_mutex);
3588 /* Make sure we don't allow creating hardlink to an unlinked file */ 3635 /* Make sure we don't allow creating hardlink to an unlinked file */
3589 if (inode->i_nlink == 0) 3636 if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
3590 error = -ENOENT; 3637 error = -ENOENT;
3591 else if (max_links && inode->i_nlink >= max_links) 3638 else if (max_links && inode->i_nlink >= max_links)
3592 error = -EMLINK; 3639 error = -EMLINK;
3593 else 3640 else
3594 error = dir->i_op->link(old_dentry, dir, new_dentry); 3641 error = dir->i_op->link(old_dentry, dir, new_dentry);
3642
3643 if (!error && (inode->i_state & I_LINKABLE)) {
3644 spin_lock(&inode->i_lock);
3645 inode->i_state &= ~I_LINKABLE;
3646 spin_unlock(&inode->i_lock);
3647 }
3595 mutex_unlock(&inode->i_mutex); 3648 mutex_unlock(&inode->i_mutex);
3596 if (!error) 3649 if (!error)
3597 fsnotify_link(dir, inode, new_dentry); 3650 fsnotify_link(dir, inode, new_dentry);
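
Seen from userspace, the do_tmpfile() path above behaves roughly as sketched below (error handling trimmed). The I_LINKABLE bit that do_tmpfile() sets and vfs_link() clears is what lets the unnamed inode be given a name later via linkat(2), unless the file was opened with O_EXCL:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0)
		return 1;
	/* the file exists with i_nlink == 0: no name, gone on last close */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	/* materialize it; vfs_link() refuses with -ENOENT after O_EXCL */
	linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
	       AT_SYMLINK_FOLLOW);
	close(fd);
	return 0;
}
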
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 816326093656..3be047474bfc 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -23,12 +23,12 @@
23 23
24#include "ncp_fs.h" 24#include "ncp_fs.h"
25 25
26static void ncp_read_volume_list(struct file *, void *, filldir_t, 26static void ncp_read_volume_list(struct file *, struct dir_context *,
27 struct ncp_cache_control *); 27 struct ncp_cache_control *);
28static void ncp_do_readdir(struct file *, void *, filldir_t, 28static void ncp_do_readdir(struct file *, struct dir_context *,
29 struct ncp_cache_control *); 29 struct ncp_cache_control *);
30 30
31static int ncp_readdir(struct file *, void *, filldir_t); 31static int ncp_readdir(struct file *, struct dir_context *);
32 32
33static int ncp_create(struct inode *, struct dentry *, umode_t, bool); 33static int ncp_create(struct inode *, struct dentry *, umode_t, bool);
34static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); 34static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int);
@@ -49,7 +49,7 @@ const struct file_operations ncp_dir_operations =
49{ 49{
50 .llseek = generic_file_llseek, 50 .llseek = generic_file_llseek,
51 .read = generic_read_dir, 51 .read = generic_read_dir,
52 .readdir = ncp_readdir, 52 .iterate = ncp_readdir,
53 .unlocked_ioctl = ncp_ioctl, 53 .unlocked_ioctl = ncp_ioctl,
54#ifdef CONFIG_COMPAT 54#ifdef CONFIG_COMPAT
55 .compat_ioctl = ncp_compat_ioctl, 55 .compat_ioctl = ncp_compat_ioctl,
@@ -73,10 +73,8 @@ const struct inode_operations ncp_dir_inode_operations =
73 * Dentry operations routines 73 * Dentry operations routines
74 */ 74 */
75static int ncp_lookup_validate(struct dentry *, unsigned int); 75static int ncp_lookup_validate(struct dentry *, unsigned int);
76static int ncp_hash_dentry(const struct dentry *, const struct inode *, 76static int ncp_hash_dentry(const struct dentry *, struct qstr *);
77 struct qstr *); 77static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
78static int ncp_compare_dentry(const struct dentry *, const struct inode *,
79 const struct dentry *, const struct inode *,
80 unsigned int, const char *, const struct qstr *); 78 unsigned int, const char *, const struct qstr *);
81static int ncp_delete_dentry(const struct dentry *); 79static int ncp_delete_dentry(const struct dentry *);
82 80
@@ -119,11 +117,19 @@ static inline int ncp_case_sensitive(const struct inode *i)
119/* 117/*
120 * Note: leave the hash unchanged if the directory 118 * Note: leave the hash unchanged if the directory
121 * is case-sensitive. 119 * is case-sensitive.
120 *
121 * Accessing the parent inode can be racy under RCU pathwalking.
122 * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
123 * the callers will handle races.
122 */ 124 */
123static int 125static int
124ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, 126ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
125 struct qstr *this)
126{ 127{
128 struct inode *inode = ACCESS_ONCE(dentry->d_inode);
129
130 if (!inode)
131 return 0;
132
127 if (!ncp_case_sensitive(inode)) { 133 if (!ncp_case_sensitive(inode)) {
128 struct super_block *sb = dentry->d_sb; 134 struct super_block *sb = dentry->d_sb;
129 struct nls_table *t; 135 struct nls_table *t;
@@ -140,14 +146,24 @@ ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
140 return 0; 146 return 0;
141} 147}
142 148
149/*
150 * Accessing the parent inode can be racy under RCU pathwalking.
151 * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
152 * the callers will handle races.
153 */
143static int 154static int
144ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, 155ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
145 const struct dentry *dentry, const struct inode *inode,
146 unsigned int len, const char *str, const struct qstr *name) 156 unsigned int len, const char *str, const struct qstr *name)
147{ 157{
158 struct inode *pinode;
159
148 if (len != name->len) 160 if (len != name->len)
149 return 1; 161 return 1;
150 162
163 pinode = ACCESS_ONCE(parent->d_inode);
164 if (!pinode)
165 return 1;
166
151 if (ncp_case_sensitive(pinode)) 167 if (ncp_case_sensitive(pinode))
152 return strncmp(str, name->name, len); 168 return strncmp(str, name->name, len);
153 169
@@ -424,9 +440,9 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
424 return ncp_date_dos2unix(i.modifyTime, i.modifyDate); 440 return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
425} 441}
426 442
427static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) 443static int ncp_readdir(struct file *file, struct dir_context *ctx)
428{ 444{
429 struct dentry *dentry = filp->f_path.dentry; 445 struct dentry *dentry = file->f_path.dentry;
430 struct inode *inode = dentry->d_inode; 446 struct inode *inode = dentry->d_inode;
431 struct page *page = NULL; 447 struct page *page = NULL;
432 struct ncp_server *server = NCP_SERVER(inode); 448 struct ncp_server *server = NCP_SERVER(inode);
@@ -440,7 +456,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
440 456
441 DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n", 457 DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n",
442 dentry->d_parent->d_name.name, dentry->d_name.name, 458 dentry->d_parent->d_name.name, dentry->d_name.name,
443 (int) filp->f_pos); 459 (int) ctx->pos);
444 460
445 result = -EIO; 461 result = -EIO;
446 /* Do not generate '.' and '..' when server is dead. */ 462 /* Do not generate '.' and '..' when server is dead. */
@@ -448,16 +464,8 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
448 goto out; 464 goto out;
449 465
450 result = 0; 466 result = 0;
451 if (filp->f_pos == 0) { 467 if (!dir_emit_dots(file, ctx))
452 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) 468 goto out;
453 goto out;
454 filp->f_pos = 1;
455 }
456 if (filp->f_pos == 1) {
457 if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR))
458 goto out;
459 filp->f_pos = 2;
460 }
461 469
462 page = grab_cache_page(&inode->i_data, 0); 470 page = grab_cache_page(&inode->i_data, 0);
463 if (!page) 471 if (!page)
@@ -469,7 +477,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
469 if (!PageUptodate(page) || !ctl.head.eof) 477 if (!PageUptodate(page) || !ctl.head.eof)
470 goto init_cache; 478 goto init_cache;
471 479
472 if (filp->f_pos == 2) { 480 if (ctx->pos == 2) {
473 if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) 481 if (jiffies - ctl.head.time >= NCP_MAX_AGE(server))
474 goto init_cache; 482 goto init_cache;
475 483
@@ -479,10 +487,10 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
479 goto init_cache; 487 goto init_cache;
480 } 488 }
481 489
482 if (filp->f_pos > ctl.head.end) 490 if (ctx->pos > ctl.head.end)
483 goto finished; 491 goto finished;
484 492
485 ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2); 493 ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2);
486 ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; 494 ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE;
487 ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; 495 ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE;
488 496
@@ -497,21 +505,21 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
497 } 505 }
498 while (ctl.idx < NCP_DIRCACHE_SIZE) { 506 while (ctl.idx < NCP_DIRCACHE_SIZE) {
499 struct dentry *dent; 507 struct dentry *dent;
500 int res; 508 bool over;
501 509
502 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], 510 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx],
503 dentry, filp->f_pos); 511 dentry, ctx->pos);
504 if (!dent) 512 if (!dent)
505 goto invalid_cache; 513 goto invalid_cache;
506 res = filldir(dirent, dent->d_name.name, 514 over = !dir_emit(ctx, dent->d_name.name,
507 dent->d_name.len, filp->f_pos, 515 dent->d_name.len,
508 dent->d_inode->i_ino, DT_UNKNOWN); 516 dent->d_inode->i_ino, DT_UNKNOWN);
509 dput(dent); 517 dput(dent);
510 if (res) 518 if (over)
511 goto finished; 519 goto finished;
512 filp->f_pos += 1; 520 ctx->pos += 1;
513 ctl.idx += 1; 521 ctl.idx += 1;
514 if (filp->f_pos > ctl.head.end) 522 if (ctx->pos > ctl.head.end)
515 goto finished; 523 goto finished;
516 } 524 }
517 if (ctl.page) { 525 if (ctl.page) {
@@ -548,9 +556,9 @@ init_cache:
548 ctl.valid = 1; 556 ctl.valid = 1;
549read_really: 557read_really:
550 if (ncp_is_server_root(inode)) { 558 if (ncp_is_server_root(inode)) {
551 ncp_read_volume_list(filp, dirent, filldir, &ctl); 559 ncp_read_volume_list(file, ctx, &ctl);
552 } else { 560 } else {
553 ncp_do_readdir(filp, dirent, filldir, &ctl); 561 ncp_do_readdir(file, ctx, &ctl);
554 } 562 }
555 ctl.head.end = ctl.fpos - 1; 563 ctl.head.end = ctl.fpos - 1;
556 ctl.head.eof = ctl.valid; 564 ctl.head.eof = ctl.valid;
@@ -573,11 +581,11 @@ out:
573} 581}
574 582
575static int 583static int
576ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 584ncp_fill_cache(struct file *file, struct dir_context *ctx,
577 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, 585 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
578 int inval_childs) 586 int inval_childs)
579{ 587{
580 struct dentry *newdent, *dentry = filp->f_path.dentry; 588 struct dentry *newdent, *dentry = file->f_path.dentry;
581 struct inode *dir = dentry->d_inode; 589 struct inode *dir = dentry->d_inode;
582 struct ncp_cache_control ctl = *ctrl; 590 struct ncp_cache_control ctl = *ctrl;
583 struct qstr qname; 591 struct qstr qname;
@@ -666,15 +674,13 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
666end_advance: 674end_advance:
667 if (!valid) 675 if (!valid)
668 ctl.valid = 0; 676 ctl.valid = 0;
669 if (!ctl.filled && (ctl.fpos == filp->f_pos)) { 677 if (!ctl.filled && (ctl.fpos == ctx->pos)) {
670 if (!ino)
671 ino = find_inode_number(dentry, &qname);
672 if (!ino) 678 if (!ino)
673 ino = iunique(dir->i_sb, 2); 679 ino = iunique(dir->i_sb, 2);
674 ctl.filled = filldir(dirent, qname.name, qname.len, 680 ctl.filled = !dir_emit(ctx, qname.name, qname.len,
675 filp->f_pos, ino, DT_UNKNOWN); 681 ino, DT_UNKNOWN);
676 if (!ctl.filled) 682 if (!ctl.filled)
677 filp->f_pos += 1; 683 ctx->pos += 1;
678 } 684 }
679 ctl.fpos += 1; 685 ctl.fpos += 1;
680 ctl.idx += 1; 686 ctl.idx += 1;
@@ -683,10 +689,10 @@ end_advance:
683} 689}
684 690
685static void 691static void
686ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, 692ncp_read_volume_list(struct file *file, struct dir_context *ctx,
687 struct ncp_cache_control *ctl) 693 struct ncp_cache_control *ctl)
688{ 694{
689 struct dentry *dentry = filp->f_path.dentry; 695 struct dentry *dentry = file->f_path.dentry;
690 struct inode *inode = dentry->d_inode; 696 struct inode *inode = dentry->d_inode;
691 struct ncp_server *server = NCP_SERVER(inode); 697 struct ncp_server *server = NCP_SERVER(inode);
692 struct ncp_volume_info info; 698 struct ncp_volume_info info;
@@ -694,7 +700,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
694 int i; 700 int i;
695 701
696 DPRINTK("ncp_read_volume_list: pos=%ld\n", 702 DPRINTK("ncp_read_volume_list: pos=%ld\n",
697 (unsigned long) filp->f_pos); 703 (unsigned long) ctx->pos);
698 704
699 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { 705 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
700 int inval_dentry; 706 int inval_dentry;
@@ -715,16 +721,16 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
715 } 721 }
716 inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); 722 inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
717 entry.volume = entry.i.volNumber; 723 entry.volume = entry.i.volNumber;
718 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry)) 724 if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry))
719 return; 725 return;
720 } 726 }
721} 727}
722 728
723static void 729static void
724ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, 730ncp_do_readdir(struct file *file, struct dir_context *ctx,
725 struct ncp_cache_control *ctl) 731 struct ncp_cache_control *ctl)
726{ 732{
727 struct dentry *dentry = filp->f_path.dentry; 733 struct dentry *dentry = file->f_path.dentry;
728 struct inode *dir = dentry->d_inode; 734 struct inode *dir = dentry->d_inode;
729 struct ncp_server *server = NCP_SERVER(dir); 735 struct ncp_server *server = NCP_SERVER(dir);
730 struct nw_search_sequence seq; 736 struct nw_search_sequence seq;
@@ -736,7 +742,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
736 742
737 DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n", 743 DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n",
738 dentry->d_parent->d_name.name, dentry->d_name.name, 744 dentry->d_parent->d_name.name, dentry->d_name.name,
739 (unsigned long) filp->f_pos); 745 (unsigned long) ctx->pos);
740 PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n", 746 PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n",
741 dentry->d_name.name, NCP_FINFO(dir)->volNumber, 747 dentry->d_name.name, NCP_FINFO(dir)->volNumber,
742 NCP_FINFO(dir)->dirEntNum); 748 NCP_FINFO(dir)->dirEntNum);
@@ -778,7 +784,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
778 rpl += onerpl; 784 rpl += onerpl;
779 rpls -= onerpl; 785 rpls -= onerpl;
780 entry.volume = entry.i.volNumber; 786 entry.volume = entry.i.volNumber;
781 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0)) 787 if (!ncp_fill_cache(file, ctx, ctl, &entry, 0))
782 break; 788 break;
783 } 789 }
784 } while (more); 790 } while (more);
@@ -1029,15 +1035,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
1029 DPRINTK("ncp_rmdir: removing %s/%s\n", 1035 DPRINTK("ncp_rmdir: removing %s/%s\n",
1030 dentry->d_parent->d_name.name, dentry->d_name.name); 1036 dentry->d_parent->d_name.name, dentry->d_name.name);
1031 1037
1032 /*
1033 * fail with EBUSY if there are still references to this
1034 * directory.
1035 */
1036 dentry_unhash(dentry);
1037 error = -EBUSY;
1038 if (!d_unhashed(dentry))
1039 goto out;
1040
1041 len = sizeof(__name); 1038 len = sizeof(__name);
1042 error = ncp_io2vol(server, __name, &len, dentry->d_name.name, 1039 error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
1043 dentry->d_name.len, !ncp_preserve_case(dir)); 1040 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -1140,17 +1137,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1140 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1137 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1141 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1138 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1142 1139
1143 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
1144 /*
1145 * fail with EBUSY if there are still references to this
1146 * directory.
1147 */
1148 dentry_unhash(new_dentry);
1149 error = -EBUSY;
1150 if (!d_unhashed(new_dentry))
1151 goto out;
1152 }
1153
1154 ncp_age_dentry(server, old_dentry); 1140 ncp_age_dentry(server, old_dentry);
1155 ncp_age_dentry(server, new_dentry); 1141 ncp_age_dentry(server, new_dentry);
1156 1142
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 26910c8154da..0765ad12c382 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -891,6 +891,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
891 if (!server) /* How this could happen? */ 891 if (!server) /* How this could happen? */
892 goto out; 892 goto out;
893 893
894 result = -EPERM;
895 if (IS_DEADDIR(dentry->d_inode))
896 goto out;
897
894 /* ageing the dentry to force validation */ 898 /* ageing the dentry to force validation */
895 ncp_age_dentry(server, dentry); 899 ncp_age_dentry(server, dentry);
896 900
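
The ncpfs changes above track a VFS-wide prototype change: ->d_hash() and ->d_compare() no longer receive inode pointers, so a filesystem that needs the inode (here, to decide case sensitivity) must read dentry->d_inode itself with ACCESS_ONCE() and tolerate NULL during RCU walk. For a filesystem that simply case-folds every name, the new-style hash looks like the following generic illustration (myfs_d_hash() is not ncpfs code):

static int myfs_d_hash(const struct dentry *dentry, struct qstr *q)
{
	unsigned long hash = init_name_hash();
	unsigned int i;

	for (i = 0; i < q->len; i++)
		hash = partial_name_hash(tolower(q->name[i]), hash);
	q->hash = end_name_hash(hash);
	return 0;
}
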
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a13d26ede254..0bc27684ebfa 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -414,7 +414,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
414 414
415 spin_lock(&tbl->slot_tbl_lock); 415 spin_lock(&tbl->slot_tbl_lock);
416 /* state manager is resetting the session */ 416 /* state manager is resetting the session */
417 if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { 417 if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
418 spin_unlock(&tbl->slot_tbl_lock); 418 spin_unlock(&tbl->slot_tbl_lock);
419 status = htonl(NFS4ERR_DELAY); 419 status = htonl(NFS4ERR_DELAY);
420 /* Return NFS4ERR_BADSESSION if we're draining the session 420 /* Return NFS4ERR_BADSESSION if we're draining the session
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 59461c957d9d..a35582c9d444 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -763,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
763 * A single slot, so highest used slotid is either 0 or -1 763 * A single slot, so highest used slotid is either 0 or -1
764 */ 764 */
765 tbl->highest_used_slotid = NFS4_NO_SLOT; 765 tbl->highest_used_slotid = NFS4_NO_SLOT;
766 nfs4_session_drain_complete(session, tbl); 766 nfs4_slot_tbl_drain_complete(tbl);
767 spin_unlock(&tbl->slot_tbl_lock); 767 spin_unlock(&tbl->slot_tbl_lock);
768} 768}
769 769
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 57db3244f4d9..7ec4814e298d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -73,20 +73,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
73 if (inode->i_flock == NULL) 73 if (inode->i_flock == NULL)
74 goto out; 74 goto out;
75 75
76 /* Protect inode->i_flock using the file locks lock */ 76 /* Protect inode->i_flock using the i_lock */
77 lock_flocks(); 77 spin_lock(&inode->i_lock);
78 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 78 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
79 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 79 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
80 continue; 80 continue;
81 if (nfs_file_open_context(fl->fl_file) != ctx) 81 if (nfs_file_open_context(fl->fl_file) != ctx)
82 continue; 82 continue;
83 unlock_flocks(); 83 spin_unlock(&inode->i_lock);
84 status = nfs4_lock_delegation_recall(fl, state, stateid); 84 status = nfs4_lock_delegation_recall(fl, state, stateid);
85 if (status < 0) 85 if (status < 0)
86 goto out; 86 goto out;
87 lock_flocks(); 87 spin_lock(&inode->i_lock);
88 } 88 }
89 unlock_flocks(); 89 spin_unlock(&inode->i_lock);
90out: 90out:
91 return status; 91 return status;
92} 92}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e093e73178b7..5d051419527b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -46,7 +46,7 @@
46 46
47static int nfs_opendir(struct inode *, struct file *); 47static int nfs_opendir(struct inode *, struct file *);
48static int nfs_closedir(struct inode *, struct file *); 48static int nfs_closedir(struct inode *, struct file *);
49static int nfs_readdir(struct file *, void *, filldir_t); 49static int nfs_readdir(struct file *, struct dir_context *);
50static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); 50static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
51static loff_t nfs_llseek_dir(struct file *, loff_t, int); 51static loff_t nfs_llseek_dir(struct file *, loff_t, int);
52static void nfs_readdir_clear_array(struct page*); 52static void nfs_readdir_clear_array(struct page*);
@@ -54,7 +54,7 @@ static void nfs_readdir_clear_array(struct page*);
54const struct file_operations nfs_dir_operations = { 54const struct file_operations nfs_dir_operations = {
55 .llseek = nfs_llseek_dir, 55 .llseek = nfs_llseek_dir,
56 .read = generic_read_dir, 56 .read = generic_read_dir,
57 .readdir = nfs_readdir, 57 .iterate = nfs_readdir,
58 .open = nfs_opendir, 58 .open = nfs_opendir,
59 .release = nfs_closedir, 59 .release = nfs_closedir,
60 .fsync = nfs_fsync_dir, 60 .fsync = nfs_fsync_dir,
@@ -147,6 +147,7 @@ typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
147typedef struct { 147typedef struct {
148 struct file *file; 148 struct file *file;
149 struct page *page; 149 struct page *page;
150 struct dir_context *ctx;
150 unsigned long page_index; 151 unsigned long page_index;
151 u64 *dir_cookie; 152 u64 *dir_cookie;
152 u64 last_cookie; 153 u64 last_cookie;
@@ -252,7 +253,7 @@ out:
252static 253static
253int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 254int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
254{ 255{
255 loff_t diff = desc->file->f_pos - desc->current_index; 256 loff_t diff = desc->ctx->pos - desc->current_index;
256 unsigned int index; 257 unsigned int index;
257 258
258 if (diff < 0) 259 if (diff < 0)
@@ -289,7 +290,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
289 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { 290 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
290 ctx->duped = 0; 291 ctx->duped = 0;
291 ctx->attr_gencount = nfsi->attr_gencount; 292 ctx->attr_gencount = nfsi->attr_gencount;
292 } else if (new_pos < desc->file->f_pos) { 293 } else if (new_pos < desc->ctx->pos) {
293 if (ctx->duped > 0 294 if (ctx->duped > 0
294 && ctx->dup_cookie == *desc->dir_cookie) { 295 && ctx->dup_cookie == *desc->dir_cookie) {
295 if (printk_ratelimit()) { 296 if (printk_ratelimit()) {
@@ -307,7 +308,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
307 ctx->dup_cookie = *desc->dir_cookie; 308 ctx->dup_cookie = *desc->dir_cookie;
308 ctx->duped = -1; 309 ctx->duped = -1;
309 } 310 }
310 desc->file->f_pos = new_pos; 311 desc->ctx->pos = new_pos;
311 desc->cache_entry_index = i; 312 desc->cache_entry_index = i;
312 return 0; 313 return 0;
313 } 314 }
@@ -405,13 +406,13 @@ different:
405} 406}
406 407
407static 408static
408bool nfs_use_readdirplus(struct inode *dir, struct file *filp) 409bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
409{ 410{
410 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) 411 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
411 return false; 412 return false;
412 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) 413 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
413 return true; 414 return true;
414 if (filp->f_pos == 0) 415 if (ctx->pos == 0)
415 return true; 416 return true;
416 return false; 417 return false;
417} 418}
@@ -702,8 +703,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
702 * Once we've found the start of the dirent within a page: fill 'er up... 703 * Once we've found the start of the dirent within a page: fill 'er up...
703 */ 704 */
704static 705static
705int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, 706int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
706 filldir_t filldir)
707{ 707{
708 struct file *file = desc->file; 708 struct file *file = desc->file;
709 int i = 0; 709 int i = 0;
@@ -721,13 +721,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
721 struct nfs_cache_array_entry *ent; 721 struct nfs_cache_array_entry *ent;
722 722
723 ent = &array->array[i]; 723 ent = &array->array[i];
724 if (filldir(dirent, ent->string.name, ent->string.len, 724 if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
725 file->f_pos, nfs_compat_user_ino64(ent->ino), 725 nfs_compat_user_ino64(ent->ino), ent->d_type)) {
726 ent->d_type) < 0) {
727 desc->eof = 1; 726 desc->eof = 1;
728 break; 727 break;
729 } 728 }
730 file->f_pos++; 729 desc->ctx->pos++;
731 if (i < (array->size-1)) 730 if (i < (array->size-1))
732 *desc->dir_cookie = array->array[i+1].cookie; 731 *desc->dir_cookie = array->array[i+1].cookie;
733 else 732 else
@@ -759,8 +758,7 @@ out:
759 * directory in the page cache by the time we get here. 758 * directory in the page cache by the time we get here.
760 */ 759 */
761static inline 760static inline
762int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 761int uncached_readdir(nfs_readdir_descriptor_t *desc)
763 filldir_t filldir)
764{ 762{
765 struct page *page = NULL; 763 struct page *page = NULL;
766 int status; 764 int status;
@@ -785,7 +783,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
785 if (status < 0) 783 if (status < 0)
786 goto out_release; 784 goto out_release;
787 785
788 status = nfs_do_filldir(desc, dirent, filldir); 786 status = nfs_do_filldir(desc);
789 787
790 out: 788 out:
791 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 789 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
@@ -800,35 +798,36 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
800 last cookie cache takes care of the common case of reading the 798 last cookie cache takes care of the common case of reading the
801 whole directory. 799 whole directory.
802 */ 800 */
803static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 801static int nfs_readdir(struct file *file, struct dir_context *ctx)
804{ 802{
805 struct dentry *dentry = filp->f_path.dentry; 803 struct dentry *dentry = file->f_path.dentry;
806 struct inode *inode = dentry->d_inode; 804 struct inode *inode = dentry->d_inode;
807 nfs_readdir_descriptor_t my_desc, 805 nfs_readdir_descriptor_t my_desc,
808 *desc = &my_desc; 806 *desc = &my_desc;
809 struct nfs_open_dir_context *dir_ctx = filp->private_data; 807 struct nfs_open_dir_context *dir_ctx = file->private_data;
810 int res; 808 int res;
811 809
812 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 810 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
813 dentry->d_parent->d_name.name, dentry->d_name.name, 811 dentry->d_parent->d_name.name, dentry->d_name.name,
814 (long long)filp->f_pos); 812 (long long)ctx->pos);
815 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); 813 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
816 814
817 /* 815 /*
818 * filp->f_pos points to the dirent entry number. 816 * ctx->pos points to the dirent entry number.
819 * *desc->dir_cookie has the cookie for the next entry. We have 817 * *desc->dir_cookie has the cookie for the next entry. We have
820 * to either find the entry with the appropriate number or 818 * to either find the entry with the appropriate number or
821 * revalidate the cookie. 819 * revalidate the cookie.
822 */ 820 */
823 memset(desc, 0, sizeof(*desc)); 821 memset(desc, 0, sizeof(*desc));
824 822
825 desc->file = filp; 823 desc->file = file;
824 desc->ctx = ctx;
826 desc->dir_cookie = &dir_ctx->dir_cookie; 825 desc->dir_cookie = &dir_ctx->dir_cookie;
827 desc->decode = NFS_PROTO(inode)->decode_dirent; 826 desc->decode = NFS_PROTO(inode)->decode_dirent;
828 desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0; 827 desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
829 828
830 nfs_block_sillyrename(dentry); 829 nfs_block_sillyrename(dentry);
831 res = nfs_revalidate_mapping(inode, filp->f_mapping); 830 res = nfs_revalidate_mapping(inode, file->f_mapping);
832 if (res < 0) 831 if (res < 0)
833 goto out; 832 goto out;
834 833
@@ -840,7 +839,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
840 /* This means either end of directory */ 839 /* This means either end of directory */
841 if (*desc->dir_cookie && desc->eof == 0) { 840 if (*desc->dir_cookie && desc->eof == 0) {
842 /* Or that the server has 'lost' a cookie */ 841 /* Or that the server has 'lost' a cookie */
843 res = uncached_readdir(desc, dirent, filldir); 842 res = uncached_readdir(desc);
844 if (res == 0) 843 if (res == 0)
845 continue; 844 continue;
846 } 845 }
@@ -857,7 +856,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
857 if (res < 0) 856 if (res < 0)
858 break; 857 break;
859 858
860 res = nfs_do_filldir(desc, dirent, filldir); 859 res = nfs_do_filldir(desc);
861 if (res < 0) 860 if (res < 0)
862 break; 861 break;
863 } while (!desc->eof); 862 } while (!desc->eof);
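
One detail worth calling out in the fs/nfs/dir.c conversion (and in the others in this series): the stop condition inverts. filldir() signalled a full buffer with a negative return, while dir_emit() returns a boolean that is false when the entry did not fit. A hypothetical helper, myfs_emit_one(), makes the translation explicit:

static int myfs_emit_one(struct dir_context *ctx, const char *name,
			 int len, u64 ino, unsigned int type)
{
	/*
	 * Old code tested filldir(...) < 0; new code tests !dir_emit(...).
	 * The position also now lives in ctx->pos instead of being passed
	 * with every entry, and is only bumped on success.
	 */
	if (!dir_emit(ctx, name, len, ino, type))
		return 0;	/* stop: resume here on the next getdents() */
	ctx->pos++;
	return 1;		/* keep going */
}
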
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a87a44f84113..6b4a79f4ad1d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
451 * - Called if either PG_private or PG_fscache is set on the page 451 * - Called if either PG_private or PG_fscache is set on the page
452 * - Caller holds page lock 452 * - Caller holds page lock
453 */ 453 */
454static void nfs_invalidate_page(struct page *page, unsigned long offset) 454static void nfs_invalidate_page(struct page *page, unsigned int offset,
455 unsigned int length)
455{ 456{
456 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 457 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
458 page, offset, length);
457 459
458 if (offset != 0) 460 if (offset != 0 || length < PAGE_CACHE_SIZE)
459 return; 461 return;
460 /* Cancel any unstarted writes on this page */ 462 /* Cancel any unstarted writes on this page */
461 nfs_wb_page_cancel(page_file_mapping(page)->host, page); 463 nfs_wb_page_cancel(page_file_mapping(page)->host, page);
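
nfs_invalidate_page() above picks up another cross-tree interface change: ->invalidatepage() now receives an (offset, length) pair describing the byte range being invalidated, so a page is fully gone only when offset == 0 and length reaches PAGE_CACHE_SIZE. A sketch of the contract for a hypothetical filesystem (myfs_forget_page() is an assumed teardown helper):

static void myfs_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	/* partial invalidation: keep the page's private state around */
	if (offset != 0 || length < PAGE_CACHE_SIZE)
		return;
	/* the whole page is being punched out: release private data */
	myfs_forget_page(page);
}
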
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 947b0c908aa9..4cbad5d6b276 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -203,7 +203,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
203 __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); 203 __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
204 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); 204 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
205 if (error == -EINVAL) 205 if (error == -EINVAL)
206 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_NULL); 206 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
207 if (error < 0) 207 if (error < 0)
208 goto error; 208 goto error;
209 209
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8fbc10054115..d7ba5616989c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -572,7 +572,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
572 task->tk_timeout = 0; 572 task->tk_timeout = 0;
573 573
574 spin_lock(&tbl->slot_tbl_lock); 574 spin_lock(&tbl->slot_tbl_lock);
575 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && 575 if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) &&
576 !args->sa_privileged) { 576 !args->sa_privileged) {
577 /* The state manager will wait until the slot table is empty */ 577 /* The state manager will wait until the slot table is empty */
578 dprintk("%s session is draining\n", __func__); 578 dprintk("%s session is draining\n", __func__);
@@ -1078,7 +1078,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
1078 struct nfs4_state *state = opendata->state; 1078 struct nfs4_state *state = opendata->state;
1079 struct nfs_inode *nfsi = NFS_I(state->inode); 1079 struct nfs_inode *nfsi = NFS_I(state->inode);
1080 struct nfs_delegation *delegation; 1080 struct nfs_delegation *delegation;
1081 int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC); 1081 int open_mode = opendata->o_arg.open_flags;
1082 fmode_t fmode = opendata->o_arg.fmode; 1082 fmode_t fmode = opendata->o_arg.fmode;
1083 nfs4_stateid stateid; 1083 nfs4_stateid stateid;
1084 int ret = -EAGAIN; 1084 int ret = -EAGAIN;
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index ebda5f4a031b..c4e225e4a9af 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -73,7 +73,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
73 tbl->highest_used_slotid = new_max; 73 tbl->highest_used_slotid = new_max;
74 else { 74 else {
75 tbl->highest_used_slotid = NFS4_NO_SLOT; 75 tbl->highest_used_slotid = NFS4_NO_SLOT;
76 nfs4_session_drain_complete(tbl->session, tbl); 76 nfs4_slot_tbl_drain_complete(tbl);
77 } 77 }
78 } 78 }
79 dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, 79 dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
@@ -226,7 +226,7 @@ static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
226 struct nfs4_slot *slot = pslot; 226 struct nfs4_slot *slot = pslot;
227 struct nfs4_slot_table *tbl = slot->table; 227 struct nfs4_slot_table *tbl = slot->table;
228 228
229 if (nfs4_session_draining(tbl->session) && !args->sa_privileged) 229 if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
230 return false; 230 return false;
231 slot->generation = tbl->generation; 231 slot->generation = tbl->generation;
232 args->sa_slot = slot; 232 args->sa_slot = slot;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 6f3cb39386d4..ff7d9f0f8a65 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -25,6 +25,10 @@ struct nfs4_slot {
25}; 25};
26 26
27/* Sessions */ 27/* Sessions */
28enum nfs4_slot_tbl_state {
29 NFS4_SLOT_TBL_DRAINING,
30};
31
28#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) 32#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long))
29struct nfs4_slot_table { 33struct nfs4_slot_table {
30 struct nfs4_session *session; /* Parent session */ 34 struct nfs4_session *session; /* Parent session */
@@ -43,6 +47,7 @@ struct nfs4_slot_table {
43 unsigned long generation; /* Generation counter for 47 unsigned long generation; /* Generation counter for
44 target_highest_slotid */ 48 target_highest_slotid */
45 struct completion complete; 49 struct completion complete;
50 unsigned long slot_tbl_state;
46}; 51};
47 52
48/* 53/*
@@ -68,7 +73,6 @@ struct nfs4_session {
68 73
69enum nfs4_session_state { 74enum nfs4_session_state {
70 NFS4_SESSION_INITING, 75 NFS4_SESSION_INITING,
71 NFS4_SESSION_DRAINING,
72}; 76};
73 77
74#if defined(CONFIG_NFS_V4_1) 78#if defined(CONFIG_NFS_V4_1)
@@ -88,12 +92,11 @@ extern void nfs4_destroy_session(struct nfs4_session *session);
88extern int nfs4_init_session(struct nfs_server *server); 92extern int nfs4_init_session(struct nfs_server *server);
89extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); 93extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
90 94
91extern void nfs4_session_drain_complete(struct nfs4_session *session, 95extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
92 struct nfs4_slot_table *tbl);
93 96
94static inline bool nfs4_session_draining(struct nfs4_session *session) 97static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
95{ 98{
96 return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); 99 return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
97} 100}
98 101
99bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, 102bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
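
The NFSv4.1 hunks in this region move the DRAINING flag from the session to the individual slot table, so the back and fore channels can drain independently; nfs4_drain_slot_tbl() in fs/nfs/nfs4state.c below now both sets the bit and waits. Condensed to its core (a simplified restatement under an assumed name, not a verbatim copy):

static int drain_one_slot_table(struct nfs4_slot_table *tbl)
{
	/* new requests see this bit and back off (nfs41_setup_sequence) */
	set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
	spin_lock(&tbl->slot_tbl_lock);
	if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
		INIT_COMPLETION(tbl->complete);
		spin_unlock(&tbl->slot_tbl_lock);
		/* nfs4_slot_tbl_drain_complete() completes this when the
		 * last outstanding slot is freed */
		return wait_for_completion_interruptible(&tbl->complete);
	}
	spin_unlock(&tbl->slot_tbl_lock);
	return 0;
}
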
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 300d17d85c0e..ff10b4aa534c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -241,7 +241,7 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
241 if (ses == NULL) 241 if (ses == NULL)
242 return; 242 return;
243 tbl = &ses->fc_slot_table; 243 tbl = &ses->fc_slot_table;
244 if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { 244 if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
245 spin_lock(&tbl->slot_tbl_lock); 245 spin_lock(&tbl->slot_tbl_lock);
246 nfs41_wake_slot_table(tbl); 246 nfs41_wake_slot_table(tbl);
247 spin_unlock(&tbl->slot_tbl_lock); 247 spin_unlock(&tbl->slot_tbl_lock);
@@ -251,15 +251,15 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
251/* 251/*
252 * Signal state manager thread if session fore channel is drained 252 * Signal state manager thread if session fore channel is drained
253 */ 253 */
254void nfs4_session_drain_complete(struct nfs4_session *session, 254void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
255 struct nfs4_slot_table *tbl)
256{ 255{
257 if (nfs4_session_draining(session)) 256 if (nfs4_slot_tbl_draining(tbl))
258 complete(&tbl->complete); 257 complete(&tbl->complete);
259} 258}
260 259
261static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) 260static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
262{ 261{
262 set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
263 spin_lock(&tbl->slot_tbl_lock); 263 spin_lock(&tbl->slot_tbl_lock);
264 if (tbl->highest_used_slotid != NFS4_NO_SLOT) { 264 if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
265 INIT_COMPLETION(tbl->complete); 265 INIT_COMPLETION(tbl->complete);
@@ -275,13 +275,12 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
275 struct nfs4_session *ses = clp->cl_session; 275 struct nfs4_session *ses = clp->cl_session;
276 int ret = 0; 276 int ret = 0;
277 277
278 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
279 /* back channel */ 278 /* back channel */
280 ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); 279 ret = nfs4_drain_slot_tbl(&ses->bc_slot_table);
281 if (ret) 280 if (ret)
282 return ret; 281 return ret;
283 /* fore channel */ 282 /* fore channel */
284 return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); 283 return nfs4_drain_slot_tbl(&ses->fc_slot_table);
285} 284}
286 285
287static void nfs41_finish_session_reset(struct nfs_client *clp) 286static void nfs41_finish_session_reset(struct nfs_client *clp)
@@ -1374,13 +1373,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1374 /* Guard against delegation returns and new lock/unlock calls */ 1373 /* Guard against delegation returns and new lock/unlock calls */
1375 down_write(&nfsi->rwsem); 1374 down_write(&nfsi->rwsem);
1376 /* Protect inode->i_flock using the BKL */ 1375 /* Protect inode->i_flock using the BKL */
1377 lock_flocks(); 1376 spin_lock(&inode->i_lock);
1378 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1377 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1379 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 1378 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
1380 continue; 1379 continue;
1381 if (nfs_file_open_context(fl->fl_file)->state != state) 1380 if (nfs_file_open_context(fl->fl_file)->state != state)
1382 continue; 1381 continue;
1383 unlock_flocks(); 1382 spin_unlock(&inode->i_lock);
1384 status = ops->recover_lock(state, fl); 1383 status = ops->recover_lock(state, fl);
1385 switch (status) { 1384 switch (status) {
1386 case 0: 1385 case 0:
@@ -1407,9 +1406,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1407 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 1406 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
1408 status = 0; 1407 status = 0;
1409 } 1408 }
1410 lock_flocks(); 1409 spin_lock(&inode->i_lock);
1411 } 1410 }
1412 unlock_flocks(); 1411 spin_unlock(&inode->i_lock);
1413out: 1412out:
1414 up_write(&nfsi->rwsem); 1413 up_write(&nfsi->rwsem);
1415 return status; 1414 return status;
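The nfs4state.c hunks above are part of the lock_flocks() removal: the file-lock list hanging off inode->i_flock is now protected by the per-inode i_lock spinlock rather than the old BKL-backed global lock. Because recover_lock() can sleep, the lock is dropped around it and retaken before the walk continues. A hedged sketch of that pattern, simplified from nfs4_reclaim_locks(); do_something_blocking() is a placeholder for any sleeping call, and the real code does fuller status handling than the break shown here:

static int example_walk_flock_list(struct inode *inode)
{
	struct file_lock *fl;
	int status = 0;

	spin_lock(&inode->i_lock);
	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
		if (!(fl->fl_flags & (FL_POSIX | FL_FLOCK)))
			continue;
		/* A spinlock cannot be held across a sleeping call. */
		spin_unlock(&inode->i_lock);
		status = do_something_blocking(fl);
		spin_lock(&inode->i_lock);
		if (status)
			break;
	}
	spin_unlock(&inode->i_lock);
	return status;
}

The list may change while the lock is dropped; like the original, the sketch accepts that and simply continues from fl->fl_next.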
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a366107a7331..2d7525fbcf25 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1942,6 +1942,7 @@ static int nfs23_validate_mount_data(void *options,
1942 args->namlen = data->namlen; 1942 args->namlen = data->namlen;
1943 args->bsize = data->bsize; 1943 args->bsize = data->bsize;
1944 1944
1945 args->auth_flavors[0] = RPC_AUTH_UNIX;
1945 if (data->flags & NFS_MOUNT_SECFLAVOUR) 1946 if (data->flags & NFS_MOUNT_SECFLAVOUR)
1946 args->auth_flavors[0] = data->pseudoflavor; 1947 args->auth_flavors[0] = data->pseudoflavor;
1947 if (!args->nfs_server.hostname) 1948 if (!args->nfs_server.hostname)
@@ -2637,6 +2638,7 @@ static int nfs4_validate_mount_data(void *options,
2637 goto out_no_address; 2638 goto out_no_address;
2638 args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); 2639 args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
2639 2640
2641 args->auth_flavors[0] = RPC_AUTH_UNIX;
2640 if (data->auth_flavourlen) { 2642 if (data->auth_flavourlen) {
2641 if (data->auth_flavourlen > 1) 2643 if (data->auth_flavourlen > 1)
2642 goto out_inval_auth; 2644 goto out_inval_auth;
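Both mount-data validators in fs/nfs/super.c now seed the auth flavour before looking at the caller-supplied data, so a mount request that names no security flavour defaults to AUTH_SYS instead of whatever the field happened to contain. The same code as the hunks above, annotated only to make the ordering explicit:

args->auth_flavors[0] = RPC_AUTH_UNIX;			/* default: AUTH_SYS */
if (data->flags & NFS_MOUNT_SECFLAVOUR)
	args->auth_flavors[0] = data->pseudoflavor;	/* explicit flavour wins */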
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 4e9a21db867a..105a3b080d12 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -240,11 +240,16 @@ struct name_list {
240 struct list_head list; 240 struct list_head list;
241}; 241};
242 242
243struct nfs4_dir_ctx {
244 struct dir_context ctx;
245 struct list_head names;
246};
247
243static int 248static int
244nfsd4_build_namelist(void *arg, const char *name, int namlen, 249nfsd4_build_namelist(void *arg, const char *name, int namlen,
245 loff_t offset, u64 ino, unsigned int d_type) 250 loff_t offset, u64 ino, unsigned int d_type)
246{ 251{
247 struct list_head *names = arg; 252 struct nfs4_dir_ctx *ctx = arg;
248 struct name_list *entry; 253 struct name_list *entry;
249 254
250 if (namlen != HEXDIR_LEN - 1) 255 if (namlen != HEXDIR_LEN - 1)
@@ -254,7 +259,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
254 return -ENOMEM; 259 return -ENOMEM;
255 memcpy(entry->name, name, HEXDIR_LEN - 1); 260 memcpy(entry->name, name, HEXDIR_LEN - 1);
256 entry->name[HEXDIR_LEN - 1] = '\0'; 261 entry->name[HEXDIR_LEN - 1] = '\0';
257 list_add(&entry->list, names); 262 list_add(&entry->list, &ctx->names);
258 return 0; 263 return 0;
259} 264}
260 265
@@ -263,7 +268,10 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
263{ 268{
264 const struct cred *original_cred; 269 const struct cred *original_cred;
265 struct dentry *dir = nn->rec_file->f_path.dentry; 270 struct dentry *dir = nn->rec_file->f_path.dentry;
266 LIST_HEAD(names); 271 struct nfs4_dir_ctx ctx = {
272 .ctx.actor = nfsd4_build_namelist,
273 .names = LIST_HEAD_INIT(ctx.names)
274 };
267 int status; 275 int status;
268 276
269 status = nfs4_save_creds(&original_cred); 277 status = nfs4_save_creds(&original_cred);
@@ -276,11 +284,11 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
276 return status; 284 return status;
277 } 285 }
278 286
279 status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names); 287 status = iterate_dir(nn->rec_file, &ctx.ctx);
280 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 288 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
281 while (!list_empty(&names)) { 289 while (!list_empty(&ctx.names)) {
282 struct name_list *entry; 290 struct name_list *entry;
283 entry = list_entry(names.next, struct name_list, list); 291 entry = list_entry(ctx.names.next, struct name_list, list);
284 if (!status) { 292 if (!status) {
285 struct dentry *dentry; 293 struct dentry *dentry;
286 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); 294 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
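The nfs4recover.c conversion shows the new readdir-callback contract: vfs_readdir(file, actor, cookie) becomes iterate_dir(file, &ctx), and the private cookie is now a structure whose first member is the struct dir_context, so the actor can recover its state from the void * argument it is handed. A sketch of the shape with hypothetical example_* names, mirroring the nfs4_dir_ctx hunk above:

#include <linux/fs.h>
#include <linux/list.h>

struct example_dir_ctx {
	struct dir_context ctx;		/* must be the first member */
	struct list_head names;		/* private state for the actor */
};

static int example_actor(void *arg, const char *name, int namlen,
			 loff_t offset, u64 ino, unsigned int d_type)
{
	struct example_dir_ctx *ectx = arg;

	/* ... record (name, ino) on ectx->names ... */
	return 0;			/* non-zero stops the iteration */
}

static int example_list_dir(struct file *dir)
{
	struct example_dir_ctx ectx = {
		.ctx.actor = example_actor,
		.names = LIST_HEAD_INIT(ectx.names)
	};

	return iterate_dir(dir, &ectx.ctx);
}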
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 316ec843dec2..f17051838b41 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2645,13 +2645,13 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
2645 2645
2646 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); 2646 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
2647 2647
2648 /* only place dl_time is set. protected by lock_flocks*/ 2648 /* Only place dl_time is set; protected by i_lock: */
2649 dp->dl_time = get_seconds(); 2649 dp->dl_time = get_seconds();
2650 2650
2651 nfsd4_cb_recall(dp); 2651 nfsd4_cb_recall(dp);
2652} 2652}
2653 2653
2654/* Called from break_lease() with lock_flocks() held. */ 2654/* Called from break_lease() with i_lock held. */
2655static void nfsd_break_deleg_cb(struct file_lock *fl) 2655static void nfsd_break_deleg_cb(struct file_lock *fl)
2656{ 2656{
2657 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; 2657 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
@@ -4520,7 +4520,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4520 struct inode *inode = filp->fi_inode; 4520 struct inode *inode = filp->fi_inode;
4521 int status = 0; 4521 int status = 0;
4522 4522
4523 lock_flocks(); 4523 spin_lock(&inode->i_lock);
4524 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 4524 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
4525 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 4525 if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
4526 status = 1; 4526 status = 1;
@@ -4528,7 +4528,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4528 } 4528 }
4529 } 4529 }
4530out: 4530out:
4531 unlock_flocks(); 4531 spin_unlock(&inode->i_lock);
4532 return status; 4532 return status;
4533} 4533}
4534 4534
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 84ce601d8063..a6bc8a7423db 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1912,6 +1912,7 @@ struct buffered_dirent {
1912}; 1912};
1913 1913
1914struct readdir_data { 1914struct readdir_data {
1915 struct dir_context ctx;
1915 char *dirent; 1916 char *dirent;
1916 size_t used; 1917 size_t used;
1917 int full; 1918 int full;
@@ -1943,13 +1944,15 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
1943static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, 1944static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
1944 struct readdir_cd *cdp, loff_t *offsetp) 1945 struct readdir_cd *cdp, loff_t *offsetp)
1945{ 1946{
1946 struct readdir_data buf;
1947 struct buffered_dirent *de; 1947 struct buffered_dirent *de;
1948 int host_err; 1948 int host_err;
1949 int size; 1949 int size;
1950 loff_t offset; 1950 loff_t offset;
1951 struct readdir_data buf = {
1952 .ctx.actor = nfsd_buffered_filldir,
1953 .dirent = (void *)__get_free_page(GFP_KERNEL)
1954 };
1951 1955
1952 buf.dirent = (void *)__get_free_page(GFP_KERNEL);
1953 if (!buf.dirent) 1956 if (!buf.dirent)
1954 return nfserrno(-ENOMEM); 1957 return nfserrno(-ENOMEM);
1955 1958
@@ -1963,7 +1966,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
1963 buf.used = 0; 1966 buf.used = 0;
1964 buf.full = 0; 1967 buf.full = 0;
1965 1968
1966 host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf); 1969 host_err = iterate_dir(file, &buf.ctx);
1967 if (buf.full) 1970 if (buf.full)
1968 host_err = 0; 1971 host_err = 0;
1969 1972
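nfsd's buffered readdir gets the same treatment, with the context structure built in a single designated initializer. One detail worth noting: the page allocation happens inside the initializer, so the NULL check must still follow it, and the page must be released on every exit path. A sketch of the lifecycle; the free_page() placement is an assumption based on the usual pairing, since the hunk above does not show the function's tail:

struct readdir_data buf = {
	.ctx.actor = nfsd_buffered_filldir,
	.dirent = (void *)__get_free_page(GFP_KERNEL)	/* may be NULL */
};

if (!buf.dirent)
	return nfserrno(-ENOMEM);

host_err = iterate_dir(file, &buf.ctx);
/* ... consume buf.dirent ... */
free_page((unsigned long)buf.dirent);	/* assumed cleanup, not shown above */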
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index f30b017740a7..197a63e9d102 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -256,22 +256,18 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
256 de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 256 de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
257} 257}
258 258
259static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 259static int nilfs_readdir(struct file *file, struct dir_context *ctx)
260{ 260{
261 loff_t pos = filp->f_pos; 261 loff_t pos = ctx->pos;
262 struct inode *inode = file_inode(filp); 262 struct inode *inode = file_inode(file);
263 struct super_block *sb = inode->i_sb; 263 struct super_block *sb = inode->i_sb;
264 unsigned int offset = pos & ~PAGE_CACHE_MASK; 264 unsigned int offset = pos & ~PAGE_CACHE_MASK;
265 unsigned long n = pos >> PAGE_CACHE_SHIFT; 265 unsigned long n = pos >> PAGE_CACHE_SHIFT;
266 unsigned long npages = dir_pages(inode); 266 unsigned long npages = dir_pages(inode);
267/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ 267/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
268 unsigned char *types = NULL;
269 int ret;
270 268
271 if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) 269 if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
272 goto success; 270 return 0;
273
274 types = nilfs_filetype_table;
275 271
276 for ( ; n < npages; n++, offset = 0) { 272 for ( ; n < npages; n++, offset = 0) {
277 char *kaddr, *limit; 273 char *kaddr, *limit;
@@ -281,9 +277,8 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
281 if (IS_ERR(page)) { 277 if (IS_ERR(page)) {
282 nilfs_error(sb, __func__, "bad page in #%lu", 278 nilfs_error(sb, __func__, "bad page in #%lu",
283 inode->i_ino); 279 inode->i_ino);
284 filp->f_pos += PAGE_CACHE_SIZE - offset; 280 ctx->pos += PAGE_CACHE_SIZE - offset;
285 ret = -EIO; 281 return -EIO;
286 goto done;
287 } 282 }
288 kaddr = page_address(page); 283 kaddr = page_address(page);
289 de = (struct nilfs_dir_entry *)(kaddr + offset); 284 de = (struct nilfs_dir_entry *)(kaddr + offset);
@@ -293,35 +288,28 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
293 if (de->rec_len == 0) { 288 if (de->rec_len == 0) {
294 nilfs_error(sb, __func__, 289 nilfs_error(sb, __func__,
295 "zero-length directory entry"); 290 "zero-length directory entry");
296 ret = -EIO;
297 nilfs_put_page(page); 291 nilfs_put_page(page);
298 goto done; 292 return -EIO;
299 } 293 }
300 if (de->inode) { 294 if (de->inode) {
301 int over; 295 unsigned char t;
302 unsigned char d_type = DT_UNKNOWN;
303 296
304 if (types && de->file_type < NILFS_FT_MAX) 297 if (de->file_type < NILFS_FT_MAX)
305 d_type = types[de->file_type]; 298 t = nilfs_filetype_table[de->file_type];
299 else
300 t = DT_UNKNOWN;
306 301
307 offset = (char *)de - kaddr; 302 if (!dir_emit(ctx, de->name, de->name_len,
308 over = filldir(dirent, de->name, de->name_len, 303 le64_to_cpu(de->inode), t)) {
309 (n<<PAGE_CACHE_SHIFT) | offset,
310 le64_to_cpu(de->inode), d_type);
311 if (over) {
312 nilfs_put_page(page); 304 nilfs_put_page(page);
313 goto success; 305 return 0;
314 } 306 }
315 } 307 }
316 filp->f_pos += nilfs_rec_len_from_disk(de->rec_len); 308 ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
317 } 309 }
318 nilfs_put_page(page); 310 nilfs_put_page(page);
319 } 311 }
320 312 return 0;
321success:
322 ret = 0;
323done:
324 return ret;
325} 313}
326 314
327/* 315/*
@@ -678,7 +666,7 @@ not_empty:
678const struct file_operations nilfs_dir_operations = { 666const struct file_operations nilfs_dir_operations = {
679 .llseek = generic_file_llseek, 667 .llseek = generic_file_llseek,
680 .read = generic_read_dir, 668 .read = generic_read_dir,
681 .readdir = nilfs_readdir, 669 .iterate = nilfs_readdir,
682 .unlocked_ioctl = nilfs_ioctl, 670 .unlocked_ioctl = nilfs_ioctl,
683#ifdef CONFIG_COMPAT 671#ifdef CONFIG_COMPAT
684 .compat_ioctl = nilfs_compat_ioctl, 672 .compat_ioctl = nilfs_compat_ioctl,
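The nilfs2 hunk is a representative filesystem-side conversion: f_pos manipulation becomes ctx->pos, the filldir callback becomes dir_emit(), and the success/done bookkeeping collapses. The contract to remember: dir_emit() returns false when the user buffer is full, which is a clean stop rather than an error, and ctx->pos is advanced only after an entry has actually been emitted. A hedged skeleton; first_entry() and the other entry_* helpers are hypothetical stand-ins for per-filesystem parsing:

static int example_iterate(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct example_dirent *de;

	for (de = first_entry(inode, ctx->pos); de; de = next_entry(de)) {
		if (!dir_emit(ctx, entry_name(de), entry_name_len(de),
			      entry_ino(de), entry_type(de)))
			return 0;	/* buffer full: stop, not an error */
		ctx->pos = entry_next_pos(de);	/* advance only after emit */
	}
	return 0;
}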
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 689fb608648e..bccfec8343c5 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -219,13 +219,32 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
219 219
220static int nilfs_set_page_dirty(struct page *page) 220static int nilfs_set_page_dirty(struct page *page)
221{ 221{
222 int ret = __set_page_dirty_buffers(page); 222 int ret = __set_page_dirty_nobuffers(page);
223 223
224 if (ret) { 224 if (page_has_buffers(page)) {
225 struct inode *inode = page->mapping->host; 225 struct inode *inode = page->mapping->host;
226 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 226 unsigned nr_dirty = 0;
227 struct buffer_head *bh, *head;
227 228
228 nilfs_set_file_dirty(inode, nr_dirty); 229 /*
230 * This page is locked by callers, and no other thread
231 * concurrently marks its buffers dirty since they are
232 * only dirtied through routines in fs/buffer.c in
233 * which call sites of mark_buffer_dirty are protected
234 * by page lock.
235 */
236 bh = head = page_buffers(page);
237 do {
238 /* Do not mark hole blocks dirty */
239 if (buffer_dirty(bh) || !buffer_mapped(bh))
240 continue;
241
242 set_buffer_dirty(bh);
243 nr_dirty++;
244 } while (bh = bh->b_this_page, bh != head);
245
246 if (nr_dirty)
247 nilfs_set_file_dirty(inode, nr_dirty);
229 } 248 }
230 return ret; 249 return ret;
231} 250}
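The rewritten nilfs_set_page_dirty() walks the page's buffers with the kernel's usual while (bh = bh->b_this_page, bh != head) idiom: the comma expression advances the cursor first and then tests it against the head, so every buffer in the ring, including the head, is visited exactly once. A standalone, compilable illustration of the same circular-list idiom in plain C, nothing kernel-specific:

#include <stdio.h>

struct ring { int val; struct ring *next; };

static int count_positive(struct ring *head)
{
	struct ring *p = head;
	int n = 0;

	do {
		if (p->val > 0)
			n++;
	} while (p = p->next, p != head);	/* advance, then test */

	return n;
}

int main(void)
{
	struct ring c = { 3, NULL }, b = { -1, &c }, a = { 2, &b };

	c.next = &a;				/* close the ring */
	printf("%d\n", count_positive(&a));	/* prints 2 */
	return 0;
}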
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6c80083a984f..1ea52f7c031f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -399,9 +399,6 @@ static int fanotify_release(struct inode *ignored, struct file *file)
399 wake_up(&group->fanotify_data.access_waitq); 399 wake_up(&group->fanotify_data.access_waitq);
400#endif 400#endif
401 401
402 if (file->f_flags & FASYNC)
403 fsnotify_fasync(-1, file, 0);
404
405 /* matches the fanotify_init->fsnotify_alloc_group */ 402 /* matches the fanotify_init->fsnotify_alloc_group */
406 fsnotify_destroy_group(group); 403 fsnotify_destroy_group(group);
407 404
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f97af4..d267ea6aa1a0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1372,7 +1372,7 @@ retry_writepage:
1372 * The page may have dirty, unmapped buffers. Make them 1372 * The page may have dirty, unmapped buffers. Make them
1373 * freeable here, so the page does not leak. 1373 * freeable here, so the page does not leak.
1374 */ 1374 */
1375 block_invalidatepage(page, 0); 1375 block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
1376 unlock_page(page); 1376 unlock_page(page);
1377 ntfs_debug("Write outside i_size - truncated?"); 1377 ntfs_debug("Write outside i_size - truncated?");
1378 return 0; 1378 return 0;
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index aa411c3f20e9..9e38dafa3bc7 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1004,13 +1004,11 @@ dir_err_out:
1004/** 1004/**
1005 * ntfs_filldir - ntfs specific filldir method 1005 * ntfs_filldir - ntfs specific filldir method
1006 * @vol: current ntfs volume 1006 * @vol: current ntfs volume
1007 * @fpos: position in the directory
1008 * @ndir: ntfs inode of current directory 1007 * @ndir: ntfs inode of current directory
1009 * @ia_page: page in which the index allocation buffer @ie resides 1008 * @ia_page: page in which the index allocation buffer @ie resides
1010 * @ie: current index entry 1009 * @ie: current index entry
1011 * @name: buffer to use for the converted name 1010 * @name: buffer to use for the converted name
1012 * @dirent: vfs filldir callback context 1011 * @actor: what to feed the entries to
1013 * @filldir: vfs filldir callback
1014 * 1012 *
1015 * Convert the Unicode @name to the loaded NLS and pass it to the @filldir 1013 * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
1016 * callback. 1014 * callback.
@@ -1024,12 +1022,12 @@ dir_err_out:
1024 * retake the lock if we are returning a non-zero value as ntfs_readdir() 1022 * retake the lock if we are returning a non-zero value as ntfs_readdir()
1025 * would need to drop the lock immediately anyway. 1023 * would need to drop the lock immediately anyway.
1026 */ 1024 */
1027static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, 1025static inline int ntfs_filldir(ntfs_volume *vol,
1028 ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, 1026 ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
1029 u8 *name, void *dirent, filldir_t filldir) 1027 u8 *name, struct dir_context *actor)
1030{ 1028{
1031 unsigned long mref; 1029 unsigned long mref;
1032 int name_len, rc; 1030 int name_len;
1033 unsigned dt_type; 1031 unsigned dt_type;
1034 FILE_NAME_TYPE_FLAGS name_type; 1032 FILE_NAME_TYPE_FLAGS name_type;
1035 1033
@@ -1068,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
1068 if (ia_page) 1066 if (ia_page)
1069 unlock_page(ia_page); 1067 unlock_page(ia_page);
1070 ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " 1068 ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
1071 "0x%lx, DT_%s.", name, name_len, fpos, mref, 1069 "0x%lx, DT_%s.", name, name_len, actor->pos, mref,
1072 dt_type == DT_DIR ? "DIR" : "REG"); 1070 dt_type == DT_DIR ? "DIR" : "REG");
1073 rc = filldir(dirent, name, name_len, fpos, mref, dt_type); 1071 if (!dir_emit(actor, name, name_len, mref, dt_type))
1072 return 1;
1074 /* Relock the page but not if we are aborting ->readdir. */ 1073 /* Relock the page but not if we are aborting ->readdir. */
1075 if (!rc && ia_page) 1074 if (ia_page)
1076 lock_page(ia_page); 1075 lock_page(ia_page);
1077 return rc; 1076 return 0;
1078} 1077}
1079 1078
1080/* 1079/*
@@ -1097,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
1097 * removes them again after the write is complete after which it 1096 * removes them again after the write is complete after which it
1098 * unlocks the page. 1097 * unlocks the page.
1099 */ 1098 */
1100static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 1099static int ntfs_readdir(struct file *file, struct dir_context *actor)
1101{ 1100{
1102 s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; 1101 s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
1103 loff_t fpos, i_size; 1102 loff_t i_size;
1104 struct inode *bmp_vi, *vdir = file_inode(filp); 1103 struct inode *bmp_vi, *vdir = file_inode(file);
1105 struct super_block *sb = vdir->i_sb; 1104 struct super_block *sb = vdir->i_sb;
1106 ntfs_inode *ndir = NTFS_I(vdir); 1105 ntfs_inode *ndir = NTFS_I(vdir);
1107 ntfs_volume *vol = NTFS_SB(sb); 1106 ntfs_volume *vol = NTFS_SB(sb);
@@ -1116,33 +1115,16 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1116 u8 *kaddr, *bmp, *index_end; 1115 u8 *kaddr, *bmp, *index_end;
1117 ntfs_attr_search_ctx *ctx; 1116 ntfs_attr_search_ctx *ctx;
1118 1117
1119 fpos = filp->f_pos;
1120 ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", 1118 ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
1121 vdir->i_ino, fpos); 1119 vdir->i_ino, actor->pos);
1122 rc = err = 0; 1120 rc = err = 0;
1123 /* Are we at end of dir yet? */ 1121 /* Are we at end of dir yet? */
1124 i_size = i_size_read(vdir); 1122 i_size = i_size_read(vdir);
1125 if (fpos >= i_size + vol->mft_record_size) 1123 if (actor->pos >= i_size + vol->mft_record_size)
1126 goto done; 1124 return 0;
1127 /* Emulate . and .. for all directories. */ 1125 /* Emulate . and .. for all directories. */
1128 if (!fpos) { 1126 if (!dir_emit_dots(file, actor))
1129 ntfs_debug("Calling filldir for . with len 1, fpos 0x0, " 1127 return 0;
1130 "inode 0x%lx, DT_DIR.", vdir->i_ino);
1131 rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
1132 if (rc)
1133 goto done;
1134 fpos++;
1135 }
1136 if (fpos == 1) {
1137 ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
1138 "inode 0x%lx, DT_DIR.",
1139 (unsigned long)parent_ino(filp->f_path.dentry));
1140 rc = filldir(dirent, "..", 2, fpos,
1141 parent_ino(filp->f_path.dentry), DT_DIR);
1142 if (rc)
1143 goto done;
1144 fpos++;
1145 }
1146 m = NULL; 1128 m = NULL;
1147 ctx = NULL; 1129 ctx = NULL;
1148 /* 1130 /*
@@ -1155,7 +1137,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1155 goto err_out; 1137 goto err_out;
1156 } 1138 }
1157 /* Are we jumping straight into the index allocation attribute? */ 1139 /* Are we jumping straight into the index allocation attribute? */
1158 if (fpos >= vol->mft_record_size) 1140 if (actor->pos >= vol->mft_record_size)
1159 goto skip_index_root; 1141 goto skip_index_root;
1160 /* Get hold of the mft record for the directory. */ 1142 /* Get hold of the mft record for the directory. */
1161 m = map_mft_record(ndir); 1143 m = map_mft_record(ndir);
@@ -1170,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1170 goto err_out; 1152 goto err_out;
1171 } 1153 }
1172 /* Get the offset into the index root attribute. */ 1154 /* Get the offset into the index root attribute. */
1173 ir_pos = (s64)fpos; 1155 ir_pos = (s64)actor->pos;
1174 /* Find the index root attribute in the mft record. */ 1156 /* Find the index root attribute in the mft record. */
1175 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, 1157 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
1176 0, ctx); 1158 0, ctx);
@@ -1226,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1226 if (ir_pos > (u8*)ie - (u8*)ir) 1208 if (ir_pos > (u8*)ie - (u8*)ir)
1227 continue; 1209 continue;
1228 /* Advance the position even if going to skip the entry. */ 1210 /* Advance the position even if going to skip the entry. */
1229 fpos = (u8*)ie - (u8*)ir; 1211 actor->pos = (u8*)ie - (u8*)ir;
1230 /* Submit the name to the filldir callback. */ 1212 /* Submit the name to the filldir callback. */
1231 rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent, 1213 rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
1232 filldir);
1233 if (rc) { 1214 if (rc) {
1234 kfree(ir); 1215 kfree(ir);
1235 goto abort; 1216 goto abort;
@@ -1242,12 +1223,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1242 if (!NInoIndexAllocPresent(ndir)) 1223 if (!NInoIndexAllocPresent(ndir))
1243 goto EOD; 1224 goto EOD;
1244 /* Advance fpos to the beginning of the index allocation. */ 1225 /* Advance fpos to the beginning of the index allocation. */
1245 fpos = vol->mft_record_size; 1226 actor->pos = vol->mft_record_size;
1246skip_index_root: 1227skip_index_root:
1247 kaddr = NULL; 1228 kaddr = NULL;
1248 prev_ia_pos = -1LL; 1229 prev_ia_pos = -1LL;
1249 /* Get the offset into the index allocation attribute. */ 1230 /* Get the offset into the index allocation attribute. */
1250 ia_pos = (s64)fpos - vol->mft_record_size; 1231 ia_pos = (s64)actor->pos - vol->mft_record_size;
1251 ia_mapping = vdir->i_mapping; 1232 ia_mapping = vdir->i_mapping;
1252 ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); 1233 ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
1253 bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); 1234 bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
@@ -1409,7 +1390,7 @@ find_next_index_buffer:
1409 if (ia_pos - ia_start > (u8*)ie - (u8*)ia) 1390 if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
1410 continue; 1391 continue;
1411 /* Advance the position even if going to skip the entry. */ 1392 /* Advance the position even if going to skip the entry. */
1412 fpos = (u8*)ie - (u8*)ia + 1393 actor->pos = (u8*)ie - (u8*)ia +
1413 (sle64_to_cpu(ia->index_block_vcn) << 1394 (sle64_to_cpu(ia->index_block_vcn) <<
1414 ndir->itype.index.vcn_size_bits) + 1395 ndir->itype.index.vcn_size_bits) +
1415 vol->mft_record_size; 1396 vol->mft_record_size;
@@ -1419,8 +1400,7 @@ find_next_index_buffer:
1419 * before returning, unless a non-zero value is returned in 1400 * before returning, unless a non-zero value is returned in
1420 * which case the page is left unlocked. 1401 * which case the page is left unlocked.
1421 */ 1402 */
1422 rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent, 1403 rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
1423 filldir);
1424 if (rc) { 1404 if (rc) {
1425 /* @ia_page is already unlocked in this case. */ 1405 /* @ia_page is already unlocked in this case. */
1426 ntfs_unmap_page(ia_page); 1406 ntfs_unmap_page(ia_page);
@@ -1439,18 +1419,9 @@ unm_EOD:
1439 iput(bmp_vi); 1419 iput(bmp_vi);
1440EOD: 1420EOD:
1441 /* We are finished, set fpos to EOD. */ 1421 /* We are finished, set fpos to EOD. */
1442 fpos = i_size + vol->mft_record_size; 1422 actor->pos = i_size + vol->mft_record_size;
1443abort: 1423abort:
1444 kfree(name); 1424 kfree(name);
1445done:
1446#ifdef DEBUG
1447 if (!rc)
1448 ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
1449 else
1450 ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
1451 rc, fpos);
1452#endif
1453 filp->f_pos = fpos;
1454 return 0; 1425 return 0;
1455err_out: 1426err_out:
1456 if (bmp_page) { 1427 if (bmp_page) {
@@ -1471,7 +1442,6 @@ iput_err_out:
1471 if (!err) 1442 if (!err)
1472 err = -EIO; 1443 err = -EIO;
1473 ntfs_debug("Failed. Returning error code %i.", -err); 1444 ntfs_debug("Failed. Returning error code %i.", -err);
1474 filp->f_pos = fpos;
1475 return err; 1445 return err;
1476} 1446}
1477 1447
@@ -1571,7 +1541,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
1571const struct file_operations ntfs_dir_ops = { 1541const struct file_operations ntfs_dir_ops = {
1572 .llseek = generic_file_llseek, /* Seek inside directory. */ 1542 .llseek = generic_file_llseek, /* Seek inside directory. */
1573 .read = generic_read_dir, /* Return -EISDIR. */ 1543 .read = generic_read_dir, /* Return -EISDIR. */
1574 .readdir = ntfs_readdir, /* Read directory contents. */ 1544 .iterate = ntfs_readdir, /* Read directory contents. */
1575#ifdef NTFS_RW 1545#ifdef NTFS_RW
1576 .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ 1546 .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
1577 /*.aio_fsync = ,*/ /* Sync all outstanding async 1547 /*.aio_fsync = ,*/ /* Sync all outstanding async
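The largest simplification in the ntfs conversion is the dot-entry prologue: the two hand-rolled filldir() calls for "." and ".." become a single dir_emit_dots(), which emits whichever of the two entries ctx->pos still calls for and bumps the position past them itself. The common prologue of a converted ->iterate method now looks like this sketch:

static int example_readdir(struct file *file, struct dir_context *ctx)
{
	/* Emits "." and/or ".." as ctx->pos requires, advancing ctx->pos;
	 * returns false if the caller's buffer filled up first. */
	if (!dir_emit_dots(file, ctx))
		return 0;

	/* ... emit real entries from ctx->pos onward ... */
	return 0;
}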
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec72e903..79736a28d84f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
603 * from ext3. PageChecked() bits have been removed as OCFS2 does not 603 * from ext3. PageChecked() bits have been removed as OCFS2 does not
604 * do journalled data. 604 * do journalled data.
605 */ 605 */
606static void ocfs2_invalidatepage(struct page *page, unsigned long offset) 606static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
607 unsigned int length)
607{ 608{
608 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 609 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
609 610
610 jbd2_journal_invalidatepage(journal, page, offset); 611 jbd2_journal_invalidatepage(journal, page, offset, length);
611} 612}
612 613
613static int ocfs2_releasepage(struct page *page, gfp_t wait) 614static int ocfs2_releasepage(struct page *page, gfp_t wait)
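Interleaved with the readdir work is an address-space API change visible in both the ntfs and ocfs2 hunks: ->invalidatepage() gains a length argument, so callers can invalidate a byte range within a page instead of implicitly meaning "from offset to the end of the page". Full-page call sites now pass (page, 0, PAGE_CACHE_SIZE), as the ntfs hunk shows, and journalled filesystems simply forward both values, as ocfs2 does to jbd2_journal_invalidatepage(). The new hook shape:

static void example_invalidatepage(struct page *page, unsigned int offset,
				   unsigned int length)
{
	/* invalidate the byte range [offset, offset + length) of @page */
}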
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f1e1aed8f638..eb760d8acd50 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1761,11 +1761,10 @@ bail:
1761 1761
1762static int ocfs2_dir_foreach_blk_id(struct inode *inode, 1762static int ocfs2_dir_foreach_blk_id(struct inode *inode,
1763 u64 *f_version, 1763 u64 *f_version,
1764 loff_t *f_pos, void *priv, 1764 struct dir_context *ctx)
1765 filldir_t filldir, int *filldir_err)
1766{ 1765{
1767 int ret, i, filldir_ret; 1766 int ret, i;
1768 unsigned long offset = *f_pos; 1767 unsigned long offset = ctx->pos;
1769 struct buffer_head *di_bh = NULL; 1768 struct buffer_head *di_bh = NULL;
1770 struct ocfs2_dinode *di; 1769 struct ocfs2_dinode *di;
1771 struct ocfs2_inline_data *data; 1770 struct ocfs2_inline_data *data;
@@ -1781,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
1781 di = (struct ocfs2_dinode *)di_bh->b_data; 1780 di = (struct ocfs2_dinode *)di_bh->b_data;
1782 data = &di->id2.i_data; 1781 data = &di->id2.i_data;
1783 1782
1784 while (*f_pos < i_size_read(inode)) { 1783 while (ctx->pos < i_size_read(inode)) {
1785revalidate:
1786 /* If the dir block has changed since the last call to 1784 /* If the dir block has changed since the last call to
1787 * readdir(2), then we might be pointing to an invalid 1785 * readdir(2), then we might be pointing to an invalid
1788 * dirent right now. Scan from the start of the block 1786 * dirent right now. Scan from the start of the block
@@ -1802,50 +1800,31 @@ revalidate:
1802 break; 1800 break;
1803 i += le16_to_cpu(de->rec_len); 1801 i += le16_to_cpu(de->rec_len);
1804 } 1802 }
1805 *f_pos = offset = i; 1803 ctx->pos = offset = i;
1806 *f_version = inode->i_version; 1804 *f_version = inode->i_version;
1807 } 1805 }
1808 1806
1809 de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos); 1807 de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
1810 if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) { 1808 if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
1811 /* On error, skip the f_pos to the end. */ 1809 /* On error, skip the f_pos to the end. */
1812 *f_pos = i_size_read(inode); 1810 ctx->pos = i_size_read(inode);
1813 goto out; 1811 break;
1814 } 1812 }
1815 offset += le16_to_cpu(de->rec_len); 1813 offset += le16_to_cpu(de->rec_len);
1816 if (le64_to_cpu(de->inode)) { 1814 if (le64_to_cpu(de->inode)) {
1817 /* We might block in the next section
1818 * if the data destination is
1819 * currently swapped out. So, use a
1820 * version stamp to detect whether or
1821 * not the directory has been modified
1822 * during the copy operation.
1823 */
1824 u64 version = *f_version;
1825 unsigned char d_type = DT_UNKNOWN; 1815 unsigned char d_type = DT_UNKNOWN;
1826 1816
1827 if (de->file_type < OCFS2_FT_MAX) 1817 if (de->file_type < OCFS2_FT_MAX)
1828 d_type = ocfs2_filetype_table[de->file_type]; 1818 d_type = ocfs2_filetype_table[de->file_type];
1829 1819
1830 filldir_ret = filldir(priv, de->name, 1820 if (!dir_emit(ctx, de->name, de->name_len,
1831 de->name_len, 1821 le64_to_cpu(de->inode), d_type))
1832 *f_pos, 1822 goto out;
1833 le64_to_cpu(de->inode),
1834 d_type);
1835 if (filldir_ret) {
1836 if (filldir_err)
1837 *filldir_err = filldir_ret;
1838 break;
1839 }
1840 if (version != *f_version)
1841 goto revalidate;
1842 } 1823 }
1843 *f_pos += le16_to_cpu(de->rec_len); 1824 ctx->pos += le16_to_cpu(de->rec_len);
1844 } 1825 }
1845
1846out: 1826out:
1847 brelse(di_bh); 1827 brelse(di_bh);
1848
1849 return 0; 1828 return 0;
1850} 1829}
1851 1830
@@ -1855,27 +1834,26 @@ out:
1855 */ 1834 */
1856static int ocfs2_dir_foreach_blk_el(struct inode *inode, 1835static int ocfs2_dir_foreach_blk_el(struct inode *inode,
1857 u64 *f_version, 1836 u64 *f_version,
1858 loff_t *f_pos, void *priv, 1837 struct dir_context *ctx,
1859 filldir_t filldir, int *filldir_err) 1838 bool persist)
1860{ 1839{
1861 int error = 0;
1862 unsigned long offset, blk, last_ra_blk = 0; 1840 unsigned long offset, blk, last_ra_blk = 0;
1863 int i, stored; 1841 int i;
1864 struct buffer_head * bh, * tmp; 1842 struct buffer_head * bh, * tmp;
1865 struct ocfs2_dir_entry * de; 1843 struct ocfs2_dir_entry * de;
1866 struct super_block * sb = inode->i_sb; 1844 struct super_block * sb = inode->i_sb;
1867 unsigned int ra_sectors = 16; 1845 unsigned int ra_sectors = 16;
1846 int stored = 0;
1868 1847
1869 stored = 0;
1870 bh = NULL; 1848 bh = NULL;
1871 1849
1872 offset = (*f_pos) & (sb->s_blocksize - 1); 1850 offset = ctx->pos & (sb->s_blocksize - 1);
1873 1851
1874 while (!error && !stored && *f_pos < i_size_read(inode)) { 1852 while (ctx->pos < i_size_read(inode)) {
1875 blk = (*f_pos) >> sb->s_blocksize_bits; 1853 blk = ctx->pos >> sb->s_blocksize_bits;
1876 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { 1854 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
1877 /* Skip the corrupt dirblock and keep trying */ 1855 /* Skip the corrupt dirblock and keep trying */
1878 *f_pos += sb->s_blocksize - offset; 1856 ctx->pos += sb->s_blocksize - offset;
1879 continue; 1857 continue;
1880 } 1858 }
1881 1859
@@ -1897,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
1897 ra_sectors = 8; 1875 ra_sectors = 8;
1898 } 1876 }
1899 1877
1900revalidate:
1901 /* If the dir block has changed since the last call to 1878 /* If the dir block has changed since the last call to
1902 * readdir(2), then we might be pointing to an invalid 1879 * readdir(2), then we might be pointing to an invalid
1903 * dirent right now. Scan from the start of the block 1880 * dirent right now. Scan from the start of the block
@@ -1917,93 +1894,64 @@ revalidate:
1917 i += le16_to_cpu(de->rec_len); 1894 i += le16_to_cpu(de->rec_len);
1918 } 1895 }
1919 offset = i; 1896 offset = i;
1920 *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1)) 1897 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
1921 | offset; 1898 | offset;
1922 *f_version = inode->i_version; 1899 *f_version = inode->i_version;
1923 } 1900 }
1924 1901
1925 while (!error && *f_pos < i_size_read(inode) 1902 while (ctx->pos < i_size_read(inode)
1926 && offset < sb->s_blocksize) { 1903 && offset < sb->s_blocksize) {
1927 de = (struct ocfs2_dir_entry *) (bh->b_data + offset); 1904 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
1928 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { 1905 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
1929 /* On error, skip the f_pos to the 1906 /* On error, skip the f_pos to the
1930 next block. */ 1907 next block. */
1931 *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1; 1908 ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
1932 brelse(bh); 1909 brelse(bh);
1933 goto out; 1910 continue;
1934 } 1911 }
1935 offset += le16_to_cpu(de->rec_len);
1936 if (le64_to_cpu(de->inode)) { 1912 if (le64_to_cpu(de->inode)) {
1937 /* We might block in the next section
1938 * if the data destination is
1939 * currently swapped out. So, use a
1940 * version stamp to detect whether or
1941 * not the directory has been modified
1942 * during the copy operation.
1943 */
1944 unsigned long version = *f_version;
1945 unsigned char d_type = DT_UNKNOWN; 1913 unsigned char d_type = DT_UNKNOWN;
1946 1914
1947 if (de->file_type < OCFS2_FT_MAX) 1915 if (de->file_type < OCFS2_FT_MAX)
1948 d_type = ocfs2_filetype_table[de->file_type]; 1916 d_type = ocfs2_filetype_table[de->file_type];
1949 error = filldir(priv, de->name, 1917 if (!dir_emit(ctx, de->name,
1950 de->name_len, 1918 de->name_len,
1951 *f_pos,
1952 le64_to_cpu(de->inode), 1919 le64_to_cpu(de->inode),
1953 d_type); 1920 d_type)) {
1954 if (error) { 1921 brelse(bh);
1955 if (filldir_err) 1922 return 0;
1956 *filldir_err = error;
1957 break;
1958 } 1923 }
1959 if (version != *f_version) 1924 stored++;
1960 goto revalidate;
1961 stored ++;
1962 } 1925 }
1963 *f_pos += le16_to_cpu(de->rec_len); 1926 offset += le16_to_cpu(de->rec_len);
1927 ctx->pos += le16_to_cpu(de->rec_len);
1964 } 1928 }
1965 offset = 0; 1929 offset = 0;
1966 brelse(bh); 1930 brelse(bh);
1967 bh = NULL; 1931 bh = NULL;
1932 if (!persist && stored)
1933 break;
1968 } 1934 }
1969 1935 return 0;
1970 stored = 0;
1971out:
1972 return stored;
1973} 1936}
1974 1937
1975static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, 1938static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
1976 loff_t *f_pos, void *priv, filldir_t filldir, 1939 struct dir_context *ctx,
1977 int *filldir_err) 1940 bool persist)
1978{ 1941{
1979 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1942 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1980 return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv, 1943 return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
1981 filldir, filldir_err); 1944 return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
1982
1983 return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
1984 filldir_err);
1985} 1945}
1986 1946
1987/* 1947/*
1988 * This is intended to be called from inside other kernel functions, 1948 * This is intended to be called from inside other kernel functions,
1989 * so we fake some arguments. 1949 * so we fake some arguments.
1990 */ 1950 */
1991int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, 1951int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
1992 filldir_t filldir)
1993{ 1952{
1994 int ret = 0, filldir_err = 0;
1995 u64 version = inode->i_version; 1953 u64 version = inode->i_version;
1996 1954 ocfs2_dir_foreach_blk(inode, &version, ctx, true);
1997 while (*f_pos < i_size_read(inode)) {
1998 ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
1999 filldir, &filldir_err);
2000 if (ret || filldir_err)
2001 break;
2002 }
2003
2004 if (ret > 0)
2005 ret = -EIO;
2006
2007 return 0; 1955 return 0;
2008} 1956}
2009 1957
@@ -2011,15 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
2011 * ocfs2_readdir() 1959 * ocfs2_readdir()
2012 * 1960 *
2013 */ 1961 */
2014int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) 1962int ocfs2_readdir(struct file *file, struct dir_context *ctx)
2015{ 1963{
2016 int error = 0; 1964 int error = 0;
2017 struct inode *inode = file_inode(filp); 1965 struct inode *inode = file_inode(file);
2018 int lock_level = 0; 1966 int lock_level = 0;
2019 1967
2020 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); 1968 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
2021 1969
2022 error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); 1970 error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
2023 if (lock_level && error >= 0) { 1971 if (lock_level && error >= 0) {
2024 /* We release EX lock which used to update atime 1972 /* We release EX lock which used to update atime
2025 * and get PR lock again to reduce contention 1973 * and get PR lock again to reduce contention
@@ -2035,8 +1983,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
2035 goto bail_nolock; 1983 goto bail_nolock;
2036 } 1984 }
2037 1985
2038 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, 1986 error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
2039 dirent, filldir, NULL);
2040 1987
2041 ocfs2_inode_unlock(inode, lock_level); 1988 ocfs2_inode_unlock(inode, lock_level);
2042 if (error) 1989 if (error)
@@ -2120,6 +2067,7 @@ bail:
2120} 2067}
2121 2068
2122struct ocfs2_empty_dir_priv { 2069struct ocfs2_empty_dir_priv {
2070 struct dir_context ctx;
2123 unsigned seen_dot; 2071 unsigned seen_dot;
2124 unsigned seen_dot_dot; 2072 unsigned seen_dot_dot;
2125 unsigned seen_other; 2073 unsigned seen_other;
@@ -2204,8 +2152,9 @@ out:
2204int ocfs2_empty_dir(struct inode *inode) 2152int ocfs2_empty_dir(struct inode *inode)
2205{ 2153{
2206 int ret; 2154 int ret;
2207 loff_t start = 0; 2155 struct ocfs2_empty_dir_priv priv = {
2208 struct ocfs2_empty_dir_priv priv; 2156 .ctx.actor = ocfs2_empty_dir_filldir
2157 };
2209 2158
2210 memset(&priv, 0, sizeof(priv)); 2159 memset(&priv, 0, sizeof(priv));
2211 2160
@@ -2219,7 +2168,7 @@ int ocfs2_empty_dir(struct inode *inode)
2219 */ 2168 */
2220 } 2169 }
2221 2170
2222 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); 2171 ret = ocfs2_dir_foreach(inode, &priv.ctx);
2223 if (ret) 2172 if (ret)
2224 mlog_errno(ret); 2173 mlog_errno(ret);
2225 2174
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e683f3deb645..f0344b75b14d 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name,
92 struct ocfs2_dir_lookup_result *res); 92 struct ocfs2_dir_lookup_result *res);
93int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, 93int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
94 int namelen, u64 *blkno); 94 int namelen, u64 *blkno);
95int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); 95int ocfs2_readdir(struct file *file, struct dir_context *ctx);
96int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, 96int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
97 filldir_t filldir);
98int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, 97int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
99 struct inode *dir, 98 struct inode *dir,
100 struct buffer_head *parent_fe_bh, 99 struct buffer_head *parent_fe_bh,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b3fdd1a323d6..e68588e6b1e8 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1408,6 +1408,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1408 mres->lockname_len, mres->lockname); 1408 mres->lockname_len, mres->lockname);
1409 ret = -EFAULT; 1409 ret = -EFAULT;
1410 spin_unlock(&res->spinlock); 1410 spin_unlock(&res->spinlock);
1411 dlm_lockres_put(res);
1411 goto leave; 1412 goto leave;
1412 } 1413 }
1413 res->state |= DLM_LOCK_RES_MIGRATING; 1414 res->state |= DLM_LOCK_RES_MIGRATING;
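The one-line dlmrecovery.c change is a reference-count leak fix: the -EFAULT error path bailed out to leave without dropping the reference taken when the lock resource was looked up. A compressed sketch of the invariant; the bad_state condition is a placeholder for the actual check, and the assumption is that the lookup helper returns a referenced resource that every exit path must put:

res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
if (res) {
	spin_lock(&res->spinlock);
	if (bad_state) {		/* placeholder condition */
		ret = -EFAULT;
		spin_unlock(&res->spinlock);
		dlm_lockres_put(res);	/* the put this fix adds */
		goto leave;
	}
	res->state |= DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
}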
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 1c39efb71bab..2487116d0d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -790,7 +790,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
790 &hole_size, &rec, &is_last); 790 &hole_size, &rec, &is_last);
791 if (ret) { 791 if (ret) {
792 mlog_errno(ret); 792 mlog_errno(ret);
793 goto out; 793 goto out_unlock;
794 } 794 }
795 795
796 if (rec.e_blkno == 0ULL) { 796 if (rec.e_blkno == 0ULL) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8a7509f9e6f5..41000f223ca4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2288,7 +2288,7 @@ relock:
2288 ret = ocfs2_inode_lock(inode, NULL, 1); 2288 ret = ocfs2_inode_lock(inode, NULL, 1);
2289 if (ret < 0) { 2289 if (ret < 0) {
2290 mlog_errno(ret); 2290 mlog_errno(ret);
2291 goto out_sems; 2291 goto out;
2292 } 2292 }
2293 2293
2294 ocfs2_inode_unlock(inode, 1); 2294 ocfs2_inode_unlock(inode, 1);
@@ -2646,17 +2646,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2646 goto out; 2646 goto out;
2647 } 2647 }
2648 2648
2649 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 2649 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2650 ret = -EINVAL;
2651 if (!ret && offset > inode->i_sb->s_maxbytes)
2652 ret = -EINVAL;
2653 if (ret)
2654 goto out;
2655
2656 if (offset != file->f_pos) {
2657 file->f_pos = offset;
2658 file->f_version = 0;
2659 }
2660 2650
2661out: 2651out:
2662 mutex_unlock(&inode->i_mutex); 2652 mutex_unlock(&inode->i_mutex);
@@ -2712,7 +2702,7 @@ const struct file_operations ocfs2_fops = {
2712const struct file_operations ocfs2_dops = { 2702const struct file_operations ocfs2_dops = {
2713 .llseek = generic_file_llseek, 2703 .llseek = generic_file_llseek,
2714 .read = generic_read_dir, 2704 .read = generic_read_dir,
2715 .readdir = ocfs2_readdir, 2705 .iterate = ocfs2_readdir,
2716 .fsync = ocfs2_sync_file, 2706 .fsync = ocfs2_sync_file,
2717 .release = ocfs2_dir_release, 2707 .release = ocfs2_dir_release,
2718 .open = ocfs2_dir_open, 2708 .open = ocfs2_dir_open,
@@ -2759,7 +2749,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
2759const struct file_operations ocfs2_dops_no_plocks = { 2749const struct file_operations ocfs2_dops_no_plocks = {
2760 .llseek = generic_file_llseek, 2750 .llseek = generic_file_llseek,
2761 .read = generic_read_dir, 2751 .read = generic_read_dir,
2762 .readdir = ocfs2_readdir, 2752 .iterate = ocfs2_readdir,
2763 .fsync = ocfs2_sync_file, 2753 .fsync = ocfs2_sync_file,
2764 .release = ocfs2_dir_release, 2754 .release = ocfs2_dir_release,
2765 .open = ocfs2_dir_open, 2755 .open = ocfs2_dir_open,
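ocfs2_file_llseek() drops its open-coded bounds checking in favour of the new vfs_setpos() helper, which centralizes exactly what the removed lines did: reject a negative offset (unless the file allows unsigned offsets) or one beyond the given maximum, and otherwise update f_pos and zero f_version when the position actually changes. Usage, per the hunk above:

offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
/* on failure, offset now holds -EINVAL and the file position is unchanged */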
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8eccfabcd12e..242170d83971 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1941,6 +1941,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
1941} 1941}
1942 1942
1943struct ocfs2_orphan_filldir_priv { 1943struct ocfs2_orphan_filldir_priv {
1944 struct dir_context ctx;
1944 struct inode *head; 1945 struct inode *head;
1945 struct ocfs2_super *osb; 1946 struct ocfs2_super *osb;
1946}; 1947};
@@ -1977,11 +1978,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1977{ 1978{
1978 int status; 1979 int status;
1979 struct inode *orphan_dir_inode = NULL; 1980 struct inode *orphan_dir_inode = NULL;
1980 struct ocfs2_orphan_filldir_priv priv; 1981 struct ocfs2_orphan_filldir_priv priv = {
1981 loff_t pos = 0; 1982 .ctx.actor = ocfs2_orphan_filldir,
1982 1983 .osb = osb,
1983 priv.osb = osb; 1984 .head = *head
1984 priv.head = *head; 1985 };
1985 1986
1986 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1987 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1987 ORPHAN_DIR_SYSTEM_INODE, 1988 ORPHAN_DIR_SYSTEM_INODE,
@@ -1999,8 +2000,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1999 goto out; 2000 goto out;
2000 } 2001 }
2001 2002
2002 status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv, 2003 status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
2003 ocfs2_orphan_filldir);
2004 if (status) { 2004 if (status) {
2005 mlog_errno(status); 2005 mlog_errno(status);
2006 goto out_cluster; 2006 goto out_cluster;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 04ee1b57c243..b4a5cdf9dbc5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -947,7 +947,7 @@ leave:
947 ocfs2_free_dir_lookup_result(&orphan_insert); 947 ocfs2_free_dir_lookup_result(&orphan_insert);
948 ocfs2_free_dir_lookup_result(&lookup); 948 ocfs2_free_dir_lookup_result(&lookup);
949 949
950 if (status) 950 if (status && (status != -ENOTEMPTY))
951 mlog_errno(status); 951 mlog_errno(status);
952 952
953 return status; 953 return status;
@@ -2216,7 +2216,7 @@ out:
2216 2216
2217 brelse(orphan_dir_bh); 2217 brelse(orphan_dir_bh);
2218 2218
2219 return 0; 2219 return ret;
2220} 2220}
2221 2221
2222int ocfs2_create_inode_in_orphan(struct inode *dir, 2222int ocfs2_create_inode_in_orphan(struct inode *dir,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index acbaebcad3a8..1b8e9e8405b2 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -327,26 +327,23 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
327 return is_bad; 327 return is_bad;
328} 328}
329 329
330static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, 330static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
331 u64 fsblock, int hindex) 331 u64 fsblock, int hindex)
332{ 332{
333 struct inode *dir = file_inode(filp);
334 struct buffer_head *bh;
335 struct omfs_inode *oi;
336 u64 self;
337 int res = 0;
338 unsigned char d_type;
339
340 /* follow chain in this bucket */ 333 /* follow chain in this bucket */
341 while (fsblock != ~0) { 334 while (fsblock != ~0) {
342 bh = omfs_bread(dir->i_sb, fsblock); 335 struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock);
336 struct omfs_inode *oi;
337 u64 self;
338 unsigned char d_type;
339
343 if (!bh) 340 if (!bh)
344 goto out; 341 return true;
345 342
346 oi = (struct omfs_inode *) bh->b_data; 343 oi = (struct omfs_inode *) bh->b_data;
347 if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { 344 if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
348 brelse(bh); 345 brelse(bh);
349 goto out; 346 return true;
350 } 347 }
351 348
352 self = fsblock; 349 self = fsblock;
@@ -361,15 +358,16 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
361 358
362 d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; 359 d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
363 360
364 res = filldir(dirent, oi->i_name, strnlen(oi->i_name, 361 if (!dir_emit(ctx, oi->i_name,
365 OMFS_NAMELEN), filp->f_pos, self, d_type); 362 strnlen(oi->i_name, OMFS_NAMELEN),
363 self, d_type)) {
364 brelse(bh);
365 return false;
366 }
366 brelse(bh); 367 brelse(bh);
367 if (res < 0) 368 ctx->pos++;
368 break;
369 filp->f_pos++;
370 } 369 }
371out: 370 return true;
372 return res;
373} 371}
374 372
375static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, 373static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -403,60 +401,44 @@ out:
403 return err; 401 return err;
404} 402}
405 403
406static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 404static int omfs_readdir(struct file *file, struct dir_context *ctx)
407{ 405{
408 struct inode *dir = file_inode(filp); 406 struct inode *dir = file_inode(file);
409 struct buffer_head *bh; 407 struct buffer_head *bh;
410 loff_t offset, res; 408 __be64 *p;
411 unsigned int hchain, hindex; 409 unsigned int hchain, hindex;
412 int nbuckets; 410 int nbuckets;
413 u64 fsblock; 411
414 int ret = -EINVAL; 412 if (ctx->pos >> 32)
415 413 return -EINVAL;
416 if (filp->f_pos >> 32) 414
417 goto success; 415 if (ctx->pos < 1 << 20) {
418 416 if (!dir_emit_dots(file, ctx))
419 switch ((unsigned long) filp->f_pos) { 417 return 0;
420 case 0: 418 ctx->pos = 1 << 20;
421 if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
422 goto success;
423 filp->f_pos++;
424 /* fall through */
425 case 1:
426 if (filldir(dirent, "..", 2, 1,
427 parent_ino(filp->f_dentry), DT_DIR) < 0)
428 goto success;
429 filp->f_pos = 1 << 20;
430 /* fall through */
431 } 419 }
432 420
433 nbuckets = (dir->i_size - OMFS_DIR_START) / 8; 421 nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
434 422
435 /* high 12 bits store bucket + 1 and low 20 bits store hash index */ 423 /* high 12 bits store bucket + 1 and low 20 bits store hash index */
436 hchain = (filp->f_pos >> 20) - 1; 424 hchain = (ctx->pos >> 20) - 1;
437 hindex = filp->f_pos & 0xfffff; 425 hindex = ctx->pos & 0xfffff;
438 426
439 bh = omfs_bread(dir->i_sb, dir->i_ino); 427 bh = omfs_bread(dir->i_sb, dir->i_ino);
440 if (!bh) 428 if (!bh)
441 goto out; 429 return -EINVAL;
442 430
443 offset = OMFS_DIR_START + hchain * 8; 431 p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain;
444 432
445 for (; hchain < nbuckets; hchain++, offset += 8) { 433 for (; hchain < nbuckets; hchain++) {
446 fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset])); 434 __u64 fsblock = be64_to_cpu(*p++);
447 435 if (!omfs_fill_chain(dir, ctx, fsblock, hindex))
448 res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex);
449 hindex = 0;
450 if (res < 0)
451 break; 436 break;
452 437 hindex = 0;
453 filp->f_pos = (hchain+2) << 20; 438 ctx->pos = (hchain+2) << 20;
454 } 439 }
455 brelse(bh); 440 brelse(bh);
456success: 441 return 0;
457 ret = 0;
458out:
459 return ret;
460} 442}
461 443
462const struct inode_operations omfs_dir_inops = { 444const struct inode_operations omfs_dir_inops = {
@@ -470,6 +452,6 @@ const struct inode_operations omfs_dir_inops = {
470 452
471const struct file_operations omfs_dir_operations = { 453const struct file_operations omfs_dir_operations = {
472 .read = generic_read_dir, 454 .read = generic_read_dir,
473 .readdir = omfs_readdir, 455 .iterate = omfs_readdir,
474 .llseek = generic_file_llseek, 456 .llseek = generic_file_llseek,
475}; 457};
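OMFS encodes the directory position as bucket + 1 in the high 12 bits and the hash-chain index in the low 20 bits, with the dot entries living below 1 << 20; that is why the converted omfs_readdir() rejects any position that does not fit in 32 bits and jumps ctx->pos to 1 << 20 after emitting the dots. A small, compilable check of the encode/decode arithmetic used above:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t bucket = 5, index = 0xabc;
	uint64_t pos = (bucket + 1) << 20 | index;	/* encode */

	assert(pos >= (uint64_t)1 << 20);	/* past the "." / ".." range */
	assert((pos >> 20) - 1 == bucket);	/* hchain in the readdir code */
	assert((pos & 0xfffff) == index);	/* hindex in the readdir code */
	printf("pos = 0x%llx\n", (unsigned long long)pos);
	return 0;
}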
diff --git a/fs/open.c b/fs/open.c
index 8c741002f947..fca72c4d3f17 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -840,11 +840,15 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
840 if (flags & __O_SYNC) 840 if (flags & __O_SYNC)
841 flags |= O_DSYNC; 841 flags |= O_DSYNC;
842 842
843 /* 843 if (flags & O_TMPFILE) {
844 * If we have O_PATH in the open flag. Then we 844 if (!(flags & O_CREAT))
845 * cannot have anything other than the below set of flags 845 return -EINVAL;
846 */ 846 acc_mode = MAY_OPEN | ACC_MODE(flags);
847 if (flags & O_PATH) { 847 } else if (flags & O_PATH) {
848 /*
849 * If we have O_PATH in the open flag. Then we
850 * cannot have anything other than the below set of flags
851 */
848 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; 852 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
849 acc_mode = 0; 853 acc_mode = 0;
850 } else { 854 } else {
@@ -876,7 +880,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
876 lookup_flags |= LOOKUP_DIRECTORY; 880 lookup_flags |= LOOKUP_DIRECTORY;
877 if (!(flags & O_NOFOLLOW)) 881 if (!(flags & O_NOFOLLOW))
878 lookup_flags |= LOOKUP_FOLLOW; 882 lookup_flags |= LOOKUP_FOLLOW;
879 return lookup_flags; 883 op->lookup_flags = lookup_flags;
884 return 0;
880} 885}
881 886
882/** 887/**
@@ -893,8 +898,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
893struct file *file_open_name(struct filename *name, int flags, umode_t mode) 898struct file *file_open_name(struct filename *name, int flags, umode_t mode)
894{ 899{
895 struct open_flags op; 900 struct open_flags op;
896 int lookup = build_open_flags(flags, mode, &op); 901 int err = build_open_flags(flags, mode, &op);
897 return do_filp_open(AT_FDCWD, name, &op, lookup); 902 return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
898} 903}
899 904
900/** 905/**
@@ -919,37 +924,43 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
919 const char *filename, int flags) 924 const char *filename, int flags)
920{ 925{
921 struct open_flags op; 926 struct open_flags op;
922 int lookup = build_open_flags(flags, 0, &op); 927 int err = build_open_flags(flags, 0, &op);
928 if (err)
929 return ERR_PTR(err);
923 if (flags & O_CREAT) 930 if (flags & O_CREAT)
924 return ERR_PTR(-EINVAL); 931 return ERR_PTR(-EINVAL);
925 if (!filename && (flags & O_DIRECTORY)) 932 if (!filename && (flags & O_DIRECTORY))
926 if (!dentry->d_inode->i_op->lookup) 933 if (!dentry->d_inode->i_op->lookup)
927 return ERR_PTR(-ENOTDIR); 934 return ERR_PTR(-ENOTDIR);
928 return do_file_open_root(dentry, mnt, filename, &op, lookup); 935 return do_file_open_root(dentry, mnt, filename, &op);
929} 936}
930EXPORT_SYMBOL(file_open_root); 937EXPORT_SYMBOL(file_open_root);
931 938
932long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) 939long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
933{ 940{
934 struct open_flags op; 941 struct open_flags op;
935 int lookup = build_open_flags(flags, mode, &op); 942 int fd = build_open_flags(flags, mode, &op);
936 struct filename *tmp = getname(filename); 943 struct filename *tmp;
937 int fd = PTR_ERR(tmp); 944
938 945 if (fd)
939 if (!IS_ERR(tmp)) { 946 return fd;
940 fd = get_unused_fd_flags(flags); 947
941 if (fd >= 0) { 948 tmp = getname(filename);
942 struct file *f = do_filp_open(dfd, tmp, &op, lookup); 949 if (IS_ERR(tmp))
943 if (IS_ERR(f)) { 950 return PTR_ERR(tmp);
944 put_unused_fd(fd); 951
945 fd = PTR_ERR(f); 952 fd = get_unused_fd_flags(flags);
946 } else { 953 if (fd >= 0) {
947 fsnotify_open(f); 954 struct file *f = do_filp_open(dfd, tmp, &op);
948 fd_install(fd, f); 955 if (IS_ERR(f)) {
949 } 956 put_unused_fd(fd);
957 fd = PTR_ERR(f);
958 } else {
959 fsnotify_open(f);
960 fd_install(fd, f);
950 } 961 }
951 putname(tmp);
952 } 962 }
963 putname(tmp);
953 return fd; 964 return fd;
954} 965}
955 966
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 75885ffde44e..8c0ceb8dd1f7 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -162,11 +162,11 @@ static const struct file_operations openpromfs_prop_ops = {
162 .release = seq_release, 162 .release = seq_release,
163}; 163};
164 164
165static int openpromfs_readdir(struct file *, void *, filldir_t); 165static int openpromfs_readdir(struct file *, struct dir_context *);
166 166
167static const struct file_operations openprom_operations = { 167static const struct file_operations openprom_operations = {
168 .read = generic_read_dir, 168 .read = generic_read_dir,
169 .readdir = openpromfs_readdir, 169 .iterate = openpromfs_readdir,
170 .llseek = generic_file_llseek, 170 .llseek = generic_file_llseek,
171}; 171};
172 172
@@ -260,71 +260,64 @@ found:
260 return NULL; 260 return NULL;
261} 261}
262 262
263static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 263static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
264{ 264{
265 struct inode *inode = file_inode(filp); 265 struct inode *inode = file_inode(file);
266 struct op_inode_info *oi = OP_I(inode); 266 struct op_inode_info *oi = OP_I(inode);
267 struct device_node *dp = oi->u.node; 267 struct device_node *dp = oi->u.node;
268 struct device_node *child; 268 struct device_node *child;
269 struct property *prop; 269 struct property *prop;
270 unsigned int ino;
271 int i; 270 int i;
272 271
273 mutex_lock(&op_mutex); 272 mutex_lock(&op_mutex);
274 273
275 ino = inode->i_ino; 274 if (ctx->pos == 0) {
276 i = filp->f_pos; 275 if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
277 switch (i) {
278 case 0:
279 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
280 goto out; 276 goto out;
281 i++; 277 ctx->pos = 1;
282 filp->f_pos++; 278 }
283 /* fall thru */ 279 if (ctx->pos == 1) {
284 case 1: 280 if (!dir_emit(ctx, "..", 2,
285 if (filldir(dirent, "..", 2, i,
286 (dp->parent == NULL ? 281 (dp->parent == NULL ?
287 OPENPROM_ROOT_INO : 282 OPENPROM_ROOT_INO :
288 dp->parent->unique_id), DT_DIR) < 0) 283 dp->parent->unique_id), DT_DIR))
289 goto out; 284 goto out;
290 i++; 285 ctx->pos = 2;
291 filp->f_pos++; 286 }
292 /* fall thru */ 287 i = ctx->pos - 2;
293 default:
294 i -= 2;
295
296 /* First, the children nodes as directories. */
297 child = dp->child;
298 while (i && child) {
299 child = child->sibling;
300 i--;
301 }
302 while (child) {
303 if (filldir(dirent,
304 child->path_component_name,
305 strlen(child->path_component_name),
306 filp->f_pos, child->unique_id, DT_DIR) < 0)
307 goto out;
308
309 filp->f_pos++;
310 child = child->sibling;
311 }
312 288
313 /* Next, the properties as files. */ 289 /* First, the children nodes as directories. */
314 prop = dp->properties; 290 child = dp->child;
315 while (i && prop) { 291 while (i && child) {
316 prop = prop->next; 292 child = child->sibling;
317 i--; 293 i--;
318 } 294 }
319 while (prop) { 295 while (child) {
320 if (filldir(dirent, prop->name, strlen(prop->name), 296 if (!dir_emit(ctx,
321 filp->f_pos, prop->unique_id, DT_REG) < 0) 297 child->path_component_name,
322 goto out; 298 strlen(child->path_component_name),
299 child->unique_id, DT_DIR))
300 goto out;
323 301
324 filp->f_pos++; 302 ctx->pos++;
325 prop = prop->next; 303 child = child->sibling;
326 } 304 }
305
306 /* Next, the properties as files. */
307 prop = dp->properties;
308 while (i && prop) {
309 prop = prop->next;
310 i--;
327 } 311 }
312 while (prop) {
313 if (!dir_emit(ctx, prop->name, strlen(prop->name),
314 prop->unique_id, DT_REG))
315 goto out;
316
317 ctx->pos++;
318 prop = prop->next;
319 }
320
328out: 321out:
329 mutex_unlock(&op_mutex); 322 mutex_unlock(&op_mutex);
330 return 0; 323 return 0;
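Note: the openpromfs rewrite shows the standard resume logic for list-backed directories once the switch/fall-through state machine is gone: positions 0 and 1 are the dots, and ctx->pos - 2 already-emitted entries are skipped on re-entry. A condensed sketch of the skip-then-emit idiom, with the list type invented for illustration:

        #include <linux/fs.h>
        #include <linux/string.h>

        struct entry {
                struct entry *next;
                const char *name;
                u64 ino;
                unsigned type;
        };

        static int list_dir_iterate(struct file *file, struct dir_context *ctx,
                                    struct entry *head)
        {
                struct entry *e = head;
                loff_t skip;

                if (!dir_emit_dots(file, ctx))
                        return 0;
                skip = ctx->pos - 2;            /* dots occupy positions 0 and 1 */
                while (skip-- > 0 && e)         /* fast-forward to the saved cursor */
                        e = e->next;
                for (; e; e = e->next) {
                        if (!dir_emit(ctx, e->name, strlen(e->name), e->ino, e->type))
                                return 0;       /* buffer full; resume here next call */
                        ctx->pos++;
                }
                return 0;
        }

openpromfs does this twice in a row, children first and then properties, which is why the skip count i is carried from one loop into the next.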
diff --git a/fs/pnode.c b/fs/pnode.c
index 3d2a7141b87a..9af0df15256e 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -83,7 +83,8 @@ static int do_make_slave(struct mount *mnt)
83 if (peer_mnt == mnt) 83 if (peer_mnt == mnt)
84 peer_mnt = NULL; 84 peer_mnt = NULL;
85 } 85 }
86 if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) 86 if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) &&
87 list_empty(&mnt->mnt_share))
87 mnt_release_group_id(mnt); 88 mnt_release_group_id(mnt);
88 89
89 list_del_init(&mnt->mnt_share); 90 list_del_init(&mnt->mnt_share);
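Note: the lone pnode.c change guards mnt_release_group_id() with mnt->mnt_group_id. The old condition could apparently reach the release path for a mount whose peer-group id was never allocated (still 0); the extra check makes the release conditional on an id actually being held, so id 0 is never handed back to the allocator.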
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd51e50001fe..1485e38daaa3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1681,46 +1681,34 @@ const struct dentry_operations pid_dentry_operations =
1681 * reported by readdir in sync with the inode numbers reported 1681 * reported by readdir in sync with the inode numbers reported
1682 * by stat. 1682 * by stat.
1683 */ 1683 */
1684int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 1684bool proc_fill_cache(struct file *file, struct dir_context *ctx,
1685 const char *name, int len, 1685 const char *name, int len,
1686 instantiate_t instantiate, struct task_struct *task, const void *ptr) 1686 instantiate_t instantiate, struct task_struct *task, const void *ptr)
1687{ 1687{
1688 struct dentry *child, *dir = filp->f_path.dentry; 1688 struct dentry *child, *dir = file->f_path.dentry;
1689 struct qstr qname = QSTR_INIT(name, len);
1689 struct inode *inode; 1690 struct inode *inode;
1690 struct qstr qname; 1691 unsigned type;
1691 ino_t ino = 0; 1692 ino_t ino;
1692 unsigned type = DT_UNKNOWN;
1693
1694 qname.name = name;
1695 qname.len = len;
1696 qname.hash = full_name_hash(name, len);
1697 1693
1698 child = d_lookup(dir, &qname); 1694 child = d_hash_and_lookup(dir, &qname);
1699 if (!child) { 1695 if (!child) {
1700 struct dentry *new; 1696 child = d_alloc(dir, &qname);
1701 new = d_alloc(dir, &qname); 1697 if (!child)
1702 if (new) { 1698 goto end_instantiate;
1703 child = instantiate(dir->d_inode, new, task, ptr); 1699 if (instantiate(dir->d_inode, child, task, ptr) < 0) {
1704 if (child) 1700 dput(child);
1705 dput(new); 1701 goto end_instantiate;
1706 else
1707 child = new;
1708 } 1702 }
1709 } 1703 }
1710 if (!child || IS_ERR(child) || !child->d_inode)
1711 goto end_instantiate;
1712 inode = child->d_inode; 1704 inode = child->d_inode;
1713 if (inode) { 1705 ino = inode->i_ino;
1714 ino = inode->i_ino; 1706 type = inode->i_mode >> 12;
1715 type = inode->i_mode >> 12;
1716 }
1717 dput(child); 1707 dput(child);
1708 return dir_emit(ctx, name, len, ino, type);
1709
1718end_instantiate: 1710end_instantiate:
1719 if (!ino) 1711 return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
1720 ino = find_inode_number(dir, &qname);
1721 if (!ino)
1722 ino = 1;
1723 return filldir(dirent, name, len, filp->f_pos, ino, type);
1724} 1712}
1725 1713
1726#ifdef CONFIG_CHECKPOINT_RESTORE 1714#ifdef CONFIG_CHECKPOINT_RESTORE
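Note: proc_fill_cache() changes contract here: instead of propagating a filldir() return code it now returns a bool meaning "keep iterating", and the hand-rolled qstr setup with full_name_hash() collapses into QSTR_INIT() plus d_hash_and_lookup(). The find_inode_number() fallback disappears too; on instantiation failure the entry is simply emitted with inode 1 and DT_UNKNOWN. Every readdir loop in this file consumes it the same way:

        if (!proc_fill_cache(file, ctx, p->name, p->len,
                             proc_pident_instantiate, task, p))
                break;          /* user buffer full; ctx->pos marks the resume point */
        ctx->pos++;

so the position is only advanced for entries that actually reached the user buffer.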
@@ -1846,7 +1834,7 @@ struct map_files_info {
1846 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 1834 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
1847}; 1835};
1848 1836
1849static struct dentry * 1837static int
1850proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 1838proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1851 struct task_struct *task, const void *ptr) 1839 struct task_struct *task, const void *ptr)
1852{ 1840{
@@ -1856,7 +1844,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1856 1844
1857 inode = proc_pid_make_inode(dir->i_sb, task); 1845 inode = proc_pid_make_inode(dir->i_sb, task);
1858 if (!inode) 1846 if (!inode)
1859 return ERR_PTR(-ENOENT); 1847 return -ENOENT;
1860 1848
1861 ei = PROC_I(inode); 1849 ei = PROC_I(inode);
1862 ei->op.proc_get_link = proc_map_files_get_link; 1850 ei->op.proc_get_link = proc_map_files_get_link;
@@ -1873,7 +1861,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1873 d_set_d_op(dentry, &tid_map_files_dentry_operations); 1861 d_set_d_op(dentry, &tid_map_files_dentry_operations);
1874 d_add(dentry, inode); 1862 d_add(dentry, inode);
1875 1863
1876 return NULL; 1864 return 0;
1877} 1865}
1878 1866
1879static struct dentry *proc_map_files_lookup(struct inode *dir, 1867static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -1882,23 +1870,23 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
1882 unsigned long vm_start, vm_end; 1870 unsigned long vm_start, vm_end;
1883 struct vm_area_struct *vma; 1871 struct vm_area_struct *vma;
1884 struct task_struct *task; 1872 struct task_struct *task;
1885 struct dentry *result; 1873 int result;
1886 struct mm_struct *mm; 1874 struct mm_struct *mm;
1887 1875
1888 result = ERR_PTR(-EPERM); 1876 result = -EPERM;
1889 if (!capable(CAP_SYS_ADMIN)) 1877 if (!capable(CAP_SYS_ADMIN))
1890 goto out; 1878 goto out;
1891 1879
1892 result = ERR_PTR(-ENOENT); 1880 result = -ENOENT;
1893 task = get_proc_task(dir); 1881 task = get_proc_task(dir);
1894 if (!task) 1882 if (!task)
1895 goto out; 1883 goto out;
1896 1884
1897 result = ERR_PTR(-EACCES); 1885 result = -EACCES;
1898 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 1886 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1899 goto out_put_task; 1887 goto out_put_task;
1900 1888
1901 result = ERR_PTR(-ENOENT); 1889 result = -ENOENT;
1902 if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) 1890 if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
1903 goto out_put_task; 1891 goto out_put_task;
1904 1892
@@ -1921,7 +1909,7 @@ out_no_vma:
1921out_put_task: 1909out_put_task:
1922 put_task_struct(task); 1910 put_task_struct(task);
1923out: 1911out:
1924 return result; 1912 return ERR_PTR(result);
1925} 1913}
1926 1914
1927static const struct inode_operations proc_map_files_inode_operations = { 1915static const struct inode_operations proc_map_files_inode_operations = {
@@ -1931,14 +1919,15 @@ static const struct inode_operations proc_map_files_inode_operations = {
1931}; 1919};
1932 1920
1933static int 1921static int
1934proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) 1922proc_map_files_readdir(struct file *file, struct dir_context *ctx)
1935{ 1923{
1936 struct dentry *dentry = filp->f_path.dentry;
1937 struct inode *inode = dentry->d_inode;
1938 struct vm_area_struct *vma; 1924 struct vm_area_struct *vma;
1939 struct task_struct *task; 1925 struct task_struct *task;
1940 struct mm_struct *mm; 1926 struct mm_struct *mm;
1941 ino_t ino; 1927 unsigned long nr_files, pos, i;
1928 struct flex_array *fa = NULL;
1929 struct map_files_info info;
1930 struct map_files_info *p;
1942 int ret; 1931 int ret;
1943 1932
1944 ret = -EPERM; 1933 ret = -EPERM;
@@ -1946,7 +1935,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
1946 goto out; 1935 goto out;
1947 1936
1948 ret = -ENOENT; 1937 ret = -ENOENT;
1949 task = get_proc_task(inode); 1938 task = get_proc_task(file_inode(file));
1950 if (!task) 1939 if (!task)
1951 goto out; 1940 goto out;
1952 1941
@@ -1955,91 +1944,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
1955 goto out_put_task; 1944 goto out_put_task;
1956 1945
1957 ret = 0; 1946 ret = 0;
1958 switch (filp->f_pos) { 1947 if (!dir_emit_dots(file, ctx))
1959 case 0: 1948 goto out_put_task;
1960 ino = inode->i_ino;
1961 if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
1962 goto out_put_task;
1963 filp->f_pos++;
1964 case 1:
1965 ino = parent_ino(dentry);
1966 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1967 goto out_put_task;
1968 filp->f_pos++;
1969 default:
1970 {
1971 unsigned long nr_files, pos, i;
1972 struct flex_array *fa = NULL;
1973 struct map_files_info info;
1974 struct map_files_info *p;
1975
1976 mm = get_task_mm(task);
1977 if (!mm)
1978 goto out_put_task;
1979 down_read(&mm->mmap_sem);
1980 1949
1981 nr_files = 0; 1950 mm = get_task_mm(task);
1951 if (!mm)
1952 goto out_put_task;
1953 down_read(&mm->mmap_sem);
1982 1954
1983 /* 1955 nr_files = 0;
1984 * We need two passes here:
1985 *
1986 * 1) Collect vmas of mapped files with mmap_sem taken
1987 * 2) Release mmap_sem and instantiate entries
1988 *
1989 * otherwise we get lockdep complained, since filldir()
1990 * routine might require mmap_sem taken in might_fault().
1991 */
1992 1956
1993 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { 1957 /*
1994 if (vma->vm_file && ++pos > filp->f_pos) 1958 * We need two passes here:
1995 nr_files++; 1959 *
1996 } 1960 * 1) Collect vmas of mapped files with mmap_sem taken
1961 * 2) Release mmap_sem and instantiate entries
1962 *
 1963 * otherwise lockdep complains, since the filldir()
 1964 * routine might take mmap_sem in might_fault().
1965 */
1997 1966
1998 if (nr_files) { 1967 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
1999 fa = flex_array_alloc(sizeof(info), nr_files, 1968 if (vma->vm_file && ++pos > ctx->pos)
2000 GFP_KERNEL); 1969 nr_files++;
2001 if (!fa || flex_array_prealloc(fa, 0, nr_files, 1970 }
2002 GFP_KERNEL)) { 1971
2003 ret = -ENOMEM; 1972 if (nr_files) {
2004 if (fa) 1973 fa = flex_array_alloc(sizeof(info), nr_files,
2005 flex_array_free(fa); 1974 GFP_KERNEL);
2006 up_read(&mm->mmap_sem); 1975 if (!fa || flex_array_prealloc(fa, 0, nr_files,
2007 mmput(mm); 1976 GFP_KERNEL)) {
2008 goto out_put_task; 1977 ret = -ENOMEM;
2009 } 1978 if (fa)
2010 for (i = 0, vma = mm->mmap, pos = 2; vma; 1979 flex_array_free(fa);
2011 vma = vma->vm_next) { 1980 up_read(&mm->mmap_sem);
2012 if (!vma->vm_file) 1981 mmput(mm);
2013 continue; 1982 goto out_put_task;
2014 if (++pos <= filp->f_pos)
2015 continue;
2016
2017 info.mode = vma->vm_file->f_mode;
2018 info.len = snprintf(info.name,
2019 sizeof(info.name), "%lx-%lx",
2020 vma->vm_start, vma->vm_end);
2021 if (flex_array_put(fa, i++, &info, GFP_KERNEL))
2022 BUG();
2023 }
2024 } 1983 }
2025 up_read(&mm->mmap_sem); 1984 for (i = 0, vma = mm->mmap, pos = 2; vma;
2026 1985 vma = vma->vm_next) {
2027 for (i = 0; i < nr_files; i++) { 1986 if (!vma->vm_file)
2028 p = flex_array_get(fa, i); 1987 continue;
2029 ret = proc_fill_cache(filp, dirent, filldir, 1988 if (++pos <= ctx->pos)
2030 p->name, p->len, 1989 continue;
2031 proc_map_files_instantiate, 1990
2032 task, 1991 info.mode = vma->vm_file->f_mode;
2033 (void *)(unsigned long)p->mode); 1992 info.len = snprintf(info.name,
2034 if (ret) 1993 sizeof(info.name), "%lx-%lx",
2035 break; 1994 vma->vm_start, vma->vm_end);
2036 filp->f_pos++; 1995 if (flex_array_put(fa, i++, &info, GFP_KERNEL))
1996 BUG();
2037 } 1997 }
2038 if (fa)
2039 flex_array_free(fa);
2040 mmput(mm);
2041 } 1998 }
1999 up_read(&mm->mmap_sem);
2000
2001 for (i = 0; i < nr_files; i++) {
2002 p = flex_array_get(fa, i);
2003 if (!proc_fill_cache(file, ctx,
2004 p->name, p->len,
2005 proc_map_files_instantiate,
2006 task,
2007 (void *)(unsigned long)p->mode))
2008 break;
2009 ctx->pos++;
2042 } 2010 }
2011 if (fa)
2012 flex_array_free(fa);
2013 mmput(mm);
2043 2014
2044out_put_task: 2015out_put_task:
2045 put_task_struct(task); 2016 put_task_struct(task);
@@ -2049,7 +2020,7 @@ out:
2049 2020
2050static const struct file_operations proc_map_files_operations = { 2021static const struct file_operations proc_map_files_operations = {
2051 .read = generic_read_dir, 2022 .read = generic_read_dir,
2052 .readdir = proc_map_files_readdir, 2023 .iterate = proc_map_files_readdir,
2053 .llseek = default_llseek, 2024 .llseek = default_llseek,
2054}; 2025};
2055 2026
@@ -2118,6 +2089,7 @@ static int show_timer(struct seq_file *m, void *v)
2118 nstr[notify & ~SIGEV_THREAD_ID], 2089 nstr[notify & ~SIGEV_THREAD_ID],
2119 (notify & SIGEV_THREAD_ID) ? "tid" : "pid", 2090 (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
2120 pid_nr_ns(timer->it_pid, tp->ns)); 2091 pid_nr_ns(timer->it_pid, tp->ns));
2092 seq_printf(m, "ClockID: %d\n", timer->it_clock);
2121 2093
2122 return 0; 2094 return 0;
2123} 2095}
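Note: an unrelated rider in the same file: show_timer() now appends a ClockID: line to each /proc/<pid>/timers entry, printing the raw timer->it_clock value (0 being CLOCK_REALTIME).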
@@ -2151,13 +2123,12 @@ static const struct file_operations proc_timers_operations = {
2151}; 2123};
2152#endif /* CONFIG_CHECKPOINT_RESTORE */ 2124#endif /* CONFIG_CHECKPOINT_RESTORE */
2153 2125
2154static struct dentry *proc_pident_instantiate(struct inode *dir, 2126static int proc_pident_instantiate(struct inode *dir,
2155 struct dentry *dentry, struct task_struct *task, const void *ptr) 2127 struct dentry *dentry, struct task_struct *task, const void *ptr)
2156{ 2128{
2157 const struct pid_entry *p = ptr; 2129 const struct pid_entry *p = ptr;
2158 struct inode *inode; 2130 struct inode *inode;
2159 struct proc_inode *ei; 2131 struct proc_inode *ei;
2160 struct dentry *error = ERR_PTR(-ENOENT);
2161 2132
2162 inode = proc_pid_make_inode(dir->i_sb, task); 2133 inode = proc_pid_make_inode(dir->i_sb, task);
2163 if (!inode) 2134 if (!inode)
@@ -2176,9 +2147,9 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2176 d_add(dentry, inode); 2147 d_add(dentry, inode);
2177 /* Close the race of the process dying before we return the dentry */ 2148 /* Close the race of the process dying before we return the dentry */
2178 if (pid_revalidate(dentry, 0)) 2149 if (pid_revalidate(dentry, 0))
2179 error = NULL; 2150 return 0;
2180out: 2151out:
2181 return error; 2152 return -ENOENT;
2182} 2153}
2183 2154
2184static struct dentry *proc_pident_lookup(struct inode *dir, 2155static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2186,11 +2157,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2186 const struct pid_entry *ents, 2157 const struct pid_entry *ents,
2187 unsigned int nents) 2158 unsigned int nents)
2188{ 2159{
2189 struct dentry *error; 2160 int error;
2190 struct task_struct *task = get_proc_task(dir); 2161 struct task_struct *task = get_proc_task(dir);
2191 const struct pid_entry *p, *last; 2162 const struct pid_entry *p, *last;
2192 2163
2193 error = ERR_PTR(-ENOENT); 2164 error = -ENOENT;
2194 2165
2195 if (!task) 2166 if (!task)
2196 goto out_no_task; 2167 goto out_no_task;
@@ -2213,70 +2184,33 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2213out: 2184out:
2214 put_task_struct(task); 2185 put_task_struct(task);
2215out_no_task: 2186out_no_task:
2216 return error; 2187 return ERR_PTR(error);
2217}
2218
2219static int proc_pident_fill_cache(struct file *filp, void *dirent,
2220 filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2221{
2222 return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2223 proc_pident_instantiate, task, p);
2224} 2188}
2225 2189
2226static int proc_pident_readdir(struct file *filp, 2190static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
2227 void *dirent, filldir_t filldir,
2228 const struct pid_entry *ents, unsigned int nents) 2191 const struct pid_entry *ents, unsigned int nents)
2229{ 2192{
2230 int i; 2193 struct task_struct *task = get_proc_task(file_inode(file));
2231 struct dentry *dentry = filp->f_path.dentry; 2194 const struct pid_entry *p;
2232 struct inode *inode = dentry->d_inode;
2233 struct task_struct *task = get_proc_task(inode);
2234 const struct pid_entry *p, *last;
2235 ino_t ino;
2236 int ret;
2237 2195
2238 ret = -ENOENT;
2239 if (!task) 2196 if (!task)
2240 goto out_no_task; 2197 return -ENOENT;
2241 2198
2242 ret = 0; 2199 if (!dir_emit_dots(file, ctx))
2243 i = filp->f_pos; 2200 goto out;
2244 switch (i) { 2201
2245 case 0: 2202 if (ctx->pos >= nents + 2)
2246 ino = inode->i_ino; 2203 goto out;
2247 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
2248 goto out;
2249 i++;
2250 filp->f_pos++;
2251 /* fall through */
2252 case 1:
2253 ino = parent_ino(dentry);
2254 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
2255 goto out;
2256 i++;
2257 filp->f_pos++;
2258 /* fall through */
2259 default:
2260 i -= 2;
2261 if (i >= nents) {
2262 ret = 1;
2263 goto out;
2264 }
2265 p = ents + i;
2266 last = &ents[nents - 1];
2267 while (p <= last) {
2268 if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
2269 goto out;
2270 filp->f_pos++;
2271 p++;
2272 }
2273 }
2274 2204
2275 ret = 1; 2205 for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
2206 if (!proc_fill_cache(file, ctx, p->name, p->len,
2207 proc_pident_instantiate, task, p))
2208 break;
2209 ctx->pos++;
2210 }
2276out: 2211out:
2277 put_task_struct(task); 2212 put_task_struct(task);
2278out_no_task: 2213 return 0;
2279 return ret;
2280} 2214}
2281 2215
2282#ifdef CONFIG_SECURITY 2216#ifdef CONFIG_SECURITY
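Note: proc_pident_readdir() shrinks from a switch/fall-through state machine with a proc_pident_fill_cache() wrapper to dots plus a single bounds-checked loop: ctx->pos - 2 indexes straight into the pid_entry table, so ctx->pos >= nents + 2 means the directory is exhausted. Also worth noticing: the old code returned 1 on completion, a positive "done" value the readdir() protocol tolerated; the new ->iterate() contract is plain 0 or -errno.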
@@ -2361,16 +2295,15 @@ static const struct pid_entry attr_dir_stuff[] = {
2361 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), 2295 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2362}; 2296};
2363 2297
2364static int proc_attr_dir_readdir(struct file * filp, 2298static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
2365 void * dirent, filldir_t filldir)
2366{ 2299{
2367 return proc_pident_readdir(filp,dirent,filldir, 2300 return proc_pident_readdir(file, ctx,
2368 attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); 2301 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2369} 2302}
2370 2303
2371static const struct file_operations proc_attr_dir_operations = { 2304static const struct file_operations proc_attr_dir_operations = {
2372 .read = generic_read_dir, 2305 .read = generic_read_dir,
2373 .readdir = proc_attr_dir_readdir, 2306 .iterate = proc_attr_dir_readdir,
2374 .llseek = default_llseek, 2307 .llseek = default_llseek,
2375}; 2308};
2376 2309
@@ -2724,16 +2657,15 @@ static const struct pid_entry tgid_base_stuff[] = {
2724#endif 2657#endif
2725}; 2658};
2726 2659
2727static int proc_tgid_base_readdir(struct file * filp, 2660static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
2728 void * dirent, filldir_t filldir)
2729{ 2661{
2730 return proc_pident_readdir(filp,dirent,filldir, 2662 return proc_pident_readdir(file, ctx,
2731 tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); 2663 tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
2732} 2664}
2733 2665
2734static const struct file_operations proc_tgid_base_operations = { 2666static const struct file_operations proc_tgid_base_operations = {
2735 .read = generic_read_dir, 2667 .read = generic_read_dir,
2736 .readdir = proc_tgid_base_readdir, 2668 .iterate = proc_tgid_base_readdir,
2737 .llseek = default_llseek, 2669 .llseek = default_llseek,
2738}; 2670};
2739 2671
@@ -2835,11 +2767,10 @@ void proc_flush_task(struct task_struct *task)
2835 } 2767 }
2836} 2768}
2837 2769
2838static struct dentry *proc_pid_instantiate(struct inode *dir, 2770static int proc_pid_instantiate(struct inode *dir,
2839 struct dentry * dentry, 2771 struct dentry * dentry,
2840 struct task_struct *task, const void *ptr) 2772 struct task_struct *task, const void *ptr)
2841{ 2773{
2842 struct dentry *error = ERR_PTR(-ENOENT);
2843 struct inode *inode; 2774 struct inode *inode;
2844 2775
2845 inode = proc_pid_make_inode(dir->i_sb, task); 2776 inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2859,14 +2790,14 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2859 d_add(dentry, inode); 2790 d_add(dentry, inode);
2860 /* Close the race of the process dying before we return the dentry */ 2791 /* Close the race of the process dying before we return the dentry */
2861 if (pid_revalidate(dentry, 0)) 2792 if (pid_revalidate(dentry, 0))
2862 error = NULL; 2793 return 0;
2863out: 2794out:
2864 return error; 2795 return -ENOENT;
2865} 2796}
2866 2797
2867struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2798struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2868{ 2799{
2869 struct dentry *result = NULL; 2800 int result = 0;
2870 struct task_struct *task; 2801 struct task_struct *task;
2871 unsigned tgid; 2802 unsigned tgid;
2872 struct pid_namespace *ns; 2803 struct pid_namespace *ns;
@@ -2887,7 +2818,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
2887 result = proc_pid_instantiate(dir, dentry, task, NULL); 2818 result = proc_pid_instantiate(dir, dentry, task, NULL);
2888 put_task_struct(task); 2819 put_task_struct(task);
2889out: 2820out:
2890 return result; 2821 return ERR_PTR(result);
2891} 2822}
2892 2823
2893/* 2824/*
@@ -2935,58 +2866,42 @@ retry:
2935 2866
2936#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) 2867#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
2937 2868
2938static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2939 struct tgid_iter iter)
2940{
2941 char name[PROC_NUMBUF];
2942 int len = snprintf(name, sizeof(name), "%d", iter.tgid);
2943 return proc_fill_cache(filp, dirent, filldir, name, len,
2944 proc_pid_instantiate, iter.task, NULL);
2945}
2946
2947static int fake_filldir(void *buf, const char *name, int namelen,
2948 loff_t offset, u64 ino, unsigned d_type)
2949{
2950 return 0;
2951}
2952
2953/* for the /proc/ directory itself, after non-process stuff has been done */ 2869/* for the /proc/ directory itself, after non-process stuff has been done */
2954int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2870int proc_pid_readdir(struct file *file, struct dir_context *ctx)
2955{ 2871{
2956 struct tgid_iter iter; 2872 struct tgid_iter iter;
2957 struct pid_namespace *ns; 2873 struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
2958 filldir_t __filldir; 2874 loff_t pos = ctx->pos;
2959 loff_t pos = filp->f_pos;
2960 2875
2961 if (pos >= PID_MAX_LIMIT + TGID_OFFSET) 2876 if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
2962 goto out; 2877 return 0;
2963 2878
2964 if (pos == TGID_OFFSET - 1) { 2879 if (pos == TGID_OFFSET - 1) {
2965 if (proc_fill_cache(filp, dirent, filldir, "self", 4, 2880 struct inode *inode = ns->proc_self->d_inode;
2966 NULL, NULL, NULL) < 0) 2881 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
2967 goto out; 2882 return 0;
2968 iter.tgid = 0; 2883 iter.tgid = 0;
2969 } else { 2884 } else {
2970 iter.tgid = pos - TGID_OFFSET; 2885 iter.tgid = pos - TGID_OFFSET;
2971 } 2886 }
2972 iter.task = NULL; 2887 iter.task = NULL;
2973 ns = filp->f_dentry->d_sb->s_fs_info;
2974 for (iter = next_tgid(ns, iter); 2888 for (iter = next_tgid(ns, iter);
2975 iter.task; 2889 iter.task;
2976 iter.tgid += 1, iter = next_tgid(ns, iter)) { 2890 iter.tgid += 1, iter = next_tgid(ns, iter)) {
2977 if (has_pid_permissions(ns, iter.task, 2)) 2891 char name[PROC_NUMBUF];
2978 __filldir = filldir; 2892 int len;
2979 else 2893 if (!has_pid_permissions(ns, iter.task, 2))
2980 __filldir = fake_filldir; 2894 continue;
2981 2895
2982 filp->f_pos = iter.tgid + TGID_OFFSET; 2896 len = snprintf(name, sizeof(name), "%d", iter.tgid);
2983 if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { 2897 ctx->pos = iter.tgid + TGID_OFFSET;
2898 if (!proc_fill_cache(file, ctx, name, len,
2899 proc_pid_instantiate, iter.task, NULL)) {
2984 put_task_struct(iter.task); 2900 put_task_struct(iter.task);
2985 goto out; 2901 return 0;
2986 } 2902 }
2987 } 2903 }
2988 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2904 ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
2989out:
2990 return 0; 2905 return 0;
2991} 2906}
2992 2907
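Note: fake_filldir() existed only so that pids hidden by hidepid could be walked without emitting anything; with dir_emit() the loop can simply continue past them. Resume still works because the position encodes the tgid itself rather than a dense index:

        ctx->pos = iter.tgid + TGID_OFFSET;     /* position = tgid, so skipped
                                                 * (hidden) pids leave no holes
                                                 * to account for on re-entry */

The "self" symlink is also emitted directly with dir_emit() and an explicit DT_LNK instead of going through proc_fill_cache() with a NULL instantiate callback.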
@@ -3074,11 +2989,10 @@ static const struct pid_entry tid_base_stuff[] = {
3074#endif 2989#endif
3075}; 2990};
3076 2991
3077static int proc_tid_base_readdir(struct file * filp, 2992static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
3078 void * dirent, filldir_t filldir)
3079{ 2993{
3080 return proc_pident_readdir(filp,dirent,filldir, 2994 return proc_pident_readdir(file, ctx,
3081 tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); 2995 tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3082} 2996}
3083 2997
3084static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2998static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -3089,7 +3003,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
3089 3003
3090static const struct file_operations proc_tid_base_operations = { 3004static const struct file_operations proc_tid_base_operations = {
3091 .read = generic_read_dir, 3005 .read = generic_read_dir,
3092 .readdir = proc_tid_base_readdir, 3006 .iterate = proc_tid_base_readdir,
3093 .llseek = default_llseek, 3007 .llseek = default_llseek,
3094}; 3008};
3095 3009
@@ -3099,10 +3013,9 @@ static const struct inode_operations proc_tid_base_inode_operations = {
3099 .setattr = proc_setattr, 3013 .setattr = proc_setattr,
3100}; 3014};
3101 3015
3102static struct dentry *proc_task_instantiate(struct inode *dir, 3016static int proc_task_instantiate(struct inode *dir,
3103 struct dentry *dentry, struct task_struct *task, const void *ptr) 3017 struct dentry *dentry, struct task_struct *task, const void *ptr)
3104{ 3018{
3105 struct dentry *error = ERR_PTR(-ENOENT);
3106 struct inode *inode; 3019 struct inode *inode;
3107 inode = proc_pid_make_inode(dir->i_sb, task); 3020 inode = proc_pid_make_inode(dir->i_sb, task);
3108 3021
@@ -3121,14 +3034,14 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3121 d_add(dentry, inode); 3034 d_add(dentry, inode);
3122 /* Close the race of the process dying before we return the dentry */ 3035 /* Close the race of the process dying before we return the dentry */
3123 if (pid_revalidate(dentry, 0)) 3036 if (pid_revalidate(dentry, 0))
3124 error = NULL; 3037 return 0;
3125out: 3038out:
3126 return error; 3039 return -ENOENT;
3127} 3040}
3128 3041
3129static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 3042static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
3130{ 3043{
3131 struct dentry *result = ERR_PTR(-ENOENT); 3044 int result = -ENOENT;
3132 struct task_struct *task; 3045 struct task_struct *task;
3133 struct task_struct *leader = get_proc_task(dir); 3046 struct task_struct *leader = get_proc_task(dir);
3134 unsigned tid; 3047 unsigned tid;
@@ -3158,7 +3071,7 @@ out_drop_task:
3158out: 3071out:
3159 put_task_struct(leader); 3072 put_task_struct(leader);
3160out_no_task: 3073out_no_task:
3161 return result; 3074 return ERR_PTR(result);
3162} 3075}
3163 3076
3164/* 3077/*
@@ -3230,30 +3143,16 @@ static struct task_struct *next_tid(struct task_struct *start)
3230 return pos; 3143 return pos;
3231} 3144}
3232 3145
3233static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
3234 struct task_struct *task, int tid)
3235{
3236 char name[PROC_NUMBUF];
3237 int len = snprintf(name, sizeof(name), "%d", tid);
3238 return proc_fill_cache(filp, dirent, filldir, name, len,
3239 proc_task_instantiate, task, NULL);
3240}
3241
3242/* for the /proc/TGID/task/ directories */ 3146/* for the /proc/TGID/task/ directories */
3243static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) 3147static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3244{ 3148{
3245 struct dentry *dentry = filp->f_path.dentry;
3246 struct inode *inode = dentry->d_inode;
3247 struct task_struct *leader = NULL; 3149 struct task_struct *leader = NULL;
3248 struct task_struct *task; 3150 struct task_struct *task = get_proc_task(file_inode(file));
3249 int retval = -ENOENT;
3250 ino_t ino;
3251 int tid;
3252 struct pid_namespace *ns; 3151 struct pid_namespace *ns;
3152 int tid;
3253 3153
3254 task = get_proc_task(inode);
3255 if (!task) 3154 if (!task)
3256 goto out_no_task; 3155 return -ENOENT;
3257 rcu_read_lock(); 3156 rcu_read_lock();
3258 if (pid_alive(task)) { 3157 if (pid_alive(task)) {
3259 leader = task->group_leader; 3158 leader = task->group_leader;
@@ -3262,46 +3161,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3262 rcu_read_unlock(); 3161 rcu_read_unlock();
3263 put_task_struct(task); 3162 put_task_struct(task);
3264 if (!leader) 3163 if (!leader)
3265 goto out_no_task; 3164 return -ENOENT;
3266 retval = 0;
3267 3165
3268 switch ((unsigned long)filp->f_pos) { 3166 if (!dir_emit_dots(file, ctx))
3269 case 0: 3167 goto out;
3270 ino = inode->i_ino;
3271 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
3272 goto out;
3273 filp->f_pos++;
3274 /* fall through */
3275 case 1:
3276 ino = parent_ino(dentry);
3277 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
3278 goto out;
3279 filp->f_pos++;
3280 /* fall through */
3281 }
3282 3168
3283 /* f_version caches the tgid value that the last readdir call couldn't 3169 /* f_version caches the tgid value that the last readdir call couldn't
3284 * return. lseek aka telldir automagically resets f_version to 0. 3170 * return. lseek aka telldir automagically resets f_version to 0.
3285 */ 3171 */
3286 ns = filp->f_dentry->d_sb->s_fs_info; 3172 ns = file->f_dentry->d_sb->s_fs_info;
3287 tid = (int)filp->f_version; 3173 tid = (int)file->f_version;
3288 filp->f_version = 0; 3174 file->f_version = 0;
3289 for (task = first_tid(leader, tid, filp->f_pos - 2, ns); 3175 for (task = first_tid(leader, tid, ctx->pos - 2, ns);
3290 task; 3176 task;
3291 task = next_tid(task), filp->f_pos++) { 3177 task = next_tid(task), ctx->pos++) {
3178 char name[PROC_NUMBUF];
3179 int len;
3292 tid = task_pid_nr_ns(task, ns); 3180 tid = task_pid_nr_ns(task, ns);
3293 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { 3181 len = snprintf(name, sizeof(name), "%d", tid);
3182 if (!proc_fill_cache(file, ctx, name, len,
3183 proc_task_instantiate, task, NULL)) {
3294 /* returning this tgid failed, save it as the first 3184 /* returning this tgid failed, save it as the first
 3295 * pid for the next readdir call */ 3185 * pid for the next readdir call */
3296 filp->f_version = (u64)tid; 3186 file->f_version = (u64)tid;
3297 put_task_struct(task); 3187 put_task_struct(task);
3298 break; 3188 break;
3299 } 3189 }
3300 } 3190 }
3301out: 3191out:
3302 put_task_struct(leader); 3192 put_task_struct(leader);
3303out_no_task: 3193 return 0;
3304 return retval;
3305} 3194}
3306 3195
3307static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 3196static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -3327,6 +3216,6 @@ static const struct inode_operations proc_task_inode_operations = {
3327 3216
3328static const struct file_operations proc_task_operations = { 3217static const struct file_operations proc_task_operations = {
3329 .read = generic_read_dir, 3218 .read = generic_read_dir,
3330 .readdir = proc_task_readdir, 3219 .iterate = proc_task_readdir,
3331 .llseek = default_llseek, 3220 .llseek = default_llseek,
3332}; 3221};
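Note: proc_task_readdir() keeps its one unusual piece of cross-call state: a thread id that did not fit in the buffer is stashed in file->f_version (which lseek(), i.e. telldir/rewinddir, resets to 0), so the walk can restart from that exact thread even though thread lists have no stable indexing. Condensed from the hunk above:

        tid = (int)file->f_version;
        file->f_version = 0;                    /* consume the saved cursor */
        for (task = first_tid(leader, tid, ctx->pos - 2, ns);
             task;
             task = next_tid(task), ctx->pos++) {
                char name[PROC_NUMBUF];
                int len;

                tid = task_pid_nr_ns(task, ns);
                len = snprintf(name, sizeof(name), "%d", tid);
                if (!proc_fill_cache(file, ctx, name, len,
                                     proc_task_instantiate, task, NULL)) {
                        file->f_version = (u64)tid;     /* retry this tid next call */
                        put_task_struct(task);
                        break;
                }
        }

The conversion keeps that trick intact; only filp->f_pos becomes ctx->pos.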
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d7a4a28ef630..75f2890abbd8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -167,11 +167,10 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
167 return ret; 167 return ret;
168} 168}
169 169
170static struct dentry * 170static int
171proc_fd_instantiate(struct inode *dir, struct dentry *dentry, 171proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
172 struct task_struct *task, const void *ptr) 172 struct task_struct *task, const void *ptr)
173{ 173{
174 struct dentry *error = ERR_PTR(-ENOENT);
175 unsigned fd = (unsigned long)ptr; 174 unsigned fd = (unsigned long)ptr;
176 struct proc_inode *ei; 175 struct proc_inode *ei;
177 struct inode *inode; 176 struct inode *inode;
@@ -194,9 +193,9 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
194 193
195 /* Close the race of the process dying before we return the dentry */ 194 /* Close the race of the process dying before we return the dentry */
196 if (tid_fd_revalidate(dentry, 0)) 195 if (tid_fd_revalidate(dentry, 0))
197 error = NULL; 196 return 0;
198 out: 197 out:
199 return error; 198 return -ENOENT;
200} 199}
201 200
202static struct dentry *proc_lookupfd_common(struct inode *dir, 201static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -204,7 +203,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
204 instantiate_t instantiate) 203 instantiate_t instantiate)
205{ 204{
206 struct task_struct *task = get_proc_task(dir); 205 struct task_struct *task = get_proc_task(dir);
207 struct dentry *result = ERR_PTR(-ENOENT); 206 int result = -ENOENT;
208 unsigned fd = name_to_int(dentry); 207 unsigned fd = name_to_int(dentry);
209 208
210 if (!task) 209 if (!task)
@@ -216,77 +215,61 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
216out: 215out:
217 put_task_struct(task); 216 put_task_struct(task);
218out_no_task: 217out_no_task:
219 return result; 218 return ERR_PTR(result);
220} 219}
221 220
222static int proc_readfd_common(struct file * filp, void * dirent, 221static int proc_readfd_common(struct file *file, struct dir_context *ctx,
223 filldir_t filldir, instantiate_t instantiate) 222 instantiate_t instantiate)
224{ 223{
225 struct dentry *dentry = filp->f_path.dentry; 224 struct task_struct *p = get_proc_task(file_inode(file));
226 struct inode *inode = dentry->d_inode;
227 struct task_struct *p = get_proc_task(inode);
228 struct files_struct *files; 225 struct files_struct *files;
229 unsigned int fd, ino; 226 unsigned int fd;
230 int retval;
231 227
232 retval = -ENOENT;
233 if (!p) 228 if (!p)
234 goto out_no_task; 229 return -ENOENT;
235 retval = 0;
236
237 fd = filp->f_pos;
238 switch (fd) {
239 case 0:
240 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
241 goto out;
242 filp->f_pos++;
243 case 1:
244 ino = parent_ino(dentry);
245 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
246 goto out;
247 filp->f_pos++;
248 default:
249 files = get_files_struct(p);
250 if (!files)
251 goto out;
252 rcu_read_lock();
253 for (fd = filp->f_pos - 2;
254 fd < files_fdtable(files)->max_fds;
255 fd++, filp->f_pos++) {
256 char name[PROC_NUMBUF];
257 int len;
258 int rv;
259
260 if (!fcheck_files(files, fd))
261 continue;
262 rcu_read_unlock();
263 230
264 len = snprintf(name, sizeof(name), "%d", fd); 231 if (!dir_emit_dots(file, ctx))
265 rv = proc_fill_cache(filp, dirent, filldir, 232 goto out;
266 name, len, instantiate, p, 233 if (!dir_emit_dots(file, ctx))
267 (void *)(unsigned long)fd); 234 goto out;
268 if (rv < 0) 235 files = get_files_struct(p);
269 goto out_fd_loop; 236 if (!files)
270 rcu_read_lock(); 237 goto out;
271 } 238
272 rcu_read_unlock(); 239 rcu_read_lock();
273out_fd_loop: 240 for (fd = ctx->pos - 2;
274 put_files_struct(files); 241 fd < files_fdtable(files)->max_fds;
242 fd++, ctx->pos++) {
243 char name[PROC_NUMBUF];
244 int len;
245
246 if (!fcheck_files(files, fd))
247 continue;
248 rcu_read_unlock();
249
250 len = snprintf(name, sizeof(name), "%d", fd);
251 if (!proc_fill_cache(file, ctx,
252 name, len, instantiate, p,
253 (void *)(unsigned long)fd))
254 goto out_fd_loop;
255 rcu_read_lock();
275 } 256 }
257 rcu_read_unlock();
258out_fd_loop:
259 put_files_struct(files);
276out: 260out:
277 put_task_struct(p); 261 put_task_struct(p);
278out_no_task: 262 return 0;
279 return retval;
280} 263}
281 264
282static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) 265static int proc_readfd(struct file *file, struct dir_context *ctx)
283{ 266{
284 return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); 267 return proc_readfd_common(file, ctx, proc_fd_instantiate);
285} 268}
286 269
287const struct file_operations proc_fd_operations = { 270const struct file_operations proc_fd_operations = {
288 .read = generic_read_dir, 271 .read = generic_read_dir,
289 .readdir = proc_readfd, 272 .iterate = proc_readfd,
290 .llseek = default_llseek, 273 .llseek = default_llseek,
291}; 274};
292 275
@@ -316,11 +299,10 @@ const struct inode_operations proc_fd_inode_operations = {
316 .setattr = proc_setattr, 299 .setattr = proc_setattr,
317}; 300};
318 301
319static struct dentry * 302static int
320proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, 303proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
321 struct task_struct *task, const void *ptr) 304 struct task_struct *task, const void *ptr)
322{ 305{
323 struct dentry *error = ERR_PTR(-ENOENT);
324 unsigned fd = (unsigned long)ptr; 306 unsigned fd = (unsigned long)ptr;
325 struct proc_inode *ei; 307 struct proc_inode *ei;
326 struct inode *inode; 308 struct inode *inode;
@@ -340,9 +322,9 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
340 322
341 /* Close the race of the process dying before we return the dentry */ 323 /* Close the race of the process dying before we return the dentry */
342 if (tid_fd_revalidate(dentry, 0)) 324 if (tid_fd_revalidate(dentry, 0))
343 error = NULL; 325 return 0;
344 out: 326 out:
345 return error; 327 return -ENOENT;
346} 328}
347 329
348static struct dentry * 330static struct dentry *
@@ -351,9 +333,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
351 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); 333 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
352} 334}
353 335
354static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) 336static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
355{ 337{
356 return proc_readfd_common(filp, dirent, filldir, 338 return proc_readfd_common(file, ctx,
357 proc_fdinfo_instantiate); 339 proc_fdinfo_instantiate);
358} 340}
359 341
@@ -364,6 +346,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
364 346
365const struct file_operations proc_fdinfo_operations = { 347const struct file_operations proc_fdinfo_operations = {
366 .read = generic_read_dir, 348 .read = generic_read_dir,
367 .readdir = proc_readfdinfo, 349 .iterate = proc_readfdinfo,
368 .llseek = default_llseek, 350 .llseek = default_llseek,
369}; 351};
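Note: two things stand out in the fd directory conversion. First, the new proc_readfd_common() calls dir_emit_dots() twice back to back; the second call is a no-op (once ctx->pos has advanced past 1 there is nothing left for it to emit), so this reads like a harmless editing leftover rather than intent. Second, the RCU discipline around the emit survives the rewrite: the fd table is scanned under rcu_read_lock(), but proc_fill_cache() can block, so the lock is dropped around every entry that is actually emitted. Condensed from the hunk above:

        rcu_read_lock();
        for (fd = ctx->pos - 2;
             fd < files_fdtable(files)->max_fds;
             fd++, ctx->pos++) {
                char name[PROC_NUMBUF];
                int len;

                if (!fcheck_files(files, fd))   /* cheap existence check under RCU */
                        continue;
                rcu_read_unlock();              /* proc_fill_cache() may sleep */

                len = snprintf(name, sizeof(name), "%d", fd);
                if (!proc_fill_cache(file, ctx, name, len, instantiate, p,
                                     (void *)(unsigned long)fd))
                        goto out_fd_loop;       /* buffer full; lock already dropped */
                rcu_read_lock();
        }
        rcu_read_unlock();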
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a2596afffae6..94441a407337 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -233,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
233 * value of the readdir() call, as long as it's non-negative 233 * value of the readdir() call, as long as it's non-negative
234 * for success.. 234 * for success..
235 */ 235 */
236int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, 236int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
237 filldir_t filldir) 237 struct dir_context *ctx)
238{ 238{
239 unsigned int ino;
240 int i; 239 int i;
241 struct inode *inode = file_inode(filp);
242 int ret = 0;
243
244 ino = inode->i_ino;
245 i = filp->f_pos;
246 switch (i) {
247 case 0:
248 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
249 goto out;
250 i++;
251 filp->f_pos++;
252 /* fall through */
253 case 1:
254 if (filldir(dirent, "..", 2, i,
255 parent_ino(filp->f_path.dentry),
256 DT_DIR) < 0)
257 goto out;
258 i++;
259 filp->f_pos++;
260 /* fall through */
261 default:
262 spin_lock(&proc_subdir_lock);
263 de = de->subdir;
264 i -= 2;
265 for (;;) {
266 if (!de) {
267 ret = 1;
268 spin_unlock(&proc_subdir_lock);
269 goto out;
270 }
271 if (!i)
272 break;
273 de = de->next;
274 i--;
275 }
276 240
277 do { 241 if (!dir_emit_dots(file, ctx))
278 struct proc_dir_entry *next; 242 return 0;
279 243
280 /* filldir passes info to user space */ 244 spin_lock(&proc_subdir_lock);
281 pde_get(de); 245 de = de->subdir;
282 spin_unlock(&proc_subdir_lock); 246 i = ctx->pos - 2;
283 if (filldir(dirent, de->name, de->namelen, filp->f_pos, 247 for (;;) {
284 de->low_ino, de->mode >> 12) < 0) { 248 if (!de) {
285 pde_put(de);
286 goto out;
287 }
288 spin_lock(&proc_subdir_lock);
289 filp->f_pos++;
290 next = de->next;
291 pde_put(de);
292 de = next;
293 } while (de);
294 spin_unlock(&proc_subdir_lock); 249 spin_unlock(&proc_subdir_lock);
250 return 0;
251 }
252 if (!i)
253 break;
254 de = de->next;
255 i--;
295 } 256 }
296 ret = 1; 257
297out: 258 do {
298 return ret; 259 struct proc_dir_entry *next;
260 pde_get(de);
261 spin_unlock(&proc_subdir_lock);
262 if (!dir_emit(ctx, de->name, de->namelen,
263 de->low_ino, de->mode >> 12)) {
264 pde_put(de);
265 return 0;
266 }
267 spin_lock(&proc_subdir_lock);
268 ctx->pos++;
269 next = de->next;
270 pde_put(de);
271 de = next;
272 } while (de);
273 spin_unlock(&proc_subdir_lock);
274 return 0;
299} 275}
300 276
301int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) 277int proc_readdir(struct file *file, struct dir_context *ctx)
302{ 278{
303 struct inode *inode = file_inode(filp); 279 struct inode *inode = file_inode(file);
304 280
305 return proc_readdir_de(PDE(inode), filp, dirent, filldir); 281 return proc_readdir_de(PDE(inode), file, ctx);
306} 282}
307 283
308/* 284/*
@@ -313,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
313static const struct file_operations proc_dir_operations = { 289static const struct file_operations proc_dir_operations = {
314 .llseek = generic_file_llseek, 290 .llseek = generic_file_llseek,
315 .read = generic_read_dir, 291 .read = generic_read_dir,
316 .readdir = proc_readdir, 292 .iterate = proc_readdir,
317}; 293};
318 294
319/* 295/*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d600fb098b6a..651d09a11dde 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -165,14 +165,14 @@ extern int proc_setattr(struct dentry *, struct iattr *);
165extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); 165extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
166extern int pid_revalidate(struct dentry *, unsigned int); 166extern int pid_revalidate(struct dentry *, unsigned int);
167extern int pid_delete_dentry(const struct dentry *); 167extern int pid_delete_dentry(const struct dentry *);
168extern int proc_pid_readdir(struct file *, void *, filldir_t); 168extern int proc_pid_readdir(struct file *, struct dir_context *);
169extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); 169extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
170extern loff_t mem_lseek(struct file *, loff_t, int); 170extern loff_t mem_lseek(struct file *, loff_t, int);
171 171
172/* Lookups */ 172/* Lookups */
173typedef struct dentry *instantiate_t(struct inode *, struct dentry *, 173typedef int instantiate_t(struct inode *, struct dentry *,
174 struct task_struct *, const void *); 174 struct task_struct *, const void *);
175extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int, 175extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
176 instantiate_t, struct task_struct *, const void *); 176 instantiate_t, struct task_struct *, const void *);
177 177
178/* 178/*
@@ -183,8 +183,8 @@ extern spinlock_t proc_subdir_lock;
183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); 183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, 184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
185 struct dentry *); 185 struct dentry *);
186extern int proc_readdir(struct file *, void *, filldir_t); 186extern int proc_readdir(struct file *, struct dir_context *);
187extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t); 187extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
188 188
189static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) 189static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
190{ 190{
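Note: the header captures the two interface changes the rest of this patch fans out from. The instantiate callback stops returning a dentry:

        /* before */
        typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
                                             struct task_struct *, const void *);
        /* after */
        typedef int instantiate_t(struct inode *, struct dentry *,
                                  struct task_struct *, const void *);

and proc_fill_cache() turns into a bool. The lookup paths keep their dentry-based external interface by wrapping the int result with ERR_PTR() at a single return point, as in proc_pid_lookup() and proc_lookupfd_common() above.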
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bd4b5a740ff1..bdfabdaefdce 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait;
21 21
22static int kmsg_open(struct inode * inode, struct file * file) 22static int kmsg_open(struct inode * inode, struct file * file)
23{ 23{
24 return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); 24 return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
25} 25}
26 26
27static int kmsg_release(struct inode * inode, struct file * file) 27static int kmsg_release(struct inode * inode, struct file * file)
28{ 28{
29 (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); 29 (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC);
30 return 0; 30 return 0;
31} 31}
32 32
@@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
34 size_t count, loff_t *ppos) 34 size_t count, loff_t *ppos)
35{ 35{
36 if ((file->f_flags & O_NONBLOCK) && 36 if ((file->f_flags & O_NONBLOCK) &&
37 !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) 37 !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
38 return -EAGAIN; 38 return -EAGAIN;
39 return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); 39 return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC);
40} 40}
41 41
42static unsigned int kmsg_poll(struct file *file, poll_table *wait) 42static unsigned int kmsg_poll(struct file *file, poll_table *wait)
43{ 43{
44 poll_wait(file, &log_wait, wait); 44 poll_wait(file, &log_wait, wait);
45 if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) 45 if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
46 return POLLIN | POLLRDNORM; 46 return POLLIN | POLLRDNORM;
47 return 0; 47 return 0;
48} 48}
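Note: the kmsg.c hunks are a mechanical rename riding along in the same window: SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC, presumably to make explicit that this caller is /proc/kmsg rather than a generic file. No behavior changes in this file.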
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 54bdc6701e9f..49a7fff2e83a 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -187,13 +187,12 @@ static const struct inode_operations proc_ns_link_inode_operations = {
187 .setattr = proc_setattr, 187 .setattr = proc_setattr,
188}; 188};
189 189
190static struct dentry *proc_ns_instantiate(struct inode *dir, 190static int proc_ns_instantiate(struct inode *dir,
191 struct dentry *dentry, struct task_struct *task, const void *ptr) 191 struct dentry *dentry, struct task_struct *task, const void *ptr)
192{ 192{
193 const struct proc_ns_operations *ns_ops = ptr; 193 const struct proc_ns_operations *ns_ops = ptr;
194 struct inode *inode; 194 struct inode *inode;
195 struct proc_inode *ei; 195 struct proc_inode *ei;
196 struct dentry *error = ERR_PTR(-ENOENT);
197 196
198 inode = proc_pid_make_inode(dir->i_sb, task); 197 inode = proc_pid_make_inode(dir->i_sb, task);
199 if (!inode) 198 if (!inode)
@@ -208,90 +207,52 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
208 d_add(dentry, inode); 207 d_add(dentry, inode);
209 /* Close the race of the process dying before we return the dentry */ 208 /* Close the race of the process dying before we return the dentry */
210 if (pid_revalidate(dentry, 0)) 209 if (pid_revalidate(dentry, 0))
211 error = NULL; 210 return 0;
212out: 211out:
213 return error; 212 return -ENOENT;
214}
215
216static int proc_ns_fill_cache(struct file *filp, void *dirent,
217 filldir_t filldir, struct task_struct *task,
218 const struct proc_ns_operations *ops)
219{
220 return proc_fill_cache(filp, dirent, filldir,
221 ops->name, strlen(ops->name),
222 proc_ns_instantiate, task, ops);
223} 213}
224 214
225static int proc_ns_dir_readdir(struct file *filp, void *dirent, 215static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
226 filldir_t filldir)
227{ 216{
228 int i; 217 struct task_struct *task = get_proc_task(file_inode(file));
229 struct dentry *dentry = filp->f_path.dentry;
230 struct inode *inode = dentry->d_inode;
231 struct task_struct *task = get_proc_task(inode);
232 const struct proc_ns_operations **entry, **last; 218 const struct proc_ns_operations **entry, **last;
233 ino_t ino;
234 int ret;
235 219
236 ret = -ENOENT;
237 if (!task) 220 if (!task)
238 goto out_no_task; 221 return -ENOENT;
239 222
240 ret = 0; 223 if (!dir_emit_dots(file, ctx))
241 i = filp->f_pos; 224 goto out;
242 switch (i) { 225 if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
243 case 0: 226 goto out;
244 ino = inode->i_ino; 227 entry = ns_entries + (ctx->pos - 2);
245 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 228 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
246 goto out; 229 while (entry <= last) {
247 i++; 230 const struct proc_ns_operations *ops = *entry;
248 filp->f_pos++; 231 if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
249 /* fall through */ 232 proc_ns_instantiate, task, ops))
250 case 1: 233 break;
251 ino = parent_ino(dentry); 234 ctx->pos++;
252 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 235 entry++;
253 goto out;
254 i++;
255 filp->f_pos++;
256 /* fall through */
257 default:
258 i -= 2;
259 if (i >= ARRAY_SIZE(ns_entries)) {
260 ret = 1;
261 goto out;
262 }
263 entry = ns_entries + i;
264 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
265 while (entry <= last) {
266 if (proc_ns_fill_cache(filp, dirent, filldir,
267 task, *entry) < 0)
268 goto out;
269 filp->f_pos++;
270 entry++;
271 }
272 } 236 }
273
274 ret = 1;
275out: 237out:
276 put_task_struct(task); 238 put_task_struct(task);
277out_no_task: 239 return 0;
278 return ret;
279} 240}
280 241
281const struct file_operations proc_ns_dir_operations = { 242const struct file_operations proc_ns_dir_operations = {
282 .read = generic_read_dir, 243 .read = generic_read_dir,
283 .readdir = proc_ns_dir_readdir, 244 .iterate = proc_ns_dir_readdir,
284}; 245};
285 246
286static struct dentry *proc_ns_dir_lookup(struct inode *dir, 247static struct dentry *proc_ns_dir_lookup(struct inode *dir,
287 struct dentry *dentry, unsigned int flags) 248 struct dentry *dentry, unsigned int flags)
288{ 249{
289 struct dentry *error; 250 int error;
290 struct task_struct *task = get_proc_task(dir); 251 struct task_struct *task = get_proc_task(dir);
291 const struct proc_ns_operations **entry, **last; 252 const struct proc_ns_operations **entry, **last;
292 unsigned int len = dentry->d_name.len; 253 unsigned int len = dentry->d_name.len;
293 254
294 error = ERR_PTR(-ENOENT); 255 error = -ENOENT;
295 256
296 if (!task) 257 if (!task)
297 goto out_no_task; 258 goto out_no_task;
@@ -310,7 +271,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
310out: 271out:
311 put_task_struct(task); 272 put_task_struct(task);
312out_no_task: 273out_no_task:
313 return error; 274 return ERR_PTR(error);
314} 275}
315 276
316const struct inode_operations proc_ns_dir_inode_operations = { 277const struct inode_operations proc_ns_dir_inode_operations = {
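Note: the ns directory conversion is structurally identical to proc_pident_readdir() above: drop the proc_ns_fill_cache() wrapper, emit the dots, bounds-check ctx->pos against ARRAY_SIZE(ns_entries) + 2, and walk the entry table with proc_fill_cache(), advancing ctx->pos only on success.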
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 986e83220d56..4677bb7dc7c2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -160,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = {
160 .getattr = proc_tgid_net_getattr, 160 .getattr = proc_tgid_net_getattr,
161}; 161};
162 162
163static int proc_tgid_net_readdir(struct file *filp, void *dirent, 163static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
164 filldir_t filldir)
165{ 164{
166 int ret; 165 int ret;
167 struct net *net; 166 struct net *net;
168 167
169 ret = -EINVAL; 168 ret = -EINVAL;
170 net = get_proc_task_net(file_inode(filp)); 169 net = get_proc_task_net(file_inode(file));
171 if (net != NULL) { 170 if (net != NULL) {
172 ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); 171 ret = proc_readdir_de(net->proc_net, file, ctx);
173 put_net(net); 172 put_net(net);
174 } 173 }
175 return ret; 174 return ret;
@@ -178,7 +177,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
178const struct file_operations proc_net_operations = { 177const struct file_operations proc_net_operations = {
179 .llseek = generic_file_llseek, 178 .llseek = generic_file_llseek,
180 .read = generic_read_dir, 179 .read = generic_read_dir,
181 .readdir = proc_tgid_net_readdir, 180 .iterate = proc_tgid_net_readdir,
182}; 181};
183 182
184static __net_init int proc_net_ns_init(struct net *net) 183static __net_init int proc_net_ns_init(struct net *net)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ac05f33a0dde..71290463a1d3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -573,12 +573,12 @@ out:
573 return ret; 573 return ret;
574} 574}
575 575
576static int proc_sys_fill_cache(struct file *filp, void *dirent, 576static bool proc_sys_fill_cache(struct file *file,
577 filldir_t filldir, 577 struct dir_context *ctx,
578 struct ctl_table_header *head, 578 struct ctl_table_header *head,
579 struct ctl_table *table) 579 struct ctl_table *table)
580{ 580{
581 struct dentry *child, *dir = filp->f_path.dentry; 581 struct dentry *child, *dir = file->f_path.dentry;
582 struct inode *inode; 582 struct inode *inode;
583 struct qstr qname; 583 struct qstr qname;
584 ino_t ino = 0; 584 ino_t ino = 0;
@@ -595,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
595 inode = proc_sys_make_inode(dir->d_sb, head, table); 595 inode = proc_sys_make_inode(dir->d_sb, head, table);
596 if (!inode) { 596 if (!inode) {
597 dput(child); 597 dput(child);
598 return -ENOMEM; 598 return false;
599 } else { 599 } else {
600 d_set_d_op(child, &proc_sys_dentry_operations); 600 d_set_d_op(child, &proc_sys_dentry_operations);
601 d_add(child, inode); 601 d_add(child, inode);
602 } 602 }
603 } else { 603 } else {
604 return -ENOMEM; 604 return false;
605 } 605 }
606 } 606 }
607 inode = child->d_inode; 607 inode = child->d_inode;
608 ino = inode->i_ino; 608 ino = inode->i_ino;
609 type = inode->i_mode >> 12; 609 type = inode->i_mode >> 12;
610 dput(child); 610 dput(child);
611 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); 611 return dir_emit(ctx, qname.name, qname.len, ino, type);
612} 612}
613 613
614static int proc_sys_link_fill_cache(struct file *filp, void *dirent, 614static bool proc_sys_link_fill_cache(struct file *file,
615 filldir_t filldir, 615 struct dir_context *ctx,
616 struct ctl_table_header *head, 616 struct ctl_table_header *head,
617 struct ctl_table *table) 617 struct ctl_table *table)
618{ 618{
619 int err, ret = 0; 619 bool ret = true;
620 head = sysctl_head_grab(head); 620 head = sysctl_head_grab(head);
621 621
622 if (S_ISLNK(table->mode)) { 622 if (S_ISLNK(table->mode)) {
 623 /* It is not an error if we can not follow the link; ignore it */ 623 /* It is not an error if we can not follow the link; ignore it */
624 err = sysctl_follow_link(&head, &table, current->nsproxy); 624 int err = sysctl_follow_link(&head, &table, current->nsproxy);
625 if (err) 625 if (err)
626 goto out; 626 goto out;
627 } 627 }
628 628
629 ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); 629 ret = proc_sys_fill_cache(file, ctx, head, table);
630out: 630out:
631 sysctl_head_finish(head); 631 sysctl_head_finish(head);
632 return ret; 632 return ret;
@@ -634,67 +634,50 @@ out:
634 634
635static int scan(struct ctl_table_header *head, ctl_table *table, 635static int scan(struct ctl_table_header *head, ctl_table *table,
636 unsigned long *pos, struct file *file, 636 unsigned long *pos, struct file *file,
637 void *dirent, filldir_t filldir) 637 struct dir_context *ctx)
638{ 638{
639 int res; 639 bool res;
640 640
641 if ((*pos)++ < file->f_pos) 641 if ((*pos)++ < ctx->pos)
642 return 0; 642 return true;
643 643
644 if (unlikely(S_ISLNK(table->mode))) 644 if (unlikely(S_ISLNK(table->mode)))
645 res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); 645 res = proc_sys_link_fill_cache(file, ctx, head, table);
646 else 646 else
647 res = proc_sys_fill_cache(file, dirent, filldir, head, table); 647 res = proc_sys_fill_cache(file, ctx, head, table);
648 648
649 if (res == 0) 649 if (res)
650 file->f_pos = *pos; 650 ctx->pos = *pos;
651 651
652 return res; 652 return res;
653} 653}
654 654
655static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) 655static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
656{ 656{
657 struct dentry *dentry = filp->f_path.dentry; 657 struct ctl_table_header *head = grab_header(file_inode(file));
658 struct inode *inode = dentry->d_inode;
659 struct ctl_table_header *head = grab_header(inode);
660 struct ctl_table_header *h = NULL; 658 struct ctl_table_header *h = NULL;
661 struct ctl_table *entry; 659 struct ctl_table *entry;
662 struct ctl_dir *ctl_dir; 660 struct ctl_dir *ctl_dir;
663 unsigned long pos; 661 unsigned long pos;
664 int ret = -EINVAL;
665 662
666 if (IS_ERR(head)) 663 if (IS_ERR(head))
667 return PTR_ERR(head); 664 return PTR_ERR(head);
668 665
669 ctl_dir = container_of(head, struct ctl_dir, header); 666 ctl_dir = container_of(head, struct ctl_dir, header);
670 667
671 ret = 0; 668 if (!dir_emit_dots(file, ctx))
672 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ 669 return 0;
673 if (filp->f_pos == 0) { 670
674 if (filldir(dirent, ".", 1, filp->f_pos,
675 inode->i_ino, DT_DIR) < 0)
676 goto out;
677 filp->f_pos++;
678 }
679 if (filp->f_pos == 1) {
680 if (filldir(dirent, "..", 2, filp->f_pos,
681 parent_ino(dentry), DT_DIR) < 0)
682 goto out;
683 filp->f_pos++;
684 }
685 pos = 2; 671 pos = 2;
686 672
687 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { 673 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
688 ret = scan(h, entry, &pos, filp, dirent, filldir); 674 if (!scan(h, entry, &pos, file, ctx)) {
689 if (ret) {
690 sysctl_head_finish(h); 675 sysctl_head_finish(h);
691 break; 676 break;
692 } 677 }
693 } 678 }
694 ret = 1;
695out:
696 sysctl_head_finish(head); 679 sysctl_head_finish(head);
697 return ret; 680 return 0;
698} 681}
699 682
700static int proc_sys_permission(struct inode *inode, int mask) 683static int proc_sys_permission(struct inode *inode, int mask)
@@ -769,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = {
769 752
770static const struct file_operations proc_sys_dir_file_operations = { 753static const struct file_operations proc_sys_dir_file_operations = {
771 .read = generic_read_dir, 754 .read = generic_read_dir,
772 .readdir = proc_sys_readdir, 755 .iterate = proc_sys_readdir,
773 .llseek = generic_file_llseek, 756 .llseek = generic_file_llseek,
774}; 757};
775 758
@@ -813,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p)
813 return res; 796 return res;
814} 797}
815 798
816static int proc_sys_compare(const struct dentry *parent, 799static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
817 const struct inode *pinode,
818 const struct dentry *dentry, const struct inode *inode,
819 unsigned int len, const char *str, const struct qstr *name) 800 unsigned int len, const char *str, const struct qstr *name)
820{ 801{
821 struct ctl_table_header *head; 802 struct ctl_table_header *head;
803 struct inode *inode;
804
822 /* Although proc doesn't have negative dentries, rcu-walk means 805 /* Although proc doesn't have negative dentries, rcu-walk means
823 * that inode here can be NULL */ 806 * that inode here can be NULL */
824 /* AV: can it, indeed? */ 807 /* AV: can it, indeed? */
808 inode = ACCESS_ONCE(dentry->d_inode);
825 if (!inode) 809 if (!inode)
826 return 1; 810 return 1;
827 if (name->len != len) 811 if (name->len != len)
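In proc_sysctl.c the two open-coded filldir calls for "." and ".." are replaced by a single dir_emit_dots(), and the fill-cache helpers now return bool (true = keep going) instead of overloading an int error. A simplified sketch of the logic dir_emit_dots() performs, assuming the generic helper introduced by this series:

/* Simplified sketch of dir_emit_dots(): emit "." and ".." exactly once,
 * tracking progress in ctx->pos so a short buffer can resume correctly. */
static inline bool emit_dots_sketch(struct file *file, struct dir_context *ctx)
{
	if (ctx->pos == 0) {
		if (!dir_emit(ctx, ".", 1, file_inode(file)->i_ino, DT_DIR))
			return false;
		ctx->pos = 1;
	}
	if (ctx->pos == 1) {
		if (!dir_emit(ctx, "..", 2,
			      parent_ino(file->f_path.dentry), DT_DIR))
			return false;
		ctx->pos = 2;
	}
	return true;
}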
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 41a6ea93f486..229e366598da 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -202,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
202 return proc_pid_lookup(dir, dentry, flags); 202 return proc_pid_lookup(dir, dentry, flags);
203} 203}
204 204
205static int proc_root_readdir(struct file * filp, 205static int proc_root_readdir(struct file *file, struct dir_context *ctx)
206 void * dirent, filldir_t filldir)
207{ 206{
208 unsigned int nr = filp->f_pos; 207 if (ctx->pos < FIRST_PROCESS_ENTRY) {
209 int ret; 208 proc_readdir(file, ctx);
210 209 ctx->pos = FIRST_PROCESS_ENTRY;
211 if (nr < FIRST_PROCESS_ENTRY) {
212 int error = proc_readdir(filp, dirent, filldir);
213 if (error <= 0)
214 return error;
215 filp->f_pos = FIRST_PROCESS_ENTRY;
216 } 210 }
217 211
218 ret = proc_pid_readdir(filp, dirent, filldir); 212 return proc_pid_readdir(file, ctx);
219 return ret;
220} 213}
221 214
222/* 215/*
@@ -226,7 +219,7 @@ static int proc_root_readdir(struct file * filp,
226 */ 219 */
227static const struct file_operations proc_root_operations = { 220static const struct file_operations proc_root_operations = {
228 .read = generic_read_dir, 221 .read = generic_read_dir,
229 .readdir = proc_root_readdir, 222 .iterate = proc_root_readdir,
230 .llseek = default_llseek, 223 .llseek = default_llseek,
231}; 224};
232 225
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index e4bcb2cf055a..bfd95bf38005 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
178 if (p->psi->erase) 178 if (p->psi->erase)
179 p->psi->erase(p->type, p->id, p->count, 179 p->psi->erase(p->type, p->id, p->count,
180 dentry->d_inode->i_ctime, p->psi); 180 dentry->d_inode->i_ctime, p->psi);
181 else
182 return -EPERM;
181 183
182 return simple_unlink(dir, dentry); 184 return simple_unlink(dir, dentry);
183} 185}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 86d1038b5a12..b7ffe2bcd9c4 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -239,17 +239,15 @@ int pstore_register(struct pstore_info *psi)
239{ 239{
240 struct module *owner = psi->owner; 240 struct module *owner = psi->owner;
241 241
242 if (backend && strcmp(backend, psi->name))
243 return -EPERM;
244
242 spin_lock(&pstore_lock); 245 spin_lock(&pstore_lock);
243 if (psinfo) { 246 if (psinfo) {
244 spin_unlock(&pstore_lock); 247 spin_unlock(&pstore_lock);
245 return -EBUSY; 248 return -EBUSY;
246 } 249 }
247 250
248 if (backend && strcmp(backend, psi->name)) {
249 spin_unlock(&pstore_lock);
250 return -EINVAL;
251 }
252
253 if (!psi->write) 251 if (!psi->write)
254 psi->write = pstore_write_compat; 252 psi->write = pstore_write_compat;
255 psinfo = psi; 253 psinfo = psi;
@@ -274,6 +272,9 @@ int pstore_register(struct pstore_info *psi)
274 add_timer(&pstore_timer); 272 add_timer(&pstore_timer);
275 } 273 }
276 274
275 pr_info("pstore: Registered %s as persistent store backend\n",
276 psi->name);
277
277 return 0; 278 return 0;
278} 279}
279EXPORT_SYMBOL_GPL(pstore_register); 280EXPORT_SYMBOL_GPL(pstore_register);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1376e5a8f0d6..43abee2c6cb9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -399,8 +399,6 @@ static int ramoops_probe(struct platform_device *pdev)
399 goto fail_out; 399 goto fail_out;
400 } 400 }
401 401
402 if (!is_power_of_2(pdata->mem_size))
403 pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
404 if (!is_power_of_2(pdata->record_size)) 402 if (!is_power_of_2(pdata->record_size))
405 pdata->record_size = rounddown_pow_of_two(pdata->record_size); 403 pdata->record_size = rounddown_pow_of_two(pdata->record_size);
406 if (!is_power_of_2(pdata->console_size)) 404 if (!is_power_of_2(pdata->console_size))
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 59337326e288..de272d426763 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -46,7 +46,7 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
46} 46}
47 47
48/* increase and wrap the start pointer, returning the old value */ 48/* increase and wrap the start pointer, returning the old value */
49static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) 49static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
50{ 50{
51 int old; 51 int old;
52 int new; 52 int new;
@@ -62,7 +62,7 @@ static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
62} 62}
63 63
64/* increase the size counter until it hits the max size */ 64/* increase the size counter until it hits the max size */
65static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a) 65static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
66{ 66{
67 size_t old; 67 size_t old;
68 size_t new; 68 size_t new;
@@ -78,6 +78,53 @@ static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
78 } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); 78 } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
79} 79}
80 80
81static DEFINE_RAW_SPINLOCK(buffer_lock);
82
83/* increase and wrap the start pointer, returning the old value */
84static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
85{
86 int old;
87 int new;
88 unsigned long flags;
89
90 raw_spin_lock_irqsave(&buffer_lock, flags);
91
92 old = atomic_read(&prz->buffer->start);
93 new = old + a;
94 while (unlikely(new > prz->buffer_size))
95 new -= prz->buffer_size;
96 atomic_set(&prz->buffer->start, new);
97
98 raw_spin_unlock_irqrestore(&buffer_lock, flags);
99
100 return old;
101}
102
103/* increase the size counter until it hits the max size */
104static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
105{
106 size_t old;
107 size_t new;
108 unsigned long flags;
109
110 raw_spin_lock_irqsave(&buffer_lock, flags);
111
112 old = atomic_read(&prz->buffer->size);
113 if (old == prz->buffer_size)
114 goto exit;
115
116 new = old + a;
117 if (new > prz->buffer_size)
118 new = prz->buffer_size;
119 atomic_set(&prz->buffer->size, new);
120
121exit:
122 raw_spin_unlock_irqrestore(&buffer_lock, flags);
123}
124
125static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
126static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
127
81static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, 128static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
82 uint8_t *data, size_t len, uint8_t *ecc) 129 uint8_t *data, size_t len, uint8_t *ecc)
83{ 130{
@@ -372,6 +419,9 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
372 return NULL; 419 return NULL;
373 } 420 }
374 421
422 buffer_start_add = buffer_start_add_locked;
423 buffer_size_add = buffer_size_add_locked;
424
375 return ioremap(start, size); 425 return ioremap(start, size);
376} 426}
377 427
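The ram_core.c change keeps the lock-free cmpxchg accessors for normal RAM but swaps in raw-spinlock variants once the zone is mapped with ioremap(), where atomic cmpxchg is not reliable on every platform. The switch happens once, through module-level function pointers, so the write path never branches on the backing type. A condensed sketch of that dispatch pattern (struct zone and all names here are stand-ins, not the pstore code):

#include <stddef.h>

struct zone { size_t start, size; };	/* stands in for persistent_ram_zone */

static size_t start_add_atomic(struct zone *z, size_t a)
{
	size_t old = z->start;		/* real code: atomic cmpxchg loop */
	z->start = (old + a) % z->size;
	return old;
}

static size_t start_add_locked(struct zone *z, size_t a)
{
	size_t old = z->start;		/* real code: raw_spin_lock_irqsave() */
	z->start = (old + a) % z->size;
	return old;
}

/* Default to the lock-free variant; iomap setup retargets it once. */
static size_t (*start_add)(struct zone *, size_t) = start_add_atomic;

static void zone_switch_to_iomem(void)
{
	start_add = start_add_locked;	/* ioremapped RAM: avoid cmpxchg */
}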
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 28ce014b3cef..b218f965817b 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -14,9 +14,9 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include "qnx4.h" 15#include "qnx4.h"
16 16
17static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) 17static int qnx4_readdir(struct file *file, struct dir_context *ctx)
18{ 18{
19 struct inode *inode = file_inode(filp); 19 struct inode *inode = file_inode(file);
20 unsigned int offset; 20 unsigned int offset;
21 struct buffer_head *bh; 21 struct buffer_head *bh;
22 struct qnx4_inode_entry *de; 22 struct qnx4_inode_entry *de;
@@ -26,48 +26,44 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
26 int size; 26 int size;
27 27
28 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); 28 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
29 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); 29 QNX4DEBUG((KERN_INFO "pos = %ld\n", (long) ctx->pos));
30 30
31 while (filp->f_pos < inode->i_size) { 31 while (ctx->pos < inode->i_size) {
32 blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); 32 blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS);
33 bh = sb_bread(inode->i_sb, blknum); 33 bh = sb_bread(inode->i_sb, blknum);
34 if(bh==NULL) { 34 if (bh == NULL) {
35 printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); 35 printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum);
36 break; 36 return 0;
37 } 37 }
38 ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; 38 ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
39 while (ix < QNX4_INODES_PER_BLOCK) { 39 for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
40 offset = ix * QNX4_DIR_ENTRY_SIZE; 40 offset = ix * QNX4_DIR_ENTRY_SIZE;
41 de = (struct qnx4_inode_entry *) (bh->b_data + offset); 41 de = (struct qnx4_inode_entry *) (bh->b_data + offset);
42 size = strlen(de->di_fname); 42 if (!de->di_fname[0])
43 if (size) { 43 continue;
44 if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX ) 44 if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
45 size = QNX4_SHORT_NAME_MAX; 45 continue;
46 else if ( size > QNX4_NAME_MAX ) 46 if (!(de->di_status & QNX4_FILE_LINK))
47 size = QNX4_NAME_MAX; 47 size = QNX4_SHORT_NAME_MAX;
48 48 else
49 if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { 49 size = QNX4_NAME_MAX;
50 QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); 50 size = strnlen(de->di_fname, size);
51 if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) 51 QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
52 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; 52 if (!(de->di_status & QNX4_FILE_LINK))
53 else { 53 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
54 le = (struct qnx4_link_info*)de; 54 else {
55 ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * 55 le = (struct qnx4_link_info*)de;
56 QNX4_INODES_PER_BLOCK + 56 ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
57 le->dl_inode_ndx; 57 QNX4_INODES_PER_BLOCK +
58 } 58 le->dl_inode_ndx;
59 if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) { 59 }
60 brelse(bh); 60 if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
61 goto out; 61 brelse(bh);
62 } 62 return 0;
63 }
64 } 63 }
65 ix++;
66 filp->f_pos += QNX4_DIR_ENTRY_SIZE;
67 } 64 }
68 brelse(bh); 65 brelse(bh);
69 } 66 }
70out:
71 return 0; 67 return 0;
72} 68}
73 69
@@ -75,7 +71,7 @@ const struct file_operations qnx4_dir_operations =
75{ 71{
76 .llseek = generic_file_llseek, 72 .llseek = generic_file_llseek,
77 .read = generic_read_dir, 73 .read = generic_read_dir,
78 .readdir = qnx4_readdir, 74 .iterate = qnx4_readdir,
79 .fsync = generic_file_fsync, 75 .fsync = generic_file_fsync,
80}; 76};
81 77
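Beyond the iterate conversion, the qnx4 rewrite stops trusting the on-disk name field: it picks the format's maximum width first and only then measures with strnlen(), so an unterminated directory entry can no longer send strlen() past the record. The hardened sequence, in sketch form:

/* Sketch of the hardened name handling above: clamp to the on-disk
 * field width first, then measure, so the read never overruns. */
static int qnx4_name_len_sketch(const struct qnx4_inode_entry *de)
{
	int max = (de->di_status & QNX4_FILE_LINK)
			? QNX4_NAME_MAX : QNX4_SHORT_NAME_MAX;
	return strnlen(de->di_fname, max);
}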
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 8798d065e400..15b7d92ed60d 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -65,8 +65,8 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
65 65
66static int qnx6_dir_longfilename(struct inode *inode, 66static int qnx6_dir_longfilename(struct inode *inode,
67 struct qnx6_long_dir_entry *de, 67 struct qnx6_long_dir_entry *de,
68 void *dirent, loff_t pos, 68 struct dir_context *ctx,
69 unsigned de_inode, filldir_t filldir) 69 unsigned de_inode)
70{ 70{
71 struct qnx6_long_filename *lf; 71 struct qnx6_long_filename *lf;
72 struct super_block *s = inode->i_sb; 72 struct super_block *s = inode->i_sb;
@@ -104,8 +104,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
104 104
105 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", 105 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n",
106 lf_size, lf->lf_fname, de_inode)); 106 lf_size, lf->lf_fname, de_inode));
107 if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode, 107 if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
108 DT_UNKNOWN) < 0) {
109 qnx6_put_page(page); 108 qnx6_put_page(page);
110 return 0; 109 return 0;
111 } 110 }
@@ -115,18 +114,19 @@ static int qnx6_dir_longfilename(struct inode *inode,
115 return 1; 114 return 1;
116} 115}
117 116
118static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) 117static int qnx6_readdir(struct file *file, struct dir_context *ctx)
119{ 118{
120 struct inode *inode = file_inode(filp); 119 struct inode *inode = file_inode(file);
121 struct super_block *s = inode->i_sb; 120 struct super_block *s = inode->i_sb;
122 struct qnx6_sb_info *sbi = QNX6_SB(s); 121 struct qnx6_sb_info *sbi = QNX6_SB(s);
123 loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1); 122 loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
124 unsigned long npages = dir_pages(inode); 123 unsigned long npages = dir_pages(inode);
125 unsigned long n = pos >> PAGE_CACHE_SHIFT; 124 unsigned long n = pos >> PAGE_CACHE_SHIFT;
126 unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; 125 unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
127 bool done = false; 126 bool done = false;
128 127
129 if (filp->f_pos >= inode->i_size) 128 ctx->pos = pos;
129 if (ctx->pos >= inode->i_size)
130 return 0; 130 return 0;
131 131
132 for ( ; !done && n < npages; n++, start = 0) { 132 for ( ; !done && n < npages; n++, start = 0) {
@@ -137,11 +137,11 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
137 137
138 if (IS_ERR(page)) { 138 if (IS_ERR(page)) {
139 printk(KERN_ERR "qnx6_readdir: read failed\n"); 139 printk(KERN_ERR "qnx6_readdir: read failed\n");
140 filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT; 140 ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
141 return PTR_ERR(page); 141 return PTR_ERR(page);
142 } 142 }
143 de = ((struct qnx6_dir_entry *)page_address(page)) + start; 143 de = ((struct qnx6_dir_entry *)page_address(page)) + start;
144 for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) { 144 for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
145 int size = de->de_size; 145 int size = de->de_size;
146 u32 no_inode = fs32_to_cpu(sbi, de->de_inode); 146 u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
147 147
@@ -154,8 +154,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
154 structure / block */ 154 structure / block */
155 if (!qnx6_dir_longfilename(inode, 155 if (!qnx6_dir_longfilename(inode,
156 (struct qnx6_long_dir_entry *)de, 156 (struct qnx6_long_dir_entry *)de,
157 dirent, pos, no_inode, 157 ctx, no_inode)) {
158 filldir)) {
159 done = true; 158 done = true;
160 break; 159 break;
161 } 160 }
@@ -163,9 +162,8 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
163 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" 162 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s"
164 " inode:%u\n", size, de->de_fname, 163 " inode:%u\n", size, de->de_fname,
165 no_inode)); 164 no_inode));
166 if (filldir(dirent, de->de_fname, size, 165 if (!dir_emit(ctx, de->de_fname, size,
167 pos, no_inode, DT_UNKNOWN) 166 no_inode, DT_UNKNOWN)) {
168 < 0) {
169 done = true; 167 done = true;
170 break; 168 break;
171 } 169 }
@@ -173,7 +171,6 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
173 } 171 }
174 qnx6_put_page(page); 172 qnx6_put_page(page);
175 } 173 }
176 filp->f_pos = pos;
177 return 0; 174 return 0;
178} 175}
179 176
@@ -282,7 +279,7 @@ found:
282const struct file_operations qnx6_dir_operations = { 279const struct file_operations qnx6_dir_operations = {
283 .llseek = generic_file_llseek, 280 .llseek = generic_file_llseek,
284 .read = generic_read_dir, 281 .read = generic_read_dir,
285 .readdir = qnx6_readdir, 282 .iterate = qnx6_readdir,
286 .fsync = generic_file_fsync, 283 .fsync = generic_file_fsync,
287}; 284};
288 285
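Note the off-by-a-mask fix folded into the qnx6 conversion: the old code computed the starting position with `& (QNX6_DIR_ENTRY_SIZE - 1)`, which keeps only the low bits, while the new code uses `& ~(QNX6_DIR_ENTRY_SIZE - 1)`, rounding down to an entry boundary. A worked example, assuming the 32-byte entry size:

/* pos == 70, QNX6_DIR_ENTRY_SIZE == 32:
 *   old: 70 &  (32 - 1) ==  6   (an offset inside an entry, not a start)
 *   new: 70 & ~(32 - 1) == 64   (rounded down to an entry boundary)
 */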
diff --git a/fs/read_write.c b/fs/read_write.c
index 03430008704e..122a3846d9e1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -41,8 +41,19 @@ static inline int unsigned_offsets(struct file *file)
41 return file->f_mode & FMODE_UNSIGNED_OFFSET; 41 return file->f_mode & FMODE_UNSIGNED_OFFSET;
42} 42}
43 43
44static loff_t lseek_execute(struct file *file, struct inode *inode, 44/**
45 loff_t offset, loff_t maxsize) 45 * vfs_setpos - update the file offset for lseek
46 * @file: file structure in question
47 * @offset: file offset to seek to
48 * @maxsize: maximum file size
49 *
50 * This is a low-level filesystem helper for updating the file offset to
51 * the value specified by @offset if the given offset is valid and it is
52 * not equal to the current file offset.
53 *
54 * Return the specified offset on success and -EINVAL on invalid offset.
55 */
56loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
46{ 57{
47 if (offset < 0 && !unsigned_offsets(file)) 58 if (offset < 0 && !unsigned_offsets(file))
48 return -EINVAL; 59 return -EINVAL;
@@ -55,6 +66,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
55 } 66 }
56 return offset; 67 return offset;
57} 68}
69EXPORT_SYMBOL(vfs_setpos);
58 70
59/** 71/**
60 * generic_file_llseek_size - generic llseek implementation for regular files 72 * generic_file_llseek_size - generic llseek implementation for regular files
@@ -76,8 +88,6 @@ loff_t
76generic_file_llseek_size(struct file *file, loff_t offset, int whence, 88generic_file_llseek_size(struct file *file, loff_t offset, int whence,
77 loff_t maxsize, loff_t eof) 89 loff_t maxsize, loff_t eof)
78{ 90{
79 struct inode *inode = file->f_mapping->host;
80
81 switch (whence) { 91 switch (whence) {
82 case SEEK_END: 92 case SEEK_END:
83 offset += eof; 93 offset += eof;
@@ -97,8 +107,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
97 * like SEEK_SET. 107 * like SEEK_SET.
98 */ 108 */
99 spin_lock(&file->f_lock); 109 spin_lock(&file->f_lock);
100 offset = lseek_execute(file, inode, file->f_pos + offset, 110 offset = vfs_setpos(file, file->f_pos + offset, maxsize);
101 maxsize);
102 spin_unlock(&file->f_lock); 111 spin_unlock(&file->f_lock);
103 return offset; 112 return offset;
104 case SEEK_DATA: 113 case SEEK_DATA:
@@ -120,7 +129,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
120 break; 129 break;
121 } 130 }
122 131
123 return lseek_execute(file, inode, offset, maxsize); 132 return vfs_setpos(file, offset, maxsize);
124} 133}
125EXPORT_SYMBOL(generic_file_llseek_size); 134EXPORT_SYMBOL(generic_file_llseek_size);
126 135
@@ -145,6 +154,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145EXPORT_SYMBOL(generic_file_llseek); 154EXPORT_SYMBOL(generic_file_llseek);
146 155
147/** 156/**
157 * fixed_size_llseek - llseek implementation for fixed-sized devices
158 * @file: file structure to seek on
159 * @offset: file offset to seek to
160 * @whence: type of seek
161 * @size: size of the file
162 *
163 */
164loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
165{
166 switch (whence) {
167 case SEEK_SET: case SEEK_CUR: case SEEK_END:
168 return generic_file_llseek_size(file, offset, whence,
169 size, size);
170 default:
171 return -EINVAL;
172 }
173}
174EXPORT_SYMBOL(fixed_size_llseek);
175
176/**
148 * noop_llseek - No Operation Performed llseek implementation 177 * noop_llseek - No Operation Performed llseek implementation
149 * @file: file structure to seek on 178 * @file: file structure to seek on
150 * @offset: file offset to seek to 179 * @offset: file offset to seek to
@@ -296,7 +325,7 @@ out_putf:
296 * them to something that fits in "int" so that others 325 * them to something that fits in "int" so that others
297 * won't have to do range checks all the time. 326 * won't have to do range checks all the time.
298 */ 327 */
299int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) 328int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
300{ 329{
301 struct inode *inode; 330 struct inode *inode;
302 loff_t pos; 331 loff_t pos;
@@ -477,7 +506,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
477 if (f.file) { 506 if (f.file) {
478 loff_t pos = file_pos_read(f.file); 507 loff_t pos = file_pos_read(f.file);
479 ret = vfs_read(f.file, buf, count, &pos); 508 ret = vfs_read(f.file, buf, count, &pos);
480 file_pos_write(f.file, pos); 509 if (ret >= 0)
510 file_pos_write(f.file, pos);
481 fdput(f); 511 fdput(f);
482 } 512 }
483 return ret; 513 return ret;
@@ -492,7 +522,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
492 if (f.file) { 522 if (f.file) {
493 loff_t pos = file_pos_read(f.file); 523 loff_t pos = file_pos_read(f.file);
494 ret = vfs_write(f.file, buf, count, &pos); 524 ret = vfs_write(f.file, buf, count, &pos);
495 file_pos_write(f.file, pos); 525 if (ret >= 0)
526 file_pos_write(f.file, pos);
496 fdput(f); 527 fdput(f);
497 } 528 }
498 529
@@ -780,7 +811,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
780 if (f.file) { 811 if (f.file) {
781 loff_t pos = file_pos_read(f.file); 812 loff_t pos = file_pos_read(f.file);
782 ret = vfs_readv(f.file, vec, vlen, &pos); 813 ret = vfs_readv(f.file, vec, vlen, &pos);
783 file_pos_write(f.file, pos); 814 if (ret >= 0)
815 file_pos_write(f.file, pos);
784 fdput(f); 816 fdput(f);
785 } 817 }
786 818
@@ -799,7 +831,8 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
799 if (f.file) { 831 if (f.file) {
800 loff_t pos = file_pos_read(f.file); 832 loff_t pos = file_pos_read(f.file);
801 ret = vfs_writev(f.file, vec, vlen, &pos); 833 ret = vfs_writev(f.file, vec, vlen, &pos);
802 file_pos_write(f.file, pos); 834 if (ret >= 0)
835 file_pos_write(f.file, pos);
803 fdput(f); 836 fdput(f);
804 } 837 }
805 838
@@ -959,7 +992,8 @@ COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
959 return -EBADF; 992 return -EBADF;
960 pos = f.file->f_pos; 993 pos = f.file->f_pos;
961 ret = compat_readv(f.file, vec, vlen, &pos); 994 ret = compat_readv(f.file, vec, vlen, &pos);
962 f.file->f_pos = pos; 995 if (ret >= 0)
996 f.file->f_pos = pos;
963 fdput(f); 997 fdput(f);
964 return ret; 998 return ret;
965} 999}
@@ -1025,7 +1059,8 @@ COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
1025 return -EBADF; 1059 return -EBADF;
1026 pos = f.file->f_pos; 1060 pos = f.file->f_pos;
1027 ret = compat_writev(f.file, vec, vlen, &pos); 1061 ret = compat_writev(f.file, vec, vlen, &pos);
1028 f.file->f_pos = pos; 1062 if (ret >= 0)
1063 f.file->f_pos = pos;
1029 fdput(f); 1064 fdput(f);
1030 return ret; 1065 return ret;
1031} 1066}
@@ -1064,6 +1099,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1064 struct fd in, out; 1099 struct fd in, out;
1065 struct inode *in_inode, *out_inode; 1100 struct inode *in_inode, *out_inode;
1066 loff_t pos; 1101 loff_t pos;
1102 loff_t out_pos;
1067 ssize_t retval; 1103 ssize_t retval;
1068 int fl; 1104 int fl;
1069 1105
@@ -1077,12 +1113,14 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1077 if (!(in.file->f_mode & FMODE_READ)) 1113 if (!(in.file->f_mode & FMODE_READ))
1078 goto fput_in; 1114 goto fput_in;
1079 retval = -ESPIPE; 1115 retval = -ESPIPE;
1080 if (!ppos) 1116 if (!ppos) {
1081 ppos = &in.file->f_pos; 1117 pos = in.file->f_pos;
1082 else 1118 } else {
1119 pos = *ppos;
1083 if (!(in.file->f_mode & FMODE_PREAD)) 1120 if (!(in.file->f_mode & FMODE_PREAD))
1084 goto fput_in; 1121 goto fput_in;
1085 retval = rw_verify_area(READ, in.file, ppos, count); 1122 }
1123 retval = rw_verify_area(READ, in.file, &pos, count);
1086 if (retval < 0) 1124 if (retval < 0)
1087 goto fput_in; 1125 goto fput_in;
1088 count = retval; 1126 count = retval;
@@ -1099,7 +1137,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1099 retval = -EINVAL; 1137 retval = -EINVAL;
1100 in_inode = file_inode(in.file); 1138 in_inode = file_inode(in.file);
1101 out_inode = file_inode(out.file); 1139 out_inode = file_inode(out.file);
1102 retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); 1140 out_pos = out.file->f_pos;
1141 retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1103 if (retval < 0) 1142 if (retval < 0)
1104 goto fput_out; 1143 goto fput_out;
1105 count = retval; 1144 count = retval;
@@ -1107,7 +1146,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1107 if (!max) 1146 if (!max)
1108 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); 1147 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1109 1148
1110 pos = *ppos;
1111 if (unlikely(pos + count > max)) { 1149 if (unlikely(pos + count > max)) {
1112 retval = -EOVERFLOW; 1150 retval = -EOVERFLOW;
1113 if (pos >= max) 1151 if (pos >= max)
@@ -1126,18 +1164,25 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1126 if (in.file->f_flags & O_NONBLOCK) 1164 if (in.file->f_flags & O_NONBLOCK)
1127 fl = SPLICE_F_NONBLOCK; 1165 fl = SPLICE_F_NONBLOCK;
1128#endif 1166#endif
1129 retval = do_splice_direct(in.file, ppos, out.file, count, fl); 1167 file_start_write(out.file);
1168 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1169 file_end_write(out.file);
1130 1170
1131 if (retval > 0) { 1171 if (retval > 0) {
1132 add_rchar(current, retval); 1172 add_rchar(current, retval);
1133 add_wchar(current, retval); 1173 add_wchar(current, retval);
1134 fsnotify_access(in.file); 1174 fsnotify_access(in.file);
1135 fsnotify_modify(out.file); 1175 fsnotify_modify(out.file);
1176 out.file->f_pos = out_pos;
1177 if (ppos)
1178 *ppos = pos;
1179 else
1180 in.file->f_pos = pos;
1136 } 1181 }
1137 1182
1138 inc_syscr(current); 1183 inc_syscr(current);
1139 inc_syscw(current); 1184 inc_syscw(current);
1140 if (*ppos > max) 1185 if (pos > max)
1141 retval = -EOVERFLOW; 1186 retval = -EOVERFLOW;
1142 1187
1143fput_out: 1188fput_out:
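fixed_size_llseek() factors out the common case of a device with a known, constant size, so drivers get correct SEEK_SET/SEEK_CUR/SEEK_END clamping without reimplementing it. A hypothetical driver would wire it up like this (the names and the 4096-byte size are made up for illustration):

#define EXAMPLE_DEV_SIZE 4096	/* fixed device size, illustrative */

static loff_t example_llseek(struct file *file, loff_t offset, int whence)
{
	return fixed_size_llseek(file, offset, whence, EXAMPLE_DEV_SIZE);
}

static const struct file_operations example_fops = {
	.llseek = example_llseek,
};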
diff --git a/fs/readdir.c b/fs/readdir.c
index fee38e04fae4..93d71e574310 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,11 +20,11 @@
20 20
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23int vfs_readdir(struct file *file, filldir_t filler, void *buf) 23int iterate_dir(struct file *file, struct dir_context *ctx)
24{ 24{
25 struct inode *inode = file_inode(file); 25 struct inode *inode = file_inode(file);
26 int res = -ENOTDIR; 26 int res = -ENOTDIR;
27 if (!file->f_op || !file->f_op->readdir) 27 if (!file->f_op || !file->f_op->iterate)
28 goto out; 28 goto out;
29 29
30 res = security_file_permission(file, MAY_READ); 30 res = security_file_permission(file, MAY_READ);
@@ -37,15 +37,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
37 37
38 res = -ENOENT; 38 res = -ENOENT;
39 if (!IS_DEADDIR(inode)) { 39 if (!IS_DEADDIR(inode)) {
40 res = file->f_op->readdir(file, buf, filler); 40 ctx->pos = file->f_pos;
41 res = file->f_op->iterate(file, ctx);
42 file->f_pos = ctx->pos;
41 file_accessed(file); 43 file_accessed(file);
42 } 44 }
43 mutex_unlock(&inode->i_mutex); 45 mutex_unlock(&inode->i_mutex);
44out: 46out:
45 return res; 47 return res;
46} 48}
47 49EXPORT_SYMBOL(iterate_dir);
48EXPORT_SYMBOL(vfs_readdir);
49 50
50/* 51/*
51 * Traditional linux readdir() handling.. 52 * Traditional linux readdir() handling..
@@ -66,6 +67,7 @@ struct old_linux_dirent {
66}; 67};
67 68
68struct readdir_callback { 69struct readdir_callback {
70 struct dir_context ctx;
69 struct old_linux_dirent __user * dirent; 71 struct old_linux_dirent __user * dirent;
70 int result; 72 int result;
71}; 73};
@@ -73,7 +75,7 @@ struct readdir_callback {
73static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, 75static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
74 u64 ino, unsigned int d_type) 76 u64 ino, unsigned int d_type)
75{ 77{
76 struct readdir_callback * buf = (struct readdir_callback *) __buf; 78 struct readdir_callback *buf = (struct readdir_callback *) __buf;
77 struct old_linux_dirent __user * dirent; 79 struct old_linux_dirent __user * dirent;
78 unsigned long d_ino; 80 unsigned long d_ino;
79 81
@@ -107,15 +109,15 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
107{ 109{
108 int error; 110 int error;
109 struct fd f = fdget(fd); 111 struct fd f = fdget(fd);
110 struct readdir_callback buf; 112 struct readdir_callback buf = {
113 .ctx.actor = fillonedir,
114 .dirent = dirent
115 };
111 116
112 if (!f.file) 117 if (!f.file)
113 return -EBADF; 118 return -EBADF;
114 119
115 buf.result = 0; 120 error = iterate_dir(f.file, &buf.ctx);
116 buf.dirent = dirent;
117
118 error = vfs_readdir(f.file, fillonedir, &buf);
119 if (buf.result) 121 if (buf.result)
120 error = buf.result; 122 error = buf.result;
121 123
@@ -137,6 +139,7 @@ struct linux_dirent {
137}; 139};
138 140
139struct getdents_callback { 141struct getdents_callback {
142 struct dir_context ctx;
140 struct linux_dirent __user * current_dir; 143 struct linux_dirent __user * current_dir;
141 struct linux_dirent __user * previous; 144 struct linux_dirent __user * previous;
142 int count; 145 int count;
@@ -191,7 +194,11 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
191{ 194{
192 struct fd f; 195 struct fd f;
193 struct linux_dirent __user * lastdirent; 196 struct linux_dirent __user * lastdirent;
194 struct getdents_callback buf; 197 struct getdents_callback buf = {
198 .ctx.actor = filldir,
199 .count = count,
200 .current_dir = dirent
201 };
195 int error; 202 int error;
196 203
197 if (!access_ok(VERIFY_WRITE, dirent, count)) 204 if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -201,17 +208,12 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
201 if (!f.file) 208 if (!f.file)
202 return -EBADF; 209 return -EBADF;
203 210
204 buf.current_dir = dirent; 211 error = iterate_dir(f.file, &buf.ctx);
205 buf.previous = NULL;
206 buf.count = count;
207 buf.error = 0;
208
209 error = vfs_readdir(f.file, filldir, &buf);
210 if (error >= 0) 212 if (error >= 0)
211 error = buf.error; 213 error = buf.error;
212 lastdirent = buf.previous; 214 lastdirent = buf.previous;
213 if (lastdirent) { 215 if (lastdirent) {
214 if (put_user(f.file->f_pos, &lastdirent->d_off)) 216 if (put_user(buf.ctx.pos, &lastdirent->d_off))
215 error = -EFAULT; 217 error = -EFAULT;
216 else 218 else
217 error = count - buf.count; 219 error = count - buf.count;
@@ -221,6 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
221} 223}
222 224
223struct getdents_callback64 { 225struct getdents_callback64 {
226 struct dir_context ctx;
224 struct linux_dirent64 __user * current_dir; 227 struct linux_dirent64 __user * current_dir;
225 struct linux_dirent64 __user * previous; 228 struct linux_dirent64 __user * previous;
226 int count; 229 int count;
@@ -271,7 +274,11 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
271{ 274{
272 struct fd f; 275 struct fd f;
273 struct linux_dirent64 __user * lastdirent; 276 struct linux_dirent64 __user * lastdirent;
274 struct getdents_callback64 buf; 277 struct getdents_callback64 buf = {
278 .ctx.actor = filldir64,
279 .count = count,
280 .current_dir = dirent
281 };
275 int error; 282 int error;
276 283
277 if (!access_ok(VERIFY_WRITE, dirent, count)) 284 if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -281,17 +288,12 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
281 if (!f.file) 288 if (!f.file)
282 return -EBADF; 289 return -EBADF;
283 290
284 buf.current_dir = dirent; 291 error = iterate_dir(f.file, &buf.ctx);
285 buf.previous = NULL;
286 buf.count = count;
287 buf.error = 0;
288
289 error = vfs_readdir(f.file, filldir64, &buf);
290 if (error >= 0) 292 if (error >= 0)
291 error = buf.error; 293 error = buf.error;
292 lastdirent = buf.previous; 294 lastdirent = buf.previous;
293 if (lastdirent) { 295 if (lastdirent) {
294 typeof(lastdirent->d_off) d_off = f.file->f_pos; 296 typeof(lastdirent->d_off) d_off = buf.ctx.pos;
295 if (__put_user(d_off, &lastdirent->d_off)) 297 if (__put_user(d_off, &lastdirent->d_off))
296 error = -EFAULT; 298 error = -EFAULT;
297 else 299 else
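The getdents paths above show the caller side of iterate_dir(): each per-call buffer embeds a struct dir_context as its first member, so when dir_emit() invokes the actor with the context pointer, the old cast from void * still recovers the outer struct. A hedged sketch of that caller pattern (my_buf, my_actor and count_entries are illustrative, not kernel API):

/* Sketch of an iterate_dir() caller under the 3.11 API. */
struct my_buf {
	struct dir_context ctx;	/* must be first: the actor casts back */
	int seen;
};

static int my_actor(void *__buf, const char *name, int namlen,
		    loff_t offset, u64 ino, unsigned int d_type)
{
	struct my_buf *buf = __buf;	/* ctx sits at offset 0 */
	buf->seen++;
	return 0;			/* 0 = keep emitting entries */
}

static int count_entries(struct file *dir)
{
	struct my_buf buf = { .ctx.actor = my_actor };
	int err = iterate_dir(dir, &buf.ctx);

	return err < 0 ? err : buf.seen;
}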
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 66c53b642a88..03e4ca5624d6 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -13,14 +13,14 @@
13 13
14extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
15 15
16static int reiserfs_readdir(struct file *, void *, filldir_t); 16static int reiserfs_readdir(struct file *, struct dir_context *);
17static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, 17static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
18 int datasync); 18 int datasync);
19 19
20const struct file_operations reiserfs_dir_operations = { 20const struct file_operations reiserfs_dir_operations = {
21 .llseek = generic_file_llseek, 21 .llseek = generic_file_llseek,
22 .read = generic_read_dir, 22 .read = generic_read_dir,
23 .readdir = reiserfs_readdir, 23 .iterate = reiserfs_readdir,
24 .fsync = reiserfs_dir_fsync, 24 .fsync = reiserfs_dir_fsync,
25 .unlocked_ioctl = reiserfs_ioctl, 25 .unlocked_ioctl = reiserfs_ioctl,
26#ifdef CONFIG_COMPAT 26#ifdef CONFIG_COMPAT
@@ -50,18 +50,15 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
50 50
51#define store_ih(where,what) copy_item_head (where, what) 51#define store_ih(where,what) copy_item_head (where, what)
52 52
53static inline bool is_privroot_deh(struct dentry *dir, 53static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
54 struct reiserfs_de_head *deh)
55{ 54{
56 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; 55 struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
57 return (dir == dir->d_parent && privroot->d_inode && 56 return (privroot->d_inode &&
58 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); 57 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
59} 58}
60 59
61int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, 60int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
62 filldir_t filldir, loff_t *pos)
63{ 61{
64 struct inode *inode = dentry->d_inode;
65 struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ 62 struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */
66 INITIALIZE_PATH(path_to_entry); 63 INITIALIZE_PATH(path_to_entry);
67 struct buffer_head *bh; 64 struct buffer_head *bh;
@@ -81,7 +78,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
81 78
82 /* form key for search the next directory entry using f_pos field of 79 /* form key for search the next directory entry using f_pos field of
83 file structure */ 80 file structure */
84 make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); 81 make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
85 next_pos = cpu_key_k_offset(&pos_key); 82 next_pos = cpu_key_k_offset(&pos_key);
86 83
87 path_to_entry.reada = PATH_READA; 84 path_to_entry.reada = PATH_READA;
@@ -126,7 +123,6 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
126 entry_num++, deh++) { 123 entry_num++, deh++) {
127 int d_reclen; 124 int d_reclen;
128 char *d_name; 125 char *d_name;
129 off_t d_off;
130 ino_t d_ino; 126 ino_t d_ino;
131 127
132 if (!de_visible(deh)) 128 if (!de_visible(deh))
@@ -155,11 +151,10 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
155 } 151 }
156 152
157 /* Ignore the .reiserfs_priv entry */ 153 /* Ignore the .reiserfs_priv entry */
158 if (is_privroot_deh(dentry, deh)) 154 if (is_privroot_deh(inode, deh))
159 continue; 155 continue;
160 156
161 d_off = deh_offset(deh); 157 ctx->pos = deh_offset(deh);
162 *pos = d_off;
163 d_ino = deh_objectid(deh); 158 d_ino = deh_objectid(deh);
164 if (d_reclen <= 32) { 159 if (d_reclen <= 32) {
165 local_buf = small_buf; 160 local_buf = small_buf;
@@ -187,9 +182,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
187 * the write lock here for other waiters 182 * the write lock here for other waiters
188 */ 183 */
189 reiserfs_write_unlock(inode->i_sb); 184 reiserfs_write_unlock(inode->i_sb);
190 if (filldir 185 if (!dir_emit
191 (dirent, local_buf, d_reclen, d_off, d_ino, 186 (ctx, local_buf, d_reclen, d_ino,
192 DT_UNKNOWN) < 0) { 187 DT_UNKNOWN)) {
193 reiserfs_write_lock(inode->i_sb); 188 reiserfs_write_lock(inode->i_sb);
194 if (local_buf != small_buf) { 189 if (local_buf != small_buf) {
195 kfree(local_buf); 190 kfree(local_buf);
@@ -204,6 +199,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
204 next_pos = deh_offset(deh) + 1; 199 next_pos = deh_offset(deh) + 1;
205 200
206 if (item_moved(&tmp_ih, &path_to_entry)) { 201 if (item_moved(&tmp_ih, &path_to_entry)) {
202 set_cpu_key_k_offset(&pos_key,
203 next_pos);
207 goto research; 204 goto research;
208 } 205 }
209 } /* for */ 206 } /* for */
@@ -235,7 +232,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
235 } /* while */ 232 } /* while */
236 233
237end: 234end:
238 *pos = next_pos; 235 ctx->pos = next_pos;
239 pathrelse(&path_to_entry); 236 pathrelse(&path_to_entry);
240 reiserfs_check_path(&path_to_entry); 237 reiserfs_check_path(&path_to_entry);
241out: 238out:
@@ -243,10 +240,9 @@ out:
243 return ret; 240 return ret;
244} 241}
245 242
246static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir) 243static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
247{ 244{
248 struct dentry *dentry = file->f_path.dentry; 245 return reiserfs_readdir_inode(file_inode(file), ctx);
249 return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos);
250} 246}
251 247
252/* compose directory item containing "." and ".." entries (entries are 248/* compose directory item containing "." and ".." entries (entries are
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 77d6d47abc83..0048cc16a6a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1811,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1811 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); 1811 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1812 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); 1812 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1813 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); 1813 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
1814 if (insert_inode_locked4(inode, args.objectid, 1814
1815 reiserfs_find_actor, &args) < 0) { 1815 reiserfs_write_unlock(inode->i_sb);
1816 err = insert_inode_locked4(inode, args.objectid,
1817 reiserfs_find_actor, &args);
1818 reiserfs_write_lock(inode->i_sb);
1819 if (err) {
1816 err = -EINVAL; 1820 err = -EINVAL;
1817 goto out_bad_inode; 1821 goto out_bad_inode;
1818 } 1822 }
1823
1819 if (old_format_only(sb)) 1824 if (old_format_only(sb))
1820 /* not a perfect generation count, as object ids can be reused, but 1825 /* not a perfect generation count, as object ids can be reused, but
1821 ** this is as good as reiserfs can do right now. 1826 ** this is as good as reiserfs can do right now.
@@ -2970,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2970} 2975}
2971 2976
2972/* clm -- taken from fs/buffer.c:block_invalidate_page */ 2977/* clm -- taken from fs/buffer.c:block_invalidate_page */
2973static void reiserfs_invalidatepage(struct page *page, unsigned long offset) 2978static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
2979 unsigned int length)
2974{ 2980{
2975 struct buffer_head *head, *bh, *next; 2981 struct buffer_head *head, *bh, *next;
2976 struct inode *inode = page->mapping->host; 2982 struct inode *inode = page->mapping->host;
2977 unsigned int curr_off = 0; 2983 unsigned int curr_off = 0;
2984 unsigned int stop = offset + length;
2985 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2978 int ret = 1; 2986 int ret = 1;
2979 2987
2980 BUG_ON(!PageLocked(page)); 2988 BUG_ON(!PageLocked(page));
2981 2989
2982 if (offset == 0) 2990 if (!partial_page)
2983 ClearPageChecked(page); 2991 ClearPageChecked(page);
2984 2992
2985 if (!page_has_buffers(page)) 2993 if (!page_has_buffers(page))
@@ -2991,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
2991 unsigned int next_off = curr_off + bh->b_size; 2999 unsigned int next_off = curr_off + bh->b_size;
2992 next = bh->b_this_page; 3000 next = bh->b_this_page;
2993 3001
3002 if (next_off > stop)
3003 goto out;
3004
2994 /* 3005 /*
2995 * is this block fully invalidated? 3006 * is this block fully invalidated?
2996 */ 3007 */
@@ -3009,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
3009 * The get_block cached value has been unconditionally invalidated, 3020 * The get_block cached value has been unconditionally invalidated,
3010 * so real IO is not possible anymore. 3021 * so real IO is not possible anymore.
3011 */ 3022 */
3012 if (!offset && ret) { 3023 if (!partial_page && ret) {
3013 ret = try_to_release_page(page, 0); 3024 ret = try_to_release_page(page, 0);
3014 /* maybe should BUG_ON(!ret); - neilb */ 3025 /* maybe should BUG_ON(!ret); - neilb */
3015 } 3026 }
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 157e474ab303..3df5ce6c724d 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2709,7 +2709,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations;
2709extern const struct inode_operations reiserfs_symlink_inode_operations; 2709extern const struct inode_operations reiserfs_symlink_inode_operations;
2710extern const struct inode_operations reiserfs_special_inode_operations; 2710extern const struct inode_operations reiserfs_special_inode_operations;
2711extern const struct file_operations reiserfs_dir_operations; 2711extern const struct file_operations reiserfs_dir_operations;
2712int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *); 2712int reiserfs_readdir_inode(struct inode *, struct dir_context *);
2713 2713
2714/* tail_conversion.c */ 2714/* tail_conversion.c */
2715int direct2indirect(struct reiserfs_transaction_handle *, struct inode *, 2715int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 4cce1d9552fb..c69cdd749f09 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -171,6 +171,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
171 * modifying extended attributes. This includes operations such as permissions 171 * modifying extended attributes. This includes operations such as permissions
172 * or ownership changes, object deletions, etc. */ 172 * or ownership changes, object deletions, etc. */
173struct reiserfs_dentry_buf { 173struct reiserfs_dentry_buf {
174 struct dir_context ctx;
174 struct dentry *xadir; 175 struct dentry *xadir;
175 int count; 176 int count;
176 struct dentry *dentries[8]; 177 struct dentry *dentries[8];
@@ -223,9 +224,8 @@ static int reiserfs_for_each_xattr(struct inode *inode,
223{ 224{
224 struct dentry *dir; 225 struct dentry *dir;
225 int i, err = 0; 226 int i, err = 0;
226 loff_t pos = 0;
227 struct reiserfs_dentry_buf buf = { 227 struct reiserfs_dentry_buf buf = {
228 .count = 0, 228 .ctx.actor = fill_with_dentries,
229 }; 229 };
230 230
231 /* Skip out, an xattr has no xattrs associated with it */ 231 /* Skip out, an xattr has no xattrs associated with it */
@@ -249,29 +249,27 @@ static int reiserfs_for_each_xattr(struct inode *inode,
249 reiserfs_write_lock(inode->i_sb); 249 reiserfs_write_lock(inode->i_sb);
250 250
251 buf.xadir = dir; 251 buf.xadir = dir;
252 err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); 252 while (1) {
253 while ((err == 0 || err == -ENOSPC) && buf.count) { 253 err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
254 err = 0; 254 if (err)
255 255 break;
256 for (i = 0; i < buf.count && buf.dentries[i]; i++) { 256 if (!buf.count)
257 int lerr = 0; 257 break;
258 for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
258 struct dentry *dentry = buf.dentries[i]; 259 struct dentry *dentry = buf.dentries[i];
259 260
260 if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode)) 261 if (!S_ISDIR(dentry->d_inode->i_mode))
261 lerr = action(dentry, data); 262 err = action(dentry, data);
262 263
263 dput(dentry); 264 dput(dentry);
264 buf.dentries[i] = NULL; 265 buf.dentries[i] = NULL;
265 err = lerr ?: err;
266 } 266 }
267 if (err)
268 break;
267 buf.count = 0; 269 buf.count = 0;
268 if (!err)
269 err = reiserfs_readdir_dentry(dir, &buf,
270 fill_with_dentries, &pos);
271 } 270 }
272 mutex_unlock(&dir->d_inode->i_mutex); 271 mutex_unlock(&dir->d_inode->i_mutex);
273 272
274 /* Clean up after a failed readdir */
275 cleanup_dentry_buf(&buf); 273 cleanup_dentry_buf(&buf);
276 274
277 if (!err) { 275 if (!err) {
@@ -318,7 +316,19 @@ static int delete_one_xattr(struct dentry *dentry, void *data)
318static int chown_one_xattr(struct dentry *dentry, void *data) 316static int chown_one_xattr(struct dentry *dentry, void *data)
319{ 317{
320 struct iattr *attrs = data; 318 struct iattr *attrs = data;
321 return reiserfs_setattr(dentry, attrs); 319 int ia_valid = attrs->ia_valid;
320 int err;
321
322 /*
323 * We only want the ownership bits. Otherwise, we'll do
324 * things like change a directory to a regular file if
325 * ATTR_MODE is set.
326 */
327 attrs->ia_valid &= (ATTR_UID|ATTR_GID);
328 err = reiserfs_setattr(dentry, attrs);
329 attrs->ia_valid = ia_valid;
330
331 return err;
322} 332}
323 333
324/* No i_mutex, but the inode is unconnected. */ 334/* No i_mutex, but the inode is unconnected. */
@@ -788,6 +798,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
788} 798}
789 799
790struct listxattr_buf { 800struct listxattr_buf {
801 struct dir_context ctx;
791 size_t size; 802 size_t size;
792 size_t pos; 803 size_t pos;
793 char *buf; 804 char *buf;
@@ -833,8 +844,8 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
833{ 844{
834 struct dentry *dir; 845 struct dentry *dir;
835 int err = 0; 846 int err = 0;
836 loff_t pos = 0;
837 struct listxattr_buf buf = { 847 struct listxattr_buf buf = {
848 .ctx.actor = listxattr_filler,
838 .dentry = dentry, 849 .dentry = dentry,
839 .buf = buffer, 850 .buf = buffer,
840 .size = buffer ? size : 0, 851 .size = buffer ? size : 0,
@@ -856,7 +867,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
856 } 867 }
857 868
858 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); 869 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
859 err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos); 870 err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
860 mutex_unlock(&dir->d_inode->i_mutex); 871 mutex_unlock(&dir->d_inode->i_mutex);
861 872
862 if (!err) 873 if (!err)
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d7c01ef64eda..6c8767fdfc6a 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -443,6 +443,9 @@ int reiserfs_acl_chmod(struct inode *inode)
443 int depth; 443 int depth;
444 int error; 444 int error;
445 445
446 if (IS_PRIVATE(inode))
447 return 0;
448
446 if (S_ISLNK(inode->i_mode)) 449 if (S_ISLNK(inode->i_mode))
447 return -EOPNOTSUPP; 450 return -EOPNOTSUPP;
448 451
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 15cbc41ee365..ff1d3d42e72a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -145,19 +145,18 @@ static const struct address_space_operations romfs_aops = {
145/* 145/*
146 * read the entries from a directory 146 * read the entries from a directory
147 */ 147 */
148static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 148static int romfs_readdir(struct file *file, struct dir_context *ctx)
149{ 149{
150 struct inode *i = file_inode(filp); 150 struct inode *i = file_inode(file);
151 struct romfs_inode ri; 151 struct romfs_inode ri;
152 unsigned long offset, maxoff; 152 unsigned long offset, maxoff;
153 int j, ino, nextfh; 153 int j, ino, nextfh;
154 int stored = 0;
155 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ 154 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
156 int ret; 155 int ret;
157 156
158 maxoff = romfs_maxsize(i->i_sb); 157 maxoff = romfs_maxsize(i->i_sb);
159 158
160 offset = filp->f_pos; 159 offset = ctx->pos;
161 if (!offset) { 160 if (!offset) {
162 offset = i->i_ino & ROMFH_MASK; 161 offset = i->i_ino & ROMFH_MASK;
163 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); 162 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -170,10 +169,10 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
170 for (;;) { 169 for (;;) {
171 if (!offset || offset >= maxoff) { 170 if (!offset || offset >= maxoff) {
172 offset = maxoff; 171 offset = maxoff;
173 filp->f_pos = offset; 172 ctx->pos = offset;
174 goto out; 173 goto out;
175 } 174 }
176 filp->f_pos = offset; 175 ctx->pos = offset;
177 176
178 /* Fetch inode info */ 177 /* Fetch inode info */
179 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); 178 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -194,16 +193,14 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
194 nextfh = be32_to_cpu(ri.next); 193 nextfh = be32_to_cpu(ri.next);
195 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) 194 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
196 ino = be32_to_cpu(ri.spec); 195 ino = be32_to_cpu(ri.spec);
197 if (filldir(dirent, fsname, j, offset, ino, 196 if (!dir_emit(ctx, fsname, j, ino,
198 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) 197 romfs_dtype_table[nextfh & ROMFH_TYPE]))
199 goto out; 198 goto out;
200 199
201 stored++;
202 offset = nextfh & ROMFH_MASK; 200 offset = nextfh & ROMFH_MASK;
203 } 201 }
204
205out: 202out:
206 return stored; 203 return 0;
207} 204}
208 205
209/* 206/*
@@ -281,7 +278,7 @@ error:
281 278
282static const struct file_operations romfs_dir_operations = { 279static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir, 280 .read = generic_read_dir,
284 .readdir = romfs_readdir, 281 .iterate = romfs_readdir,
285 .llseek = default_llseek, 282 .llseek = default_llseek,
286}; 283};
287 284
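The romfs hunks above show the shape shared by every readdir conversion in this series: file->f_pos becomes ctx->pos, the filldir callback and its opaque dirent cookie disappear behind dir_emit(), which returns false when the caller's buffer is full, and the method now returns 0 on success instead of a count. Below is a minimal userspace model of that contract; struct dir_context, dir_emit() and the actor type are simplified stand-ins for the kernel definitions, not the real API.

/*
 * Userspace model of the dir_context/dir_emit contract.  The actor
 * returns 0 to keep going; dir_emit() folds that into a bool so the
 * filesystem simply stops emitting when it sees false.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct dir_context {
        int (*actor)(struct dir_context *, const char *, int,
                     long long, unsigned long, unsigned int);
        long long pos;                  /* replaces file->f_pos */
};

static bool dir_emit(struct dir_context *ctx, const char *name, int len,
                     unsigned long ino, unsigned int type)
{
        return ctx->actor(ctx, name, len, ctx->pos, ino, type) == 0;
}

/* toy actor: prints entries and never reports a full buffer */
static int print_actor(struct dir_context *ctx, const char *name, int len,
                       long long pos, unsigned long ino, unsigned int type)
{
        printf("pos=%lld ino=%lu %.*s\n", pos, ino, len, name);
        return 0;
}

int main(void)
{
        struct dir_context ctx = { .actor = print_actor, .pos = 0 };
        const char *names[] = { ".", "..", "file" };

        for (int i = 0; i < 3; i++) {
                if (!dir_emit(&ctx, names[i], (int)strlen(names[i]), i + 1, 0))
                        break;          /* like "goto out" in romfs_readdir */
                ctx.pos++;              /* the filesystem advances ctx->pos */
        }
        return 0;
}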
diff --git a/fs/splice.c b/fs/splice.c
index e6b25598c8c4..3b7ee656f3aa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1098,27 +1098,13 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1098{ 1098{
1099 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, 1099 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1100 loff_t *, size_t, unsigned int); 1100 loff_t *, size_t, unsigned int);
1101 int ret;
1102
1103 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1104 return -EBADF;
1105
1106 if (unlikely(out->f_flags & O_APPEND))
1107 return -EINVAL;
1108
1109 ret = rw_verify_area(WRITE, out, ppos, len);
1110 if (unlikely(ret < 0))
1111 return ret;
1112 1101
1113 if (out->f_op && out->f_op->splice_write) 1102 if (out->f_op && out->f_op->splice_write)
1114 splice_write = out->f_op->splice_write; 1103 splice_write = out->f_op->splice_write;
1115 else 1104 else
1116 splice_write = default_file_splice_write; 1105 splice_write = default_file_splice_write;
1117 1106
1118 file_start_write(out); 1107 return splice_write(pipe, out, ppos, len, flags);
1119 ret = splice_write(pipe, out, ppos, len, flags);
1120 file_end_write(out);
1121 return ret;
1122} 1108}
1123 1109
1124/* 1110/*
@@ -1274,7 +1260,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
1274{ 1260{
1275 struct file *file = sd->u.file; 1261 struct file *file = sd->u.file;
1276 1262
1277 return do_splice_from(pipe, file, &file->f_pos, sd->total_len, 1263 return do_splice_from(pipe, file, sd->opos, sd->total_len,
1278 sd->flags); 1264 sd->flags);
1279} 1265}
1280 1266
@@ -1283,6 +1269,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
1283 * @in: file to splice from 1269 * @in: file to splice from
1284 * @ppos: input file offset 1270 * @ppos: input file offset
1285 * @out: file to splice to 1271 * @out: file to splice to
1272 * @opos: output file offset
1286 * @len: number of bytes to splice 1273 * @len: number of bytes to splice
1287 * @flags: splice modifier flags 1274 * @flags: splice modifier flags
1288 * 1275 *
@@ -1294,7 +1281,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
1294 * 1281 *
1295 */ 1282 */
1296long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1283long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1297 size_t len, unsigned int flags) 1284 loff_t *opos, size_t len, unsigned int flags)
1298{ 1285{
1299 struct splice_desc sd = { 1286 struct splice_desc sd = {
1300 .len = len, 1287 .len = len,
@@ -1302,9 +1289,20 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1302 .flags = flags, 1289 .flags = flags,
1303 .pos = *ppos, 1290 .pos = *ppos,
1304 .u.file = out, 1291 .u.file = out,
1292 .opos = opos,
1305 }; 1293 };
1306 long ret; 1294 long ret;
1307 1295
1296 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1297 return -EBADF;
1298
1299 if (unlikely(out->f_flags & O_APPEND))
1300 return -EINVAL;
1301
1302 ret = rw_verify_area(WRITE, out, opos, len);
1303 if (unlikely(ret < 0))
1304 return ret;
1305
1308 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1306 ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1309 if (ret > 0) 1307 if (ret > 0)
1310 *ppos = sd.pos; 1308 *ppos = sd.pos;
@@ -1325,7 +1323,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1325{ 1323{
1326 struct pipe_inode_info *ipipe; 1324 struct pipe_inode_info *ipipe;
1327 struct pipe_inode_info *opipe; 1325 struct pipe_inode_info *opipe;
1328 loff_t offset, *off; 1326 loff_t offset;
1329 long ret; 1327 long ret;
1330 1328
1331 ipipe = get_pipe_info(in); 1329 ipipe = get_pipe_info(in);
@@ -1356,13 +1354,27 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1356 return -EINVAL; 1354 return -EINVAL;
1357 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1355 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1358 return -EFAULT; 1356 return -EFAULT;
1359 off = &offset; 1357 } else {
1360 } else 1358 offset = out->f_pos;
1361 off = &out->f_pos; 1359 }
1362 1360
1363 ret = do_splice_from(ipipe, out, off, len, flags); 1361 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1362 return -EBADF;
1364 1363
1365 if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) 1364 if (unlikely(out->f_flags & O_APPEND))
1365 return -EINVAL;
1366
1367 ret = rw_verify_area(WRITE, out, &offset, len);
1368 if (unlikely(ret < 0))
1369 return ret;
1370
1371 file_start_write(out);
1372 ret = do_splice_from(ipipe, out, &offset, len, flags);
1373 file_end_write(out);
1374
1375 if (!off_out)
1376 out->f_pos = offset;
1377 else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1366 ret = -EFAULT; 1378 ret = -EFAULT;
1367 1379
1368 return ret; 1380 return ret;
@@ -1376,13 +1388,15 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1376 return -EINVAL; 1388 return -EINVAL;
1377 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1389 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1378 return -EFAULT; 1390 return -EFAULT;
1379 off = &offset; 1391 } else {
1380 } else 1392 offset = in->f_pos;
1381 off = &in->f_pos; 1393 }
1382 1394
1383 ret = do_splice_to(in, off, opipe, len, flags); 1395 ret = do_splice_to(in, &offset, opipe, len, flags);
1384 1396
1385 if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) 1397 if (!off_in)
1398 in->f_pos = offset;
1399 else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1386 ret = -EFAULT; 1400 ret = -EFAULT;
1387 1401
1388 return ret; 1402 return ret;
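The splice.c changes move the FMODE_WRITE/O_APPEND/rw_verify_area checks out of do_splice_from() into its callers, and replace the old "off = &out->f_pos" aliasing with a local offset that is committed back only after the transfer. The following userspace sketch shows that commit-after pattern; fake_splice_from() is a stand-in for the real pipe-to-file transfer.

#include <stdio.h>

struct file { long long f_pos; };

/* stand-in for do_splice_from(): "moves" len bytes, advances *ppos */
static long fake_splice_from(long long *ppos, long len)
{
        *ppos += len;
        return len;
}

static long do_splice_out(struct file *out, long long *off_out, long len)
{
        long long offset = off_out ? *off_out : out->f_pos;
        long ret = fake_splice_from(&offset, len);

        if (!off_out)
                out->f_pos = offset;    /* implicit offset: commit to f_pos */
        else
                *off_out = offset;      /* explicit offset: hand it back */
        return ret;
}

int main(void)
{
        struct file out = { 100 };
        long long user_off = 0;
        long ret;

        ret = do_splice_out(&out, NULL, 10);
        printf("ret=%ld f_pos=%lld\n", ret, out.f_pos);         /* 110 */
        ret = do_splice_out(&out, &user_off, 10);
        printf("ret=%ld user_off=%lld\n", ret, user_off);       /* 10 */
        return 0;
}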
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 57dc70ebbb19..f7f527bf8c10 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -100,7 +100,7 @@ static int get_dir_index_using_offset(struct super_block *sb,
100} 100}
101 101
102 102
103static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) 103static int squashfs_readdir(struct file *file, struct dir_context *ctx)
104{ 104{
105 struct inode *inode = file_inode(file); 105 struct inode *inode = file_inode(file);
106 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; 106 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
@@ -127,11 +127,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
127 * It also means that the external f_pos is offset by 3 from the 127 * It also means that the external f_pos is offset by 3 from the
128 * on-disk directory f_pos. 128 * on-disk directory f_pos.
129 */ 129 */
130 while (file->f_pos < 3) { 130 while (ctx->pos < 3) {
131 char *name; 131 char *name;
132 int i_ino; 132 int i_ino;
133 133
134 if (file->f_pos == 0) { 134 if (ctx->pos == 0) {
135 name = "."; 135 name = ".";
136 size = 1; 136 size = 1;
137 i_ino = inode->i_ino; 137 i_ino = inode->i_ino;
@@ -141,24 +141,18 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
141 i_ino = squashfs_i(inode)->parent; 141 i_ino = squashfs_i(inode)->parent;
142 } 142 }
143 143
144 TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n", 144 if (!dir_emit(ctx, name, size, i_ino,
145 dirent, name, size, file->f_pos, i_ino, 145 squashfs_filetype_table[1]))
146 squashfs_filetype_table[1]);
147
148 if (filldir(dirent, name, size, file->f_pos, i_ino,
149 squashfs_filetype_table[1]) < 0) {
150 TRACE("Filldir returned less than 0\n");
151 goto finish; 146 goto finish;
152 }
153 147
154 file->f_pos += size; 148 ctx->pos += size;
155 } 149 }
156 150
157 length = get_dir_index_using_offset(inode->i_sb, &block, &offset, 151 length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
158 squashfs_i(inode)->dir_idx_start, 152 squashfs_i(inode)->dir_idx_start,
159 squashfs_i(inode)->dir_idx_offset, 153 squashfs_i(inode)->dir_idx_offset,
160 squashfs_i(inode)->dir_idx_cnt, 154 squashfs_i(inode)->dir_idx_cnt,
161 file->f_pos); 155 ctx->pos);
162 156
163 while (length < i_size_read(inode)) { 157 while (length < i_size_read(inode)) {
164 /* 158 /*
@@ -198,7 +192,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
198 192
199 length += sizeof(*dire) + size; 193 length += sizeof(*dire) + size;
200 194
201 if (file->f_pos >= length) 195 if (ctx->pos >= length)
202 continue; 196 continue;
203 197
204 dire->name[size] = '\0'; 198 dire->name[size] = '\0';
@@ -206,22 +200,12 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
206 ((short) le16_to_cpu(dire->inode_number)); 200 ((short) le16_to_cpu(dire->inode_number));
207 type = le16_to_cpu(dire->type); 201 type = le16_to_cpu(dire->type);
208 202
209 TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)" 203 if (!dir_emit(ctx, dire->name, size,
210 "\n", dirent, dire->name, size,
211 file->f_pos,
212 le32_to_cpu(dirh.start_block),
213 le16_to_cpu(dire->offset),
214 inode_number,
215 squashfs_filetype_table[type]);
216
217 if (filldir(dirent, dire->name, size, file->f_pos,
218 inode_number, 204 inode_number,
219 squashfs_filetype_table[type]) < 0) { 205 squashfs_filetype_table[type]))
220 TRACE("Filldir returned less than 0\n");
221 goto finish; 206 goto finish;
222 }
223 207
224 file->f_pos = length; 208 ctx->pos = length;
225 } 209 }
226 } 210 }
227 211
@@ -238,6 +222,6 @@ failed_read:
238 222
239const struct file_operations squashfs_dir_ops = { 223const struct file_operations squashfs_dir_ops = {
240 .read = generic_read_dir, 224 .read = generic_read_dir,
241 .readdir = squashfs_readdir, 225 .iterate = squashfs_readdir,
242 .llseek = default_llseek, 226 .llseek = default_llseek,
243}; 227};
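The comment retained above says the externally visible position is offset by 3 from the on-disk one; that falls straight out of the dot handling, since "." has size 1, ".." has size 2, and ctx->pos advances by size. A quick check of the arithmetic:

#include <stdio.h>

int main(void)
{
        long long pos = 0;      /* ctx->pos on the first call */

        pos += 1;               /* dir_emit(".")  - size 1, pos 0 -> 1 */
        pos += 2;               /* dir_emit("..") - size 2, pos 1 -> 3 */

        /* "while (ctx->pos < 3)" is now false: the dots are done */
        printf("first on-disk entry reported at pos %lld\n", pos);
        return 0;
}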
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e8e0e71b29d5..e068e744dbdd 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
74} 74}
75 75
76/** 76/**
77 * sysfs_link_subling - link sysfs_dirent into sibling rbtree 77 * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
78 * @sd: sysfs_dirent of interest 78 * @sd: sysfs_dirent of interest
79 * 79 *
80 * Link @sd into its sibling rbtree which starts from 80 * Link @sd into its sibling rbtree which starts from
@@ -998,68 +998,38 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
998 return pos; 998 return pos;
999} 999}
1000 1000
1001static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 1001static int sysfs_readdir(struct file *file, struct dir_context *ctx)
1002{ 1002{
1003 struct dentry *dentry = filp->f_path.dentry; 1003 struct dentry *dentry = file->f_path.dentry;
1004 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 1004 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
1005 struct sysfs_dirent *pos = filp->private_data; 1005 struct sysfs_dirent *pos = file->private_data;
1006 enum kobj_ns_type type; 1006 enum kobj_ns_type type;
1007 const void *ns; 1007 const void *ns;
1008 ino_t ino;
1009 loff_t off;
1010 1008
1011 type = sysfs_ns_type(parent_sd); 1009 type = sysfs_ns_type(parent_sd);
1012 ns = sysfs_info(dentry->d_sb)->ns[type]; 1010 ns = sysfs_info(dentry->d_sb)->ns[type];
1013 1011
1014 if (filp->f_pos == 0) { 1012 if (!dir_emit_dots(file, ctx))
1015 ino = parent_sd->s_ino; 1013 return 0;
1016 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
1017 filp->f_pos++;
1018 else
1019 return 0;
1020 }
1021 if (filp->f_pos == 1) {
1022 if (parent_sd->s_parent)
1023 ino = parent_sd->s_parent->s_ino;
1024 else
1025 ino = parent_sd->s_ino;
1026 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
1027 filp->f_pos++;
1028 else
1029 return 0;
1030 }
1031 mutex_lock(&sysfs_mutex); 1014 mutex_lock(&sysfs_mutex);
1032 off = filp->f_pos; 1015 for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
1033 for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
1034 pos; 1016 pos;
1035 pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { 1017 pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
1036 const char * name; 1018 const char *name = pos->s_name;
1037 unsigned int type; 1019 unsigned int type = dt_type(pos);
1038 int len, ret; 1020 int len = strlen(name);
1039 1021 ino_t ino = pos->s_ino;
1040 name = pos->s_name; 1022 ctx->pos = pos->s_hash;
1041 len = strlen(name); 1023 file->private_data = sysfs_get(pos);
1042 ino = pos->s_ino;
1043 type = dt_type(pos);
1044 off = filp->f_pos = pos->s_hash;
1045 filp->private_data = sysfs_get(pos);
1046 1024
1047 mutex_unlock(&sysfs_mutex); 1025 mutex_unlock(&sysfs_mutex);
1048 ret = filldir(dirent, name, len, off, ino, type); 1026 if (!dir_emit(ctx, name, len, ino, type))
1027 return 0;
1049 mutex_lock(&sysfs_mutex); 1028 mutex_lock(&sysfs_mutex);
1050 if (ret < 0)
1051 break;
1052 } 1029 }
1053 mutex_unlock(&sysfs_mutex); 1030 mutex_unlock(&sysfs_mutex);
1054 1031 file->private_data = NULL;
1055 /* don't reference last entry if its refcount is dropped */ 1032 ctx->pos = INT_MAX;
1056 if (!pos) {
1057 filp->private_data = NULL;
1058
1059 /* EOF and not changed as 0 or 1 in read/write path */
1060 if (off == filp->f_pos && off > 1)
1061 filp->f_pos = INT_MAX;
1062 }
1063 return 0; 1033 return 0;
1064} 1034}
1065 1035
@@ -1077,7 +1047,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
1077 1047
1078const struct file_operations sysfs_dir_operations = { 1048const struct file_operations sysfs_dir_operations = {
1079 .read = generic_read_dir, 1049 .read = generic_read_dir,
1080 .readdir = sysfs_readdir, 1050 .iterate = sysfs_readdir,
1081 .release = sysfs_dir_release, 1051 .release = sysfs_dir_release,
1082 .llseek = sysfs_dir_llseek, 1052 .llseek = sysfs_dir_llseek,
1083}; 1053};
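sysfs_readdir() drops sysfs_mutex around each dir_emit() call, since the emit may fault copying names to userspace, and relies on the s_hash cursor stored in ctx->pos to re-find its place afterwards. The sketch below models that unlock-around-callback discipline in userspace; the rbtree walk is replaced by an array and the cursor by an index, both stand-ins.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static const char *entries[] = { "alpha", "beta", "gamma" };

/* may block (in the kernel: fault on the user buffer) - lock dropped */
static bool emit(const char *name)
{
        printf("%s\n", name);
        return true;
}

int main(void)
{
        int i = 0;                      /* cursor, like ctx->pos = s_hash */

        pthread_mutex_lock(&tree_lock);
        while (i < 3) {
                const char *name = entries[i];

                pthread_mutex_unlock(&tree_lock);
                if (!emit(name))
                        return 0;       /* stop with the lock dropped */
                pthread_mutex_lock(&tree_lock);
                i++;                    /* re-find position under the lock */
        }
        pthread_mutex_unlock(&tree_lock);
        return 0;
}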
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 602f56db0442..d2bb7ed8fa74 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
449 449
450 spin_lock_irqsave(&sysfs_open_dirent_lock, flags); 450 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
451 451
452 od = sd->s_attr.open; 452 if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
453 if (od) { 453 od = sd->s_attr.open;
454 atomic_inc(&od->event); 454 if (od) {
455 wake_up_interruptible(&od->poll); 455 atomic_inc(&od->event);
456 wake_up_interruptible(&od->poll);
457 }
456 } 458 }
457 459
458 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); 460 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0ce3ccf7f401..3e2837a633ed 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -24,8 +24,6 @@
24#include <linux/security.h> 24#include <linux/security.h>
25#include "sysfs.h" 25#include "sysfs.h"
26 26
27extern struct super_block * sysfs_sb;
28
29static const struct address_space_operations sysfs_aops = { 27static const struct address_space_operations sysfs_aops = {
30 .readpage = simple_readpage, 28 .readpage = simple_readpage,
31 .write_begin = simple_write_begin, 29 .write_begin = simple_write_begin,
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 3799e8dac3eb..d42291d08215 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -18,12 +18,12 @@
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include "sysv.h" 19#include "sysv.h"
20 20
21static int sysv_readdir(struct file *, void *, filldir_t); 21static int sysv_readdir(struct file *, struct dir_context *);
22 22
23const struct file_operations sysv_dir_operations = { 23const struct file_operations sysv_dir_operations = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .iterate = sysv_readdir,
27 .fsync = generic_file_fsync, 27 .fsync = generic_file_fsync,
28}; 28};
29 29
@@ -65,18 +65,21 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
65 return page; 65 return page;
66} 66}
67 67
68static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) 68static int sysv_readdir(struct file *file, struct dir_context *ctx)
69{ 69{
70 unsigned long pos = filp->f_pos; 70 unsigned long pos = ctx->pos;
71 struct inode *inode = file_inode(filp); 71 struct inode *inode = file_inode(file);
72 struct super_block *sb = inode->i_sb; 72 struct super_block *sb = inode->i_sb;
73 unsigned offset = pos & ~PAGE_CACHE_MASK;
74 unsigned long n = pos >> PAGE_CACHE_SHIFT;
75 unsigned long npages = dir_pages(inode); 73 unsigned long npages = dir_pages(inode);
74 unsigned offset;
75 unsigned long n;
76 76
77 pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); 77 ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
78 if (pos >= inode->i_size) 78 if (pos >= inode->i_size)
79 goto done; 79 return 0;
80
81 offset = pos & ~PAGE_CACHE_MASK;
82 n = pos >> PAGE_CACHE_SHIFT;
80 83
81 for ( ; n < npages; n++, offset = 0) { 84 for ( ; n < npages; n++, offset = 0) {
82 char *kaddr, *limit; 85 char *kaddr, *limit;
@@ -88,29 +91,21 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
88 kaddr = (char *)page_address(page); 91 kaddr = (char *)page_address(page);
89 de = (struct sysv_dir_entry *)(kaddr+offset); 92 de = (struct sysv_dir_entry *)(kaddr+offset);
90 limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE; 93 limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
91 for ( ;(char*)de <= limit; de++) { 94 for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
92 char *name = de->name; 95 char *name = de->name;
93 int over;
94 96
95 if (!de->inode) 97 if (!de->inode)
96 continue; 98 continue;
97 99
98 offset = (char *)de - kaddr; 100 if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
99
100 over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN),
101 ((loff_t)n<<PAGE_CACHE_SHIFT) | offset,
102 fs16_to_cpu(SYSV_SB(sb), de->inode), 101 fs16_to_cpu(SYSV_SB(sb), de->inode),
103 DT_UNKNOWN); 102 DT_UNKNOWN)) {
104 if (over) {
105 dir_put_page(page); 103 dir_put_page(page);
106 goto done; 104 return 0;
107 } 105 }
108 } 106 }
109 dir_put_page(page); 107 dir_put_page(page);
110 } 108 }
111
112done:
113 filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
114 return 0; 109 return 0;
115} 110}
116 111
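sysv_readdir() now rounds the incoming position up to an entry boundary with (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1), which is valid because SYSV_DIRSIZE is a power of two (taken as 16 here, the size of struct sysv_dir_entry). A standalone check of that identity:

#include <assert.h>

#define SYSV_DIRSIZE 16                 /* sizeof(struct sysv_dir_entry) */

int main(void)
{
        for (unsigned long pos = 0; pos < 64; pos++) {
                unsigned long up = (pos + SYSV_DIRSIZE - 1) &
                                   ~(unsigned long)(SYSV_DIRSIZE - 1);

                assert(up % SYSV_DIRSIZE == 0); /* aligned */
                assert(up >= pos);              /* never moves backwards */
                assert(up - pos < SYSV_DIRSIZE);/* minimal round-up */
        }
        return 0;
}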
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 1c0d5f264767..731b2bbcaab3 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,8 +27,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
27 return err; 27 return err;
28} 28}
29 29
30static int sysv_hash(const struct dentry *dentry, const struct inode *inode, 30static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
31 struct qstr *qstr)
32{ 31{
33 /* Truncate the name in place, avoids having to define a compare 32 /* Truncate the name in place, avoids having to define a compare
34 function. */ 33 function. */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index de08c92f2e23..6b4947f75af7 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -346,38 +346,46 @@ static unsigned int vfs_dent_type(uint8_t type)
346 * This means that UBIFS cannot support NFS which requires full 346 * This means that UBIFS cannot support NFS which requires full
347 * 'seekdir()'/'telldir()' support. 347 * 'seekdir()'/'telldir()' support.
348 */ 348 */
349static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) 349static int ubifs_readdir(struct file *file, struct dir_context *ctx)
350{ 350{
351 int err, over = 0; 351 int err;
352 struct qstr nm; 352 struct qstr nm;
353 union ubifs_key key; 353 union ubifs_key key;
354 struct ubifs_dent_node *dent; 354 struct ubifs_dent_node *dent;
355 struct inode *dir = file_inode(file); 355 struct inode *dir = file_inode(file);
356 struct ubifs_info *c = dir->i_sb->s_fs_info; 356 struct ubifs_info *c = dir->i_sb->s_fs_info;
357 357
358 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); 358 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
359 359
360 if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) 360 if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2)
361 /* 361 /*
362 * The directory was seek'ed to a senseless position or there 362 * The directory was seek'ed to a senseless position or there
363 * are no more entries. 363 * are no more entries.
364 */ 364 */
365 return 0; 365 return 0;
366 366
367 /* File positions 0 and 1 correspond to "." and ".." */ 367 if (file->f_version == 0) {
368 if (file->f_pos == 0) { 368 /*
369 ubifs_assert(!file->private_data); 369 * The file was seek'ed, which means that @file->private_data
370 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); 370 * is now invalid. This may also be just the first
371 if (over) 371 * 'ubifs_readdir()' invocation, in which case
372 return 0; 372 * @file->private_data is NULL, and the below code is
373 file->f_pos = 1; 373 * basically a no-op.
374 */
375 kfree(file->private_data);
376 file->private_data = NULL;
374 } 377 }
375 378
376 if (file->f_pos == 1) { 379 /*
380 * 'generic_file_llseek()' unconditionally sets @file->f_version to
381 * zero, and we use this for detecting whether the file was seek'ed.
382 */
383 file->f_version = 1;
384
385 /* File positions 0 and 1 correspond to "." and ".." */
386 if (ctx->pos < 2) {
377 ubifs_assert(!file->private_data); 387 ubifs_assert(!file->private_data);
378 over = filldir(dirent, "..", 2, 1, 388 if (!dir_emit_dots(file, ctx))
379 parent_ino(file->f_path.dentry), DT_DIR);
380 if (over)
381 return 0; 389 return 0;
382 390
383 /* Find the first entry in TNC and save it */ 391 /* Find the first entry in TNC and save it */
@@ -389,7 +397,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
389 goto out; 397 goto out;
390 } 398 }
391 399
392 file->f_pos = key_hash_flash(c, &dent->key); 400 ctx->pos = key_hash_flash(c, &dent->key);
393 file->private_data = dent; 401 file->private_data = dent;
394 } 402 }
395 403
@@ -397,17 +405,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
397 if (!dent) { 405 if (!dent) {
398 /* 406 /*
399 * The directory was seek'ed to and is now readdir'ed. 407 * The directory was seek'ed to and is now readdir'ed.
400 * Find the entry corresponding to @file->f_pos or the 408 * Find the entry corresponding to @ctx->pos or the closest one.
401 * closest one.
402 */ 409 */
403 dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); 410 dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
404 nm.name = NULL; 411 nm.name = NULL;
405 dent = ubifs_tnc_next_ent(c, &key, &nm); 412 dent = ubifs_tnc_next_ent(c, &key, &nm);
406 if (IS_ERR(dent)) { 413 if (IS_ERR(dent)) {
407 err = PTR_ERR(dent); 414 err = PTR_ERR(dent);
408 goto out; 415 goto out;
409 } 416 }
410 file->f_pos = key_hash_flash(c, &dent->key); 417 ctx->pos = key_hash_flash(c, &dent->key);
411 file->private_data = dent; 418 file->private_data = dent;
412 } 419 }
413 420
@@ -419,10 +426,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
419 ubifs_inode(dir)->creat_sqnum); 426 ubifs_inode(dir)->creat_sqnum);
420 427
421 nm.len = le16_to_cpu(dent->nlen); 428 nm.len = le16_to_cpu(dent->nlen);
422 over = filldir(dirent, dent->name, nm.len, file->f_pos, 429 if (!dir_emit(ctx, dent->name, nm.len,
423 le64_to_cpu(dent->inum), 430 le64_to_cpu(dent->inum),
424 vfs_dent_type(dent->type)); 431 vfs_dent_type(dent->type)))
425 if (over)
426 return 0; 432 return 0;
427 433
428 /* Switch to the next entry */ 434 /* Switch to the next entry */
@@ -435,7 +441,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
435 } 441 }
436 442
437 kfree(file->private_data); 443 kfree(file->private_data);
438 file->f_pos = key_hash_flash(c, &dent->key); 444 ctx->pos = key_hash_flash(c, &dent->key);
439 file->private_data = dent; 445 file->private_data = dent;
440 cond_resched(); 446 cond_resched();
441 } 447 }
@@ -448,18 +454,11 @@ out:
448 454
449 kfree(file->private_data); 455 kfree(file->private_data);
450 file->private_data = NULL; 456 file->private_data = NULL;
451 file->f_pos = 2; 457 /* 2 is a special value indicating that there are no more direntries */
458 ctx->pos = 2;
452 return 0; 459 return 0;
453} 460}
454 461
455/* If a directory is seeked, we have to free saved readdir() state */
456static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
457{
458 kfree(file->private_data);
459 file->private_data = NULL;
460 return generic_file_llseek(file, offset, whence);
461}
462
463/* Free saved readdir() state when the directory is closed */ 462/* Free saved readdir() state when the directory is closed */
464static int ubifs_dir_release(struct inode *dir, struct file *file) 463static int ubifs_dir_release(struct inode *dir, struct file *file)
465{ 464{
@@ -1177,10 +1176,10 @@ const struct inode_operations ubifs_dir_inode_operations = {
1177}; 1176};
1178 1177
1179const struct file_operations ubifs_dir_operations = { 1178const struct file_operations ubifs_dir_operations = {
1180 .llseek = ubifs_dir_llseek, 1179 .llseek = generic_file_llseek,
1181 .release = ubifs_dir_release, 1180 .release = ubifs_dir_release,
1182 .read = generic_read_dir, 1181 .read = generic_read_dir,
1183 .readdir = ubifs_readdir, 1182 .iterate = ubifs_readdir,
1184 .fsync = ubifs_fsync, 1183 .fsync = ubifs_fsync,
1185 .unlocked_ioctl = ubifs_ioctl, 1184 .unlocked_ioctl = ubifs_ioctl,
1186#ifdef CONFIG_COMPAT 1185#ifdef CONFIG_COMPAT
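Dropping ubifs_dir_llseek() works because generic_file_llseek() unconditionally zeroes file->f_version, so ubifs_readdir() can treat f_version == 0 as "the position changed, my cached cursor is stale" and set it back to 1 once the cache is valid again. A userspace model of that handshake, with malloc/free standing in for the cached ubifs_dent_node:

#include <stdio.h>
#include <stdlib.h>

struct file {
        unsigned long f_version;
        void *private_data;             /* cached readdir cursor */
};

static void llseek(struct file *f)
{
        f->f_version = 0;               /* what generic_file_llseek() does */
}

static void readdir_step(struct file *f)
{
        if (f->f_version == 0) {        /* seeked (or first call): drop cache */
                free(f->private_data);
                f->private_data = NULL;
        }
        f->f_version = 1;               /* arm detection for the next seek */

        if (!f->private_data)
                f->private_data = malloc(64);   /* re-lookup the cursor */
        printf("cursor %p\n", f->private_data);
}

int main(void)
{
        struct file f = { 0, NULL };

        readdir_step(&f);               /* first call: allocates a cursor */
        readdir_step(&f);               /* no seek: cursor reused */
        llseek(&f);
        readdir_step(&f);               /* seek detected: cursor rebuilt */
        free(f.private_data);
        return 0;
}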
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 14374530784c..123c79b7261e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
1277 return err; 1277 return err;
1278} 1278}
1279 1279
1280static void ubifs_invalidatepage(struct page *page, unsigned long offset) 1280static void ubifs_invalidatepage(struct page *page, unsigned int offset,
1281 unsigned int length)
1281{ 1282{
1282 struct inode *inode = page->mapping->host; 1283 struct inode *inode = page->mapping->host;
1283 struct ubifs_info *c = inode->i_sb->s_fs_info; 1284 struct ubifs_info *c = inode->i_sb->s_fs_info;
1284 1285
1285 ubifs_assert(PagePrivate(page)); 1286 ubifs_assert(PagePrivate(page));
1286 if (offset) 1287 if (offset || length < PAGE_CACHE_SIZE)
1287 /* Partial page remains dirty */ 1288 /* Partial page remains dirty */
1288 return; 1289 return;
1289 1290
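The ->invalidatepage() signature change threads a length through, so an implementation can tell truncation of the whole page from a punch of part of it; ubifs keeps the page dirty unless the range covers it entirely. The predicate, checked standalone (PAGE_CACHE_SIZE taken as 4096 for illustration):

#include <assert.h>
#include <stdbool.h>

#define PAGE_CACHE_SIZE 4096u           /* illustrative page size */

static bool whole_page(unsigned int offset, unsigned int length)
{
        return offset == 0 && length >= PAGE_CACHE_SIZE;
}

int main(void)
{
        assert(whole_page(0, PAGE_CACHE_SIZE));
        assert(!whole_page(0, 512));                     /* partial punch */
        assert(!whole_page(512, PAGE_CACHE_SIZE - 512)); /* tail truncate */
        return 0;
}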
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index b3e93f5e17c3..a012c51caffd 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -35,14 +35,16 @@
35#include "udf_i.h" 35#include "udf_i.h"
36#include "udf_sb.h" 36#include "udf_sb.h"
37 37
38static int do_udf_readdir(struct inode *dir, struct file *filp, 38
39 filldir_t filldir, void *dirent) 39static int udf_readdir(struct file *file, struct dir_context *ctx)
40{ 40{
41 struct inode *dir = file_inode(file);
42 struct udf_inode_info *iinfo = UDF_I(dir);
41 struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; 43 struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
42 struct fileIdentDesc *fi = NULL; 44 struct fileIdentDesc *fi = NULL;
43 struct fileIdentDesc cfi; 45 struct fileIdentDesc cfi;
44 int block, iblock; 46 int block, iblock;
45 loff_t nf_pos = (filp->f_pos - 1) << 2; 47 loff_t nf_pos;
46 int flen; 48 int flen;
47 unsigned char *fname = NULL; 49 unsigned char *fname = NULL;
48 unsigned char *nameptr; 50 unsigned char *nameptr;
@@ -54,10 +56,14 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
54 uint32_t elen; 56 uint32_t elen;
55 sector_t offset; 57 sector_t offset;
56 int i, num, ret = 0; 58 int i, num, ret = 0;
57 unsigned int dt_type;
58 struct extent_position epos = { NULL, 0, {0, 0} }; 59 struct extent_position epos = { NULL, 0, {0, 0} };
59 struct udf_inode_info *iinfo;
60 60
61 if (ctx->pos == 0) {
62 if (!dir_emit_dot(file, ctx))
63 return 0;
64 ctx->pos = 1;
65 }
66 nf_pos = (ctx->pos - 1) << 2;
61 if (nf_pos >= size) 67 if (nf_pos >= size)
62 goto out; 68 goto out;
63 69
@@ -71,7 +77,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
71 nf_pos = udf_ext0_offset(dir); 77 nf_pos = udf_ext0_offset(dir);
72 78
73 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); 79 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
74 iinfo = UDF_I(dir);
75 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 80 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
76 if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, 81 if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
77 &epos, &eloc, &elen, &offset) 82 &epos, &eloc, &elen, &offset)
@@ -116,7 +121,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
116 } 121 }
117 122
118 while (nf_pos < size) { 123 while (nf_pos < size) {
119 filp->f_pos = (nf_pos >> 2) + 1; 124 struct kernel_lb_addr tloc;
125
126 ctx->pos = (nf_pos >> 2) + 1;
120 127
121 fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, 128 fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
122 &elen, &offset); 129 &elen, &offset);
@@ -155,24 +162,22 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
155 } 162 }
156 163
157 if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { 164 if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) {
158 iblock = parent_ino(filp->f_path.dentry); 165 if (!dir_emit_dotdot(file, ctx))
159 flen = 2; 166 goto out;
160 memcpy(fname, "..", flen); 167 continue;
161 dt_type = DT_DIR;
162 } else {
163 struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
164
165 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
166 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
167 dt_type = DT_UNKNOWN;
168 } 168 }
169 169
170 if (flen && filldir(dirent, fname, flen, filp->f_pos, 170 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
171 iblock, dt_type) < 0) 171 if (!flen)
172 continue;
173
174 tloc = lelb_to_cpu(cfi.icb.extLocation);
175 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
176 if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
172 goto out; 177 goto out;
173 } /* end while */ 178 } /* end while */
174 179
175 filp->f_pos = (nf_pos >> 2) + 1; 180 ctx->pos = (nf_pos >> 2) + 1;
176 181
177out: 182out:
178 if (fibh.sbh != fibh.ebh) 183 if (fibh.sbh != fibh.ebh)
@@ -184,27 +189,11 @@ out:
184 return ret; 189 return ret;
185} 190}
186 191
187static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
188{
189 struct inode *dir = file_inode(filp);
190 int result;
191
192 if (filp->f_pos == 0) {
193 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
194 return 0;
195 }
196 filp->f_pos++;
197 }
198
199 result = do_udf_readdir(dir, filp, filldir, dirent);
200 return result;
201}
202
203/* readdir and lookup functions */ 192/* readdir and lookup functions */
204const struct file_operations udf_dir_operations = { 193const struct file_operations udf_dir_operations = {
205 .llseek = generic_file_llseek, 194 .llseek = generic_file_llseek,
206 .read = generic_read_dir, 195 .read = generic_read_dir,
207 .readdir = udf_readdir, 196 .iterate = udf_readdir,
208 .unlocked_ioctl = udf_ioctl, 197 .unlocked_ioctl = udf_ioctl,
209 .fsync = generic_file_fsync, 198 .fsync = generic_file_fsync,
210}; 199};
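udf_readdir() stores its byte offset in ctx->pos as (nf_pos >> 2) + 1 and recovers it with (ctx->pos - 1) << 2, reserving pos 0 for the "." entry; the two are inverses for the 4-byte-aligned offsets UDF uses. A standalone check:

#include <assert.h>

int main(void)
{
        for (long long nf_pos = 0; nf_pos < 4096; nf_pos += 4) {
                long long pos = (nf_pos >> 2) + 1;      /* stored in ctx->pos */

                assert(((pos - 1) << 2) == nf_pos);     /* recovered on entry */
                assert(pos >= 1);                       /* 0 reserved for "." */
        }
        return 0;
}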
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 102c072c6bbf..5f6fc17d6bc5 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -594,6 +594,29 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
594 return 0; 594 return 0;
595} 595}
596 596
597static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
598{
599 struct inode *inode;
600 struct udf_inode_info *iinfo;
601 int err;
602
603 inode = udf_new_inode(dir, mode, &err);
604 if (!inode)
605 return err;
606
607 iinfo = UDF_I(inode);
608 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
609 inode->i_data.a_ops = &udf_adinicb_aops;
610 else
611 inode->i_data.a_ops = &udf_aops;
612 inode->i_op = &udf_file_inode_operations;
613 inode->i_fop = &udf_file_operations;
614 mark_inode_dirty(inode);
615
616 d_tmpfile(dentry, inode);
617 return 0;
618}
619
597static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, 620static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
598 dev_t rdev) 621 dev_t rdev)
599{ 622{
@@ -1311,6 +1334,7 @@ const struct inode_operations udf_dir_inode_operations = {
1311 .rmdir = udf_rmdir, 1334 .rmdir = udf_rmdir,
1312 .mknod = udf_mknod, 1335 .mknod = udf_mknod,
1313 .rename = udf_rename, 1336 .rename = udf_rename,
1337 .tmpfile = udf_tmpfile,
1314}; 1338};
1315const struct inode_operations udf_symlink_inode_operations = { 1339const struct inode_operations udf_symlink_inode_operations = {
1316 .readlink = generic_readlink, 1340 .readlink = generic_readlink,
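The new ->tmpfile hook gives udf support for O_TMPFILE: udf_new_inode() allocates an inode with no directory entry and d_tmpfile() attaches it to an unhashed dentry. Userspace usage looks like the sketch below; it needs a kernel and filesystem with ->tmpfile support, and on anything older the open() simply fails.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);

        if (fd < 0) {
                perror("O_TMPFILE");    /* no kernel/fs support here */
                return 1;
        }
        if (write(fd, "scratch\n", 8) != 8)     /* data exists, name does not */
                perror("write");
        close(fd);                      /* nameless inode goes away */
        return 0;
}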
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 3a75ca09c506..0ecc2cebed8f 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -430,16 +430,16 @@ ufs_validate_entry(struct super_block *sb, char *base,
430 * This is blatantly stolen from ext2fs 430 * This is blatantly stolen from ext2fs
431 */ 431 */
432static int 432static int
433ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) 433ufs_readdir(struct file *file, struct dir_context *ctx)
434{ 434{
435 loff_t pos = filp->f_pos; 435 loff_t pos = ctx->pos;
436 struct inode *inode = file_inode(filp); 436 struct inode *inode = file_inode(file);
437 struct super_block *sb = inode->i_sb; 437 struct super_block *sb = inode->i_sb;
438 unsigned int offset = pos & ~PAGE_CACHE_MASK; 438 unsigned int offset = pos & ~PAGE_CACHE_MASK;
439 unsigned long n = pos >> PAGE_CACHE_SHIFT; 439 unsigned long n = pos >> PAGE_CACHE_SHIFT;
440 unsigned long npages = ufs_dir_pages(inode); 440 unsigned long npages = ufs_dir_pages(inode);
441 unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); 441 unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
442 int need_revalidate = filp->f_version != inode->i_version; 442 int need_revalidate = file->f_version != inode->i_version;
443 unsigned flags = UFS_SB(sb)->s_flags; 443 unsigned flags = UFS_SB(sb)->s_flags;
444 444
445 UFSD("BEGIN\n"); 445 UFSD("BEGIN\n");
@@ -457,16 +457,16 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
457 ufs_error(sb, __func__, 457 ufs_error(sb, __func__,
458 "bad page in #%lu", 458 "bad page in #%lu",
459 inode->i_ino); 459 inode->i_ino);
460 filp->f_pos += PAGE_CACHE_SIZE - offset; 460 ctx->pos += PAGE_CACHE_SIZE - offset;
461 return -EIO; 461 return -EIO;
462 } 462 }
463 kaddr = page_address(page); 463 kaddr = page_address(page);
464 if (unlikely(need_revalidate)) { 464 if (unlikely(need_revalidate)) {
465 if (offset) { 465 if (offset) {
466 offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); 466 offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
467 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; 467 ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
468 } 468 }
469 filp->f_version = inode->i_version; 469 file->f_version = inode->i_version;
470 need_revalidate = 0; 470 need_revalidate = 0;
471 } 471 }
472 de = (struct ufs_dir_entry *)(kaddr+offset); 472 de = (struct ufs_dir_entry *)(kaddr+offset);
@@ -479,11 +479,8 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
479 return -EIO; 479 return -EIO;
480 } 480 }
481 if (de->d_ino) { 481 if (de->d_ino) {
482 int over;
483 unsigned char d_type = DT_UNKNOWN; 482 unsigned char d_type = DT_UNKNOWN;
484 483
485 offset = (char *)de - kaddr;
486
487 UFSD("filldir(%s,%u)\n", de->d_name, 484 UFSD("filldir(%s,%u)\n", de->d_name,
488 fs32_to_cpu(sb, de->d_ino)); 485 fs32_to_cpu(sb, de->d_ino));
489 UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); 486 UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
@@ -491,16 +488,15 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
491 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) 488 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
492 d_type = de->d_u.d_44.d_type; 489 d_type = de->d_u.d_44.d_type;
493 490
494 over = filldir(dirent, de->d_name, 491 if (!dir_emit(ctx, de->d_name,
495 ufs_get_de_namlen(sb, de), 492 ufs_get_de_namlen(sb, de),
496 (n<<PAGE_CACHE_SHIFT) | offset, 493 fs32_to_cpu(sb, de->d_ino),
497 fs32_to_cpu(sb, de->d_ino), d_type); 494 d_type)) {
498 if (over) {
499 ufs_put_page(page); 495 ufs_put_page(page);
500 return 0; 496 return 0;
501 } 497 }
502 } 498 }
503 filp->f_pos += fs16_to_cpu(sb, de->d_reclen); 499 ctx->pos += fs16_to_cpu(sb, de->d_reclen);
504 } 500 }
505 ufs_put_page(page); 501 ufs_put_page(page);
506 } 502 }
@@ -660,7 +656,7 @@ not_empty:
660 656
661const struct file_operations ufs_dir_operations = { 657const struct file_operations ufs_dir_operations = {
662 .read = generic_read_dir, 658 .read = generic_read_dir,
663 .readdir = ufs_readdir, 659 .iterate = ufs_readdir,
664 .fsync = generic_file_fsync, 660 .fsync = generic_file_fsync,
665 .llseek = generic_file_llseek, 661 .llseek = generic_file_llseek,
666}; 662};
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 1d32f1d52763..306d883d89bc 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -21,6 +21,8 @@
21#include "xfs_bmap_btree.h" 21#include "xfs_bmap_btree.h"
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_sb.h"
25#include "xfs_mount.h"
24#include "xfs_trace.h" 26#include "xfs_trace.h"
25#include <linux/slab.h> 27#include <linux/slab.h>
26#include <linux/xattr.h> 28#include <linux/xattr.h>
@@ -34,7 +36,9 @@
34 */ 36 */
35 37
36STATIC struct posix_acl * 38STATIC struct posix_acl *
37xfs_acl_from_disk(struct xfs_acl *aclp) 39xfs_acl_from_disk(
40 struct xfs_acl *aclp,
41 int max_entries)
38{ 42{
39 struct posix_acl_entry *acl_e; 43 struct posix_acl_entry *acl_e;
40 struct posix_acl *acl; 44 struct posix_acl *acl;
@@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
42 unsigned int count, i; 46 unsigned int count, i;
43 47
44 count = be32_to_cpu(aclp->acl_cnt); 48 count = be32_to_cpu(aclp->acl_cnt);
45 if (count > XFS_ACL_MAX_ENTRIES) 49 if (count > max_entries)
46 return ERR_PTR(-EFSCORRUPTED); 50 return ERR_PTR(-EFSCORRUPTED);
47 51
48 acl = posix_acl_alloc(count, GFP_KERNEL); 52 acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type)
108 struct xfs_inode *ip = XFS_I(inode); 112 struct xfs_inode *ip = XFS_I(inode);
109 struct posix_acl *acl; 113 struct posix_acl *acl;
110 struct xfs_acl *xfs_acl; 114 struct xfs_acl *xfs_acl;
111 int len = sizeof(struct xfs_acl);
112 unsigned char *ea_name; 115 unsigned char *ea_name;
113 int error; 116 int error;
117 int len;
114 118
115 acl = get_cached_acl(inode, type); 119 acl = get_cached_acl(inode, type);
116 if (acl != ACL_NOT_CACHED) 120 if (acl != ACL_NOT_CACHED)
@@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type)
133 * If we have a cached ACL value just return it, no need to 137
134 * go out to the disk. 138 * go out to the disk.
135 */ 139 */
136 140 len = XFS_ACL_MAX_SIZE(ip->i_mount);
137 xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); 141 xfs_acl = kzalloc(len, GFP_KERNEL);
138 if (!xfs_acl) 142 if (!xfs_acl)
139 return ERR_PTR(-ENOMEM); 143 return ERR_PTR(-ENOMEM);
140 144
@@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type)
153 goto out; 157 goto out;
154 } 158 }
155 159
156 acl = xfs_acl_from_disk(xfs_acl); 160 acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
157 if (IS_ERR(acl)) 161 if (IS_ERR(acl))
158 goto out; 162 goto out;
159 163
@@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
189 193
190 if (acl) { 194 if (acl) {
191 struct xfs_acl *xfs_acl; 195 struct xfs_acl *xfs_acl;
192 int len; 196 int len = XFS_ACL_MAX_SIZE(ip->i_mount);
193 197
194 xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); 198 xfs_acl = kzalloc(len, GFP_KERNEL);
195 if (!xfs_acl) 199 if (!xfs_acl)
196 return -ENOMEM; 200 return -ENOMEM;
197 201
198 xfs_acl_to_disk(xfs_acl, acl); 202 xfs_acl_to_disk(xfs_acl, acl);
199 len = sizeof(struct xfs_acl) - 203
200 (sizeof(struct xfs_acl_entry) * 204 /* subtract away the unused acl entries */
201 (XFS_ACL_MAX_ENTRIES - acl->a_count)); 205 len -= sizeof(struct xfs_acl_entry) *
206 (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
202 207
203 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, 208 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
204 len, ATTR_ROOT); 209 len, ATTR_ROOT);
@@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
243static int 248static int
244xfs_acl_exists(struct inode *inode, unsigned char *name) 249xfs_acl_exists(struct inode *inode, unsigned char *name)
245{ 250{
246 int len = sizeof(struct xfs_acl); 251 int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
247 252
248 return (xfs_attr_get(XFS_I(inode), name, NULL, &len, 253 return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
249 ATTR_ROOT|ATTR_KERNOVAL) == 0); 254 ATTR_ROOT|ATTR_KERNOVAL) == 0);
@@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
379 goto out_release; 384 goto out_release;
380 385
381 error = -EINVAL; 386 error = -EINVAL;
382 if (acl->a_count > XFS_ACL_MAX_ENTRIES) 387 if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
383 goto out_release; 388 goto out_release;
384 389
385 if (type == ACL_TYPE_ACCESS) { 390 if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 39632d941354..4016a567b83c 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,19 +22,36 @@ struct inode;
22struct posix_acl; 22struct posix_acl;
23struct xfs_inode; 23struct xfs_inode;
24 24
25#define XFS_ACL_MAX_ENTRIES 25
26#define XFS_ACL_NOT_PRESENT (-1) 25#define XFS_ACL_NOT_PRESENT (-1)
27 26
28/* On-disk XFS access control list structure */ 27/* On-disk XFS access control list structure */
28struct xfs_acl_entry {
29 __be32 ae_tag;
30 __be32 ae_id;
31 __be16 ae_perm;
32 __be16 ae_pad; /* fill the implicit hole in the structure */
33};
34
29struct xfs_acl { 35struct xfs_acl {
30 __be32 acl_cnt; 36 __be32 acl_cnt;
31 struct xfs_acl_entry { 37 struct xfs_acl_entry acl_entry[0];
32 __be32 ae_tag;
33 __be32 ae_id;
34 __be16 ae_perm;
35 } acl_entry[XFS_ACL_MAX_ENTRIES];
36}; 38};
37 39
40/*
41 * The number of ACL entries allowed is defined by the on-disk format.
42 * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
43 * limited only by the maximum size of the xattr that stores the information.
44 */
45#define XFS_ACL_MAX_ENTRIES(mp) \
46 (xfs_sb_version_hascrc(&mp->m_sb) \
47 ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
48 sizeof(struct xfs_acl_entry) \
49 : 25)
50
51#define XFS_ACL_MAX_SIZE(mp) \
52 (sizeof(struct xfs_acl) + \
53 sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
54
38/* On-disk XFS extended attribute names */ 55/* On-disk XFS extended attribute names */
39#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" 56#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
40#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" 57#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
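With the entry count now geometry-dependent, it is worth checking the numbers the new macros produce: a v4 superblock stays at 25 entries, while a v5 superblock allows as many 12-byte entries as fit in a maximum-sized xattr. Standalone arithmetic with the on-disk layouts copied in (XATTR_SIZE_MAX is the usual 65536):

#include <stdint.h>
#include <stdio.h>

#define XATTR_SIZE_MAX 65536            /* linux/limits.h value */

struct xfs_acl_entry {
        uint32_t ae_tag;
        uint32_t ae_id;
        uint16_t ae_perm;
        uint16_t ae_pad;                /* the explicit pad added above */
};

struct xfs_acl {
        uint32_t acl_cnt;
        struct xfs_acl_entry acl_entry[];
};

int main(void)
{
        int v5 = (XATTR_SIZE_MAX - (int)sizeof(struct xfs_acl)) /
                 (int)sizeof(struct xfs_acl_entry);

        printf("entry size : %zu bytes\n", sizeof(struct xfs_acl_entry));
        printf("v4 entries : 25\n");
        printf("v5 entries : %d\n", v5);        /* (65536 - 4) / 12 = 5461 */
        return 0;
}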
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2b2691b73428..596ec71da00e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -725,6 +725,25 @@ xfs_convert_page(
725 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, 725 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
726 i_size_read(inode)); 726 i_size_read(inode));
727 727
728 /*
729 * If the current map does not span the entire page we are about to try
730 * to write, then give up. The only way we can write a page that spans
731 * multiple mappings in a single writeback iteration is via the
732 * xfs_vm_writepage() function. Data integrity writeback requires the
733 * entire page to be written in a single attempt, otherwise the part of
734 * the page we don't write here doesn't get written as part of the data
735 * integrity sync.
736 *
737 * For normal writeback, we also don't attempt to write partial pages
738 * here as it simply means that write_cache_pages() will see it under
739 * writeback and ignore the page until some point in the future, at
740 * which time this will be the only page in the file that needs
741 * writeback. Hence for more optimal IO patterns, we should always
742 * avoid partial page writeback due to multiple mappings on a page here.
743 */
744 if (!xfs_imap_valid(inode, imap, end_offset))
745 goto fail_unlock_page;
746
728 len = 1 << inode->i_blkbits; 747 len = 1 << inode->i_blkbits;
729 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), 748 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
730 PAGE_CACHE_SIZE); 749 PAGE_CACHE_SIZE);
@@ -824,10 +843,12 @@ xfs_cluster_write(
824STATIC void 843STATIC void
825xfs_vm_invalidatepage( 844xfs_vm_invalidatepage(
826 struct page *page, 845 struct page *page,
827 unsigned long offset) 846 unsigned int offset,
847 unsigned int length)
828{ 848{
829 trace_xfs_invalidatepage(page->mapping->host, page, offset); 849 trace_xfs_invalidatepage(page->mapping->host, page, offset,
830 block_invalidatepage(page, offset); 850 length);
851 block_invalidatepage(page, offset, length);
831} 852}
832 853
833/* 854/*
@@ -891,7 +912,7 @@ next_buffer:
891 912
892 xfs_iunlock(ip, XFS_ILOCK_EXCL); 913 xfs_iunlock(ip, XFS_ILOCK_EXCL);
893out_invalidate: 914out_invalidate:
894 xfs_vm_invalidatepage(page, 0); 915 xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
895 return; 916 return;
896} 917}
897 918
@@ -921,7 +942,7 @@ xfs_vm_writepage(
921 int count = 0; 942 int count = 0;
922 int nonblocking = 0; 943 int nonblocking = 0;
923 944
924 trace_xfs_writepage(inode, page, 0); 945 trace_xfs_writepage(inode, page, 0, 0);
925 946
926 ASSERT(page_has_buffers(page)); 947 ASSERT(page_has_buffers(page));
927 948
@@ -1152,7 +1173,7 @@ xfs_vm_releasepage(
1152{ 1173{
1153 int delalloc, unwritten; 1174 int delalloc, unwritten;
1154 1175
1155 trace_xfs_releasepage(page->mapping->host, page, 0); 1176 trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1156 1177
1157 xfs_count_page_state(page, &delalloc, &unwritten); 1178 xfs_count_page_state(page, &delalloc, &unwritten);
1158 1179
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 08d5457c948e..31d3cd129269 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -931,20 +931,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
931 */ 931 */
932int 932int
933xfs_attr_shortform_allfit( 933xfs_attr_shortform_allfit(
934 struct xfs_buf *bp, 934 struct xfs_buf *bp,
935 struct xfs_inode *dp) 935 struct xfs_inode *dp)
936{ 936{
937 xfs_attr_leafblock_t *leaf; 937 struct xfs_attr_leafblock *leaf;
938 xfs_attr_leaf_entry_t *entry; 938 struct xfs_attr_leaf_entry *entry;
939 xfs_attr_leaf_name_local_t *name_loc; 939 xfs_attr_leaf_name_local_t *name_loc;
940 int bytes, i; 940 struct xfs_attr3_icleaf_hdr leafhdr;
941 int bytes;
942 int i;
941 943
942 leaf = bp->b_addr; 944 leaf = bp->b_addr;
943 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 945 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
946 entry = xfs_attr3_leaf_entryp(leaf);
944 947
945 entry = &leaf->entries[0];
946 bytes = sizeof(struct xfs_attr_sf_hdr); 948 bytes = sizeof(struct xfs_attr_sf_hdr);
947 for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { 949 for (i = 0; i < leafhdr.count; entry++, i++) {
948 if (entry->flags & XFS_ATTR_INCOMPLETE) 950 if (entry->flags & XFS_ATTR_INCOMPLETE)
949 continue; /* don't copy partial entries */ 951 continue; /* don't copy partial entries */
950 if (!(entry->flags & XFS_ATTR_LOCAL)) 952 if (!(entry->flags & XFS_ATTR_LOCAL))
@@ -954,15 +956,15 @@ xfs_attr_shortform_allfit(
954 return(0); 956 return(0);
955 if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) 957 if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
956 return(0); 958 return(0);
957 bytes += sizeof(struct xfs_attr_sf_entry)-1 959 bytes += sizeof(struct xfs_attr_sf_entry) - 1
958 + name_loc->namelen 960 + name_loc->namelen
959 + be16_to_cpu(name_loc->valuelen); 961 + be16_to_cpu(name_loc->valuelen);
960 } 962 }
961 if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && 963 if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
962 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && 964 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
963 (bytes == sizeof(struct xfs_attr_sf_hdr))) 965 (bytes == sizeof(struct xfs_attr_sf_hdr)))
964 return(-1); 966 return -1;
965 return(xfs_attr_shortform_bytesfit(dp, bytes)); 967 return xfs_attr_shortform_bytesfit(dp, bytes);
966} 968}
967 969
968/* 970/*
@@ -1410,7 +1412,7 @@ xfs_attr3_leaf_add_work(
1410 name_rmt->valuelen = 0; 1412 name_rmt->valuelen = 0;
1411 name_rmt->valueblk = 0; 1413 name_rmt->valueblk = 0;
1412 args->rmtblkno = 1; 1414 args->rmtblkno = 1;
1413 args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); 1415 args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
1414 } 1416 }
1415 xfs_trans_log_buf(args->trans, bp, 1417 xfs_trans_log_buf(args->trans, bp,
1416 XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), 1418 XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
@@ -1443,11 +1445,12 @@ xfs_attr3_leaf_add_work(
1443STATIC void 1445STATIC void
1444xfs_attr3_leaf_compact( 1446xfs_attr3_leaf_compact(
1445 struct xfs_da_args *args, 1447 struct xfs_da_args *args,
1446 struct xfs_attr3_icleaf_hdr *ichdr_d, 1448 struct xfs_attr3_icleaf_hdr *ichdr_dst,
1447 struct xfs_buf *bp) 1449 struct xfs_buf *bp)
1448{ 1450{
1449 xfs_attr_leafblock_t *leaf_s, *leaf_d; 1451 struct xfs_attr_leafblock *leaf_src;
1450 struct xfs_attr3_icleaf_hdr ichdr_s; 1452 struct xfs_attr_leafblock *leaf_dst;
1453 struct xfs_attr3_icleaf_hdr ichdr_src;
1451 struct xfs_trans *trans = args->trans; 1454 struct xfs_trans *trans = args->trans;
1452 struct xfs_mount *mp = trans->t_mountp; 1455 struct xfs_mount *mp = trans->t_mountp;
1453 char *tmpbuffer; 1456 char *tmpbuffer;
@@ -1455,29 +1458,38 @@ xfs_attr3_leaf_compact(
1455 trace_xfs_attr_leaf_compact(args); 1458 trace_xfs_attr_leaf_compact(args);
1456 1459
1457 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); 1460 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
1458 ASSERT(tmpbuffer != NULL);
1459 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); 1461 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
1460 memset(bp->b_addr, 0, XFS_LBSIZE(mp)); 1462 memset(bp->b_addr, 0, XFS_LBSIZE(mp));
1463 leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
1464 leaf_dst = bp->b_addr;
1461 1465
1462 /* 1466 /*
1463 * Copy basic information 1467 * Copy the on-disk header back into the destination buffer to ensure
1468 * all the information in the header that is not part of the incore
1469 * header structure is preserved.
1464 */ 1470 */
1465 leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; 1471 memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
1466 leaf_d = bp->b_addr; 1472
1467 ichdr_s = *ichdr_d; /* struct copy */ 1473 /* Initialise the incore headers */
1468 ichdr_d->firstused = XFS_LBSIZE(mp); 1474 ichdr_src = *ichdr_dst; /* struct copy */
1469 ichdr_d->usedbytes = 0; 1475 ichdr_dst->firstused = XFS_LBSIZE(mp);
1470 ichdr_d->count = 0; 1476 ichdr_dst->usedbytes = 0;
1471 ichdr_d->holes = 0; 1477 ichdr_dst->count = 0;
1472 ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s); 1478 ichdr_dst->holes = 0;
1473 ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; 1479 ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
1480 ichdr_dst->freemap[0].size = ichdr_dst->firstused -
1481 ichdr_dst->freemap[0].base;
1482
1483
1484 /* write the header back to initialise the underlying buffer */
1485 xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
1474 1486
1475 /* 1487 /*
1476 * Copy all entries in the same (sorted) order, 1488
1477 * but allocate name/value pairs packed and in sequence. 1489 * but allocate name/value pairs packed and in sequence.
1478 */ 1490 */
1479 xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0, 1491 xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0,
1480 ichdr_s.count, mp); 1492 ichdr_src.count, mp);
1481 /* 1493 /*
1482 * this logs the entire buffer, but the caller must write the header 1494 * this logs the entire buffer, but the caller must write the header
1483 * back to the buffer when it is finished modifying it. 1495 * back to the buffer when it is finished modifying it.
@@ -2179,14 +2191,24 @@ xfs_attr3_leaf_unbalance(
2179 struct xfs_attr_leafblock *tmp_leaf; 2191 struct xfs_attr_leafblock *tmp_leaf;
2180 struct xfs_attr3_icleaf_hdr tmphdr; 2192 struct xfs_attr3_icleaf_hdr tmphdr;
2181 2193
2182 tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP); 2194 tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP);
2183 memset(tmp_leaf, 0, state->blocksize); 2195
2184 memset(&tmphdr, 0, sizeof(tmphdr)); 2196 /*
2197 * Copy the header into the temp leaf so that all the stuff
2198 * not in the incore header is present and gets copied back in
2199 * once we've moved all the entries.
2200 */
2201 memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
2185 2202
2203 memset(&tmphdr, 0, sizeof(tmphdr));
2186 tmphdr.magic = savehdr.magic; 2204 tmphdr.magic = savehdr.magic;
2187 tmphdr.forw = savehdr.forw; 2205 tmphdr.forw = savehdr.forw;
2188 tmphdr.back = savehdr.back; 2206 tmphdr.back = savehdr.back;
2189 tmphdr.firstused = state->blocksize; 2207 tmphdr.firstused = state->blocksize;
2208
2209 /* write the header to the temp buffer to initialise it */
2210 xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
2211
2190 if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, 2212 if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
2191 drop_blk->bp, &drophdr)) { 2213 drop_blk->bp, &drophdr)) {
2192 xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, 2214 xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
@@ -2330,9 +2352,11 @@ xfs_attr3_leaf_lookup_int(
2330 if (!xfs_attr_namesp_match(args->flags, entry->flags)) 2352 if (!xfs_attr_namesp_match(args->flags, entry->flags))
2331 continue; 2353 continue;
2332 args->index = probe; 2354 args->index = probe;
2355 args->valuelen = be32_to_cpu(name_rmt->valuelen);
2333 args->rmtblkno = be32_to_cpu(name_rmt->valueblk); 2356 args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
2334 args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, 2357 args->rmtblkcnt = xfs_attr3_rmt_blocks(
2335 be32_to_cpu(name_rmt->valuelen)); 2358 args->dp->i_mount,
2359 args->valuelen);
2336 return XFS_ERROR(EEXIST); 2360 return XFS_ERROR(EEXIST);
2337 } 2361 }
2338 } 2362 }
@@ -2383,7 +2407,8 @@ xfs_attr3_leaf_getvalue(
2383 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); 2407 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
2384 valuelen = be32_to_cpu(name_rmt->valuelen); 2408 valuelen = be32_to_cpu(name_rmt->valuelen);
2385 args->rmtblkno = be32_to_cpu(name_rmt->valueblk); 2409 args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
2386 args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); 2410 args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
2411 valuelen);
2387 if (args->flags & ATTR_KERNOVAL) { 2412 if (args->flags & ATTR_KERNOVAL) {
2388 args->valuelen = valuelen; 2413 args->valuelen = valuelen;
2389 return 0; 2414 return 0;
@@ -2709,7 +2734,8 @@ xfs_attr3_leaf_list_int(
2709 args.valuelen = valuelen; 2734 args.valuelen = valuelen;
2710 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); 2735 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2711 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2736 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2712 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2737 args.rmtblkcnt = xfs_attr3_rmt_blocks(
2738 args.dp->i_mount, valuelen);
2713 retval = xfs_attr_rmtval_get(&args); 2739 retval = xfs_attr_rmtval_get(&args);
2714 if (retval) 2740 if (retval)
2715 return retval; 2741 return retval;
@@ -3232,7 +3258,7 @@ xfs_attr3_leaf_inactive(
3232 name_rmt = xfs_attr3_leaf_name_remote(leaf, i); 3258 name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
3233 if (name_rmt->valueblk) { 3259 if (name_rmt->valueblk) {
3234 lp->valueblk = be32_to_cpu(name_rmt->valueblk); 3260 lp->valueblk = be32_to_cpu(name_rmt->valueblk);
3235 lp->valuelen = XFS_B_TO_FSB(dp->i_mount, 3261 lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
3236 be32_to_cpu(name_rmt->valuelen)); 3262 be32_to_cpu(name_rmt->valuelen));
3237 lp++; 3263 lp++;
3238 } 3264 }
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index f9d7846097e2..444a7704596c 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -128,6 +128,7 @@ struct xfs_attr3_leaf_hdr {
128 __u8 holes; 128 __u8 holes;
129 __u8 pad1; 129 __u8 pad1;
130 struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; 130 struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
131 __be32 pad2; /* 64 bit alignment */
131}; 132};
132 133
133#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) 134#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
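xfs_attr3_rmt_blocks(), made non-static in the next hunk, converts an attribute length to a block count; on CRC-enabled filesystems every remote block loses space to a header, so the conversion becomes a ceiling division over the usable bytes per block rather than a plain byte-to-FSB shift. Illustrative numbers, not real XFS geometry:

#include <assert.h>

int main(void)
{
        int blocksize = 4096;
        int hdr = 56;                   /* stand-in remote-header size */
        int usable = blocksize - hdr;   /* XFS_ATTR3_RMT_BUF_SPACE analog */
        int attrlen = 10000;

        int blocks = (attrlen + usable - 1) / usable;   /* ceiling division */
        assert(blocks == 3);            /* 10000 over 4040-byte blocks */
        return 0;
}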
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index dee84466dcc9..ef6b0c124528 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -47,22 +47,55 @@
47 * Each contiguous block has a header, so it is not just a simple attribute 47 * Each contiguous block has a header, so it is not just a simple attribute
48 * length to FSB conversion. 48 * length to FSB conversion.
49 */ 49 */
50static int 50int
51xfs_attr3_rmt_blocks( 51xfs_attr3_rmt_blocks(
52 struct xfs_mount *mp, 52 struct xfs_mount *mp,
53 int attrlen) 53 int attrlen)
54{ 54{
55 int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, 55 if (xfs_sb_version_hascrc(&mp->m_sb)) {
56 mp->m_sb.sb_blocksize); 56 int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
57 return (attrlen + buflen - 1) / buflen; 57 return (attrlen + buflen - 1) / buflen;
58 }
59 return XFS_B_TO_FSB(mp, attrlen);
60}
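
The new xfs_attr3_rmt_blocks() above is the heart of the format change: on a CRC (v5) filesystem every remote-attribute block loses sizeof(struct xfs_attr3_rmt_hdr) bytes to its header, so the conversion becomes a ceiling division over the reduced per-block payload, while pre-v5 filesystems keep the plain byte-to-FSB conversion. A minimal userspace sketch of the same arithmetic, assuming a 4096-byte block and a 56-byte header (both sizes are illustrative, not taken from this diff):

    #include <stdio.h>

    #define BLOCK_SIZE   4096  /* assumed filesystem block size */
    #define RMT_HDR_SIZE 56    /* assumed sizeof(struct xfs_attr3_rmt_hdr) */

    /* mirrors the shape of xfs_attr3_rmt_blocks() */
    static int rmt_blocks(int attrlen, int has_crc)
    {
        if (has_crc) {
            int buflen = BLOCK_SIZE - RMT_HDR_SIZE; /* usable payload */
            return (attrlen + buflen - 1) / buflen; /* round up */
        }
        return (attrlen + BLOCK_SIZE - 1) / BLOCK_SIZE;
    }

    int main(void)
    {
        /* a 64KiB value needs 17 headered blocks but only 16 plain ones */
        printf("crc:    %d blocks\n", rmt_blocks(65536, 1));
        printf("no crc: %d blocks\n", rmt_blocks(65536, 0));
        return 0;
    }
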
61
62/*
63 * Checking of the remote attribute header is split into two parts. The verifier
 64 * does CRC, location and bounds checking; the unpacking function checks the
65 * attribute parameters and owner.
66 */
67static bool
68xfs_attr3_rmt_hdr_ok(
69 struct xfs_mount *mp,
70 void *ptr,
71 xfs_ino_t ino,
72 uint32_t offset,
73 uint32_t size,
74 xfs_daddr_t bno)
75{
76 struct xfs_attr3_rmt_hdr *rmt = ptr;
77
78 if (bno != be64_to_cpu(rmt->rm_blkno))
79 return false;
80 if (offset != be32_to_cpu(rmt->rm_offset))
81 return false;
82 if (size != be32_to_cpu(rmt->rm_bytes))
83 return false;
84 if (ino != be64_to_cpu(rmt->rm_owner))
85 return false;
86
87 /* ok */
88 return true;
58} 89}
59 90
60static bool 91static bool
61xfs_attr3_rmt_verify( 92xfs_attr3_rmt_verify(
62 struct xfs_buf *bp) 93 struct xfs_mount *mp,
94 void *ptr,
95 int fsbsize,
96 xfs_daddr_t bno)
63{ 97{
64 struct xfs_mount *mp = bp->b_target->bt_mount; 98 struct xfs_attr3_rmt_hdr *rmt = ptr;
65 struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
66 99
67 if (!xfs_sb_version_hascrc(&mp->m_sb)) 100 if (!xfs_sb_version_hascrc(&mp->m_sb))
68 return false; 101 return false;
@@ -70,7 +103,9 @@ xfs_attr3_rmt_verify(
70 return false; 103 return false;
71 if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) 104 if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
72 return false; 105 return false;
73 if (bp->b_bn != be64_to_cpu(rmt->rm_blkno)) 106 if (be64_to_cpu(rmt->rm_blkno) != bno)
107 return false;
108 if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
74 return false; 109 return false;
75 if (be32_to_cpu(rmt->rm_offset) + 110 if (be32_to_cpu(rmt->rm_offset) +
76 be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) 111 be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
@@ -86,17 +121,40 @@ xfs_attr3_rmt_read_verify(
86 struct xfs_buf *bp) 121 struct xfs_buf *bp)
87{ 122{
88 struct xfs_mount *mp = bp->b_target->bt_mount; 123 struct xfs_mount *mp = bp->b_target->bt_mount;
124 char *ptr;
125 int len;
126 bool corrupt = false;
127 xfs_daddr_t bno;
89 128
90 /* no verification of non-crc buffers */ 129 /* no verification of non-crc buffers */
91 if (!xfs_sb_version_hascrc(&mp->m_sb)) 130 if (!xfs_sb_version_hascrc(&mp->m_sb))
92 return; 131 return;
93 132
94 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 133 ptr = bp->b_addr;
95 XFS_ATTR3_RMT_CRC_OFF) || 134 bno = bp->b_bn;
96 !xfs_attr3_rmt_verify(bp)) { 135 len = BBTOB(bp->b_length);
136 ASSERT(len >= XFS_LBSIZE(mp));
137
138 while (len > 0) {
139 if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
140 XFS_ATTR3_RMT_CRC_OFF)) {
141 corrupt = true;
142 break;
143 }
144 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
145 corrupt = true;
146 break;
147 }
148 len -= XFS_LBSIZE(mp);
149 ptr += XFS_LBSIZE(mp);
150 bno += mp->m_bsize;
151 }
152
153 if (corrupt) {
97 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 154 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
98 xfs_buf_ioerror(bp, EFSCORRUPTED); 155 xfs_buf_ioerror(bp, EFSCORRUPTED);
99 } 156 } else
157 ASSERT(len == 0);
100} 158}
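
The read verifier above no longer checksums the whole I/O in one pass: each filesystem block in the buffer now carries its own CRC and self-describing header, so the buffer is walked in XFS_LBSIZE(mp) steps and the first bad block fails the lot. A self-contained sketch of that walk, with a toy magic-number check standing in for the xfs_verify_cksum()/xfs_attr3_rmt_verify() pair:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define BSIZE 512          /* toy block size */
    #define MAGIC 0x5841524du  /* "XARM", matching the diff */

    struct hdr { uint32_t magic; uint64_t blkno; };

    /* stand-in for the per-block CRC + header verification */
    static bool chunk_ok(const char *p, uint64_t bno)
    {
        struct hdr h;
        memcpy(&h, p, sizeof(h));
        return h.magic == MAGIC && h.blkno == bno;
    }

    /* walk a multi-block buffer one block at a time, as the new
     * xfs_attr3_rmt_read_verify() does */
    static bool verify_blocks(const char *buf, size_t len, uint64_t bno)
    {
        while (len > 0) {
            if (!chunk_ok(buf, bno))
                return false; /* caller flags EFSCORRUPTED */
            len -= BSIZE;
            buf += BSIZE;
            bno++; /* toy step; the kernel advances by mp->m_bsize daddrs */
        }
        return true;
    }

    int main(void)
    {
        char buf[2 * BSIZE] = { 0 };
        struct hdr h0 = { MAGIC, 100 }, h1 = { MAGIC, 101 };

        memcpy(buf, &h0, sizeof(h0));
        memcpy(buf + BSIZE, &h1, sizeof(h1));
        printf("%s\n", verify_blocks(buf, sizeof(buf), 100) ? "ok" : "bad");
        return 0;
    }
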
101 159
102static void 160static void
@@ -105,23 +163,39 @@ xfs_attr3_rmt_write_verify(
105{ 163{
106 struct xfs_mount *mp = bp->b_target->bt_mount; 164 struct xfs_mount *mp = bp->b_target->bt_mount;
107 struct xfs_buf_log_item *bip = bp->b_fspriv; 165 struct xfs_buf_log_item *bip = bp->b_fspriv;
166 char *ptr;
167 int len;
168 xfs_daddr_t bno;
108 169
109 /* no verification of non-crc buffers */ 170 /* no verification of non-crc buffers */
110 if (!xfs_sb_version_hascrc(&mp->m_sb)) 171 if (!xfs_sb_version_hascrc(&mp->m_sb))
111 return; 172 return;
112 173
113 if (!xfs_attr3_rmt_verify(bp)) { 174 ptr = bp->b_addr;
114 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 175 bno = bp->b_bn;
115 xfs_buf_ioerror(bp, EFSCORRUPTED); 176 len = BBTOB(bp->b_length);
116 return; 177 ASSERT(len >= XFS_LBSIZE(mp));
117 } 178
179 while (len > 0) {
180 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
181 XFS_CORRUPTION_ERROR(__func__,
182 XFS_ERRLEVEL_LOW, mp, bp->b_addr);
183 xfs_buf_ioerror(bp, EFSCORRUPTED);
184 return;
185 }
186 if (bip) {
187 struct xfs_attr3_rmt_hdr *rmt;
188
189 rmt = (struct xfs_attr3_rmt_hdr *)ptr;
190 rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
191 }
192 xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF);
118 193
119 if (bip) { 194 len -= XFS_LBSIZE(mp);
120 struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; 195 ptr += XFS_LBSIZE(mp);
121 rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); 196 bno += mp->m_bsize;
122 } 197 }
123 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 198 ASSERT(len == 0);
124 XFS_ATTR3_RMT_CRC_OFF);
125} 199}
126 200
127const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { 201const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
@@ -129,15 +203,16 @@ const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
129 .verify_write = xfs_attr3_rmt_write_verify, 203 .verify_write = xfs_attr3_rmt_write_verify,
130}; 204};
131 205
132static int 206STATIC int
133xfs_attr3_rmt_hdr_set( 207xfs_attr3_rmt_hdr_set(
134 struct xfs_mount *mp, 208 struct xfs_mount *mp,
209 void *ptr,
135 xfs_ino_t ino, 210 xfs_ino_t ino,
136 uint32_t offset, 211 uint32_t offset,
137 uint32_t size, 212 uint32_t size,
138 struct xfs_buf *bp) 213 xfs_daddr_t bno)
139{ 214{
140 struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; 215 struct xfs_attr3_rmt_hdr *rmt = ptr;
141 216
142 if (!xfs_sb_version_hascrc(&mp->m_sb)) 217 if (!xfs_sb_version_hascrc(&mp->m_sb))
143 return 0; 218 return 0;
@@ -147,36 +222,107 @@ xfs_attr3_rmt_hdr_set(
147 rmt->rm_bytes = cpu_to_be32(size); 222 rmt->rm_bytes = cpu_to_be32(size);
148 uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); 223 uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
149 rmt->rm_owner = cpu_to_be64(ino); 224 rmt->rm_owner = cpu_to_be64(ino);
150 rmt->rm_blkno = cpu_to_be64(bp->b_bn); 225 rmt->rm_blkno = cpu_to_be64(bno);
151 bp->b_ops = &xfs_attr3_rmt_buf_ops;
152 226
153 return sizeof(struct xfs_attr3_rmt_hdr); 227 return sizeof(struct xfs_attr3_rmt_hdr);
154} 228}
155 229
156/* 230/*
 157 * Checking of the remote attribute header is split into two parts. the verifier 231 * Helper functions to copy attribute data in and out of the on-disk extents
158 * does CRC, location and bounds checking, the unpacking function checks the
159 * attribute parameters and owner.
160 */ 232 */
161static bool 233STATIC int
162xfs_attr3_rmt_hdr_ok( 234xfs_attr_rmtval_copyout(
163 struct xfs_mount *mp, 235 struct xfs_mount *mp,
164 xfs_ino_t ino, 236 struct xfs_buf *bp,
165 uint32_t offset, 237 xfs_ino_t ino,
166 uint32_t size, 238 int *offset,
167 struct xfs_buf *bp) 239 int *valuelen,
240 char **dst)
168{ 241{
169 struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; 242 char *src = bp->b_addr;
243 xfs_daddr_t bno = bp->b_bn;
244 int len = BBTOB(bp->b_length);
170 245
171 if (offset != be32_to_cpu(rmt->rm_offset)) 246 ASSERT(len >= XFS_LBSIZE(mp));
172 return false;
173 if (size != be32_to_cpu(rmt->rm_bytes))
174 return false;
175 if (ino != be64_to_cpu(rmt->rm_owner))
176 return false;
177 247
178 /* ok */ 248 while (len > 0 && *valuelen > 0) {
179 return true; 249 int hdr_size = 0;
250 int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
251
252 byte_cnt = min_t(int, *valuelen, byte_cnt);
253
254 if (xfs_sb_version_hascrc(&mp->m_sb)) {
255 if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
256 byte_cnt, bno)) {
257 xfs_alert(mp,
258"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
259 bno, *offset, byte_cnt, ino);
260 return EFSCORRUPTED;
261 }
262 hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
263 }
264
265 memcpy(*dst, src + hdr_size, byte_cnt);
266
267 /* roll buffer forwards */
268 len -= XFS_LBSIZE(mp);
269 src += XFS_LBSIZE(mp);
270 bno += mp->m_bsize;
271
272 /* roll attribute data forwards */
273 *valuelen -= byte_cnt;
274 *dst += byte_cnt;
275 *offset += byte_cnt;
276 }
277 return 0;
278}
279
280STATIC void
281xfs_attr_rmtval_copyin(
282 struct xfs_mount *mp,
283 struct xfs_buf *bp,
284 xfs_ino_t ino,
285 int *offset,
286 int *valuelen,
287 char **src)
288{
289 char *dst = bp->b_addr;
290 xfs_daddr_t bno = bp->b_bn;
291 int len = BBTOB(bp->b_length);
292
293 ASSERT(len >= XFS_LBSIZE(mp));
294
295 while (len > 0 && *valuelen > 0) {
296 int hdr_size;
297 int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
298
299 byte_cnt = min(*valuelen, byte_cnt);
300 hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
301 byte_cnt, bno);
302
303 memcpy(dst + hdr_size, *src, byte_cnt);
304
305 /*
306 * If this is the last block, zero the remainder of it.
307 * Check that we are actually the last block, too.
308 */
309 if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) {
310 ASSERT(*valuelen - byte_cnt == 0);
311 ASSERT(len == XFS_LBSIZE(mp));
312 memset(dst + hdr_size + byte_cnt, 0,
313 XFS_LBSIZE(mp) - hdr_size - byte_cnt);
314 }
315
316 /* roll buffer forwards */
317 len -= XFS_LBSIZE(mp);
318 dst += XFS_LBSIZE(mp);
319 bno += mp->m_bsize;
320
321 /* roll attribute data forwards */
322 *valuelen -= byte_cnt;
323 *src += byte_cnt;
324 *offset += byte_cnt;
325 }
180} 326}
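
Both helpers advance two cursors in lock-step: the buffer cursor (len, src/dst, bno) moves a whole filesystem block per pass, while the value cursor (*valuelen, *offset) moves only the per-block payload that remains after the header. A toy, self-contained version of the copy-out direction, with block and header sizes invented for the sketch:

    #include <stdio.h>
    #include <string.h>

    #define BSIZE 16  /* toy filesystem block */
    #define HDR   4   /* toy per-block header */

    static int min_int(int a, int b) { return a < b ? a : b; }

    /* shape of xfs_attr_rmtval_copyout(): skip each block's header,
     * gather the payloads into one contiguous value */
    static void copyout(const char *src, int len, char *dst, int valuelen)
    {
        while (len > 0 && valuelen > 0) {
            int byte_cnt = min_int(valuelen, BSIZE - HDR);

            memcpy(dst, src + HDR, byte_cnt);
            src += BSIZE;       /* buffer cursor: whole blocks */
            len -= BSIZE;
            dst += byte_cnt;    /* value cursor: payload only */
            valuelen -= byte_cnt;
        }
    }

    int main(void)
    {
        /* two blocks, each a 4-byte "HHHH" header plus 12 payload bytes */
        char src[2 * BSIZE] = "HHHHabcdefghijklHHHHmnopqrstuvwx";
        char dst[25] = { 0 };

        copyout(src, (int)sizeof(src), dst, 24);
        printf("%s\n", dst); /* abcdefghijklmnopqrstuvwx */
        return 0;
    }
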
181 327
182/* 328/*
@@ -190,13 +336,12 @@ xfs_attr_rmtval_get(
190 struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; 336 struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE];
191 struct xfs_mount *mp = args->dp->i_mount; 337 struct xfs_mount *mp = args->dp->i_mount;
192 struct xfs_buf *bp; 338 struct xfs_buf *bp;
193 xfs_daddr_t dblkno;
194 xfs_dablk_t lblkno = args->rmtblkno; 339 xfs_dablk_t lblkno = args->rmtblkno;
195 void *dst = args->value; 340 char *dst = args->value;
196 int valuelen = args->valuelen; 341 int valuelen = args->valuelen;
197 int nmap; 342 int nmap;
198 int error; 343 int error;
199 int blkcnt; 344 int blkcnt = args->rmtblkcnt;
200 int i; 345 int i;
201 int offset = 0; 346 int offset = 0;
202 347
@@ -207,52 +352,36 @@ xfs_attr_rmtval_get(
207 while (valuelen > 0) { 352 while (valuelen > 0) {
208 nmap = ATTR_RMTVALUE_MAPSIZE; 353 nmap = ATTR_RMTVALUE_MAPSIZE;
209 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, 354 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
210 args->rmtblkcnt, map, &nmap, 355 blkcnt, map, &nmap,
211 XFS_BMAPI_ATTRFORK); 356 XFS_BMAPI_ATTRFORK);
212 if (error) 357 if (error)
213 return error; 358 return error;
214 ASSERT(nmap >= 1); 359 ASSERT(nmap >= 1);
215 360
216 for (i = 0; (i < nmap) && (valuelen > 0); i++) { 361 for (i = 0; (i < nmap) && (valuelen > 0); i++) {
217 int byte_cnt; 362 xfs_daddr_t dblkno;
218 char *src; 363 int dblkcnt;
219 364
220 ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && 365 ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
221 (map[i].br_startblock != HOLESTARTBLOCK)); 366 (map[i].br_startblock != HOLESTARTBLOCK));
222 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); 367 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
223 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 368 dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
224 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 369 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
225 dblkno, blkcnt, 0, &bp, 370 dblkno, dblkcnt, 0, &bp,
226 &xfs_attr3_rmt_buf_ops); 371 &xfs_attr3_rmt_buf_ops);
227 if (error) 372 if (error)
228 return error; 373 return error;
229 374
230 byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length)); 375 error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
231 byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); 376 &offset, &valuelen,
232 377 &dst);
233 src = bp->b_addr;
234 if (xfs_sb_version_hascrc(&mp->m_sb)) {
235 if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino,
236 offset, byte_cnt, bp)) {
237 xfs_alert(mp,
238"remote attribute header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
239 offset, byte_cnt, args->dp->i_ino);
240 xfs_buf_relse(bp);
241 return EFSCORRUPTED;
242
243 }
244
245 src += sizeof(struct xfs_attr3_rmt_hdr);
246 }
247
248 memcpy(dst, src, byte_cnt);
249 xfs_buf_relse(bp); 378 xfs_buf_relse(bp);
379 if (error)
380 return error;
250 381
251 offset += byte_cnt; 382 /* roll attribute extent map forwards */
252 dst += byte_cnt;
253 valuelen -= byte_cnt;
254
255 lblkno += map[i].br_blockcount; 383 lblkno += map[i].br_blockcount;
384 blkcnt -= map[i].br_blockcount;
256 } 385 }
257 } 386 }
258 ASSERT(valuelen == 0); 387 ASSERT(valuelen == 0);
@@ -270,17 +399,13 @@ xfs_attr_rmtval_set(
270 struct xfs_inode *dp = args->dp; 399 struct xfs_inode *dp = args->dp;
271 struct xfs_mount *mp = dp->i_mount; 400 struct xfs_mount *mp = dp->i_mount;
272 struct xfs_bmbt_irec map; 401 struct xfs_bmbt_irec map;
273 struct xfs_buf *bp;
274 xfs_daddr_t dblkno;
275 xfs_dablk_t lblkno; 402 xfs_dablk_t lblkno;
276 xfs_fileoff_t lfileoff = 0; 403 xfs_fileoff_t lfileoff = 0;
277 void *src = args->value; 404 char *src = args->value;
278 int blkcnt; 405 int blkcnt;
279 int valuelen; 406 int valuelen;
280 int nmap; 407 int nmap;
281 int error; 408 int error;
282 int hdrcnt = 0;
283 bool crcs = xfs_sb_version_hascrc(&mp->m_sb);
284 int offset = 0; 409 int offset = 0;
285 410
286 trace_xfs_attr_rmtval_set(args); 411 trace_xfs_attr_rmtval_set(args);
@@ -289,24 +414,14 @@ xfs_attr_rmtval_set(
289 * Find a "hole" in the attribute address space large enough for 414 * Find a "hole" in the attribute address space large enough for
 290 * us to drop the new attribute's value into. Because CRC enabled 415 * us to drop the new attribute's value into. Because CRC enabled
291 * attributes have headers, we can't just do a straight byte to FSB 416 * attributes have headers, we can't just do a straight byte to FSB
292 * conversion. We calculate the worst case block count in this case 417 * conversion and have to take the header space into account.
293 * and we may not need that many, so we have to handle this when
294 * allocating the blocks below.
295 */ 418 */
296 if (!crcs) 419 blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
297 blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
298 else
299 blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
300
301 error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, 420 error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
302 XFS_ATTR_FORK); 421 XFS_ATTR_FORK);
303 if (error) 422 if (error)
304 return error; 423 return error;
305 424
306 /* Start with the attribute data. We'll allocate the rest afterwards. */
307 if (crcs)
308 blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
309
310 args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; 425 args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
311 args->rmtblkcnt = blkcnt; 426 args->rmtblkcnt = blkcnt;
312 427
@@ -349,26 +464,6 @@ xfs_attr_rmtval_set(
349 (map.br_startblock != HOLESTARTBLOCK)); 464 (map.br_startblock != HOLESTARTBLOCK));
350 lblkno += map.br_blockcount; 465 lblkno += map.br_blockcount;
351 blkcnt -= map.br_blockcount; 466 blkcnt -= map.br_blockcount;
352 hdrcnt++;
353
354 /*
355 * If we have enough blocks for the attribute data, calculate
356 * how many extra blocks we need for headers. We might run
357 * through this multiple times in the case that the additional
358 * headers in the blocks needed for the data fragments spills
359 * into requiring more blocks. e.g. for 512 byte blocks, we'll
360 * spill for another block every 9 headers we require in this
361 * loop.
362 */
363 if (crcs && blkcnt == 0) {
364 int total_len;
365
366 total_len = args->valuelen +
367 hdrcnt * sizeof(struct xfs_attr3_rmt_hdr);
368 blkcnt = XFS_B_TO_FSB(mp, total_len);
369 blkcnt -= args->rmtblkcnt;
370 args->rmtblkcnt += blkcnt;
371 }
372 467
373 /* 468 /*
374 * Start the next trans in the chain. 469 * Start the next trans in the chain.
@@ -385,18 +480,19 @@ xfs_attr_rmtval_set(
385 * the INCOMPLETE flag. 480 * the INCOMPLETE flag.
386 */ 481 */
387 lblkno = args->rmtblkno; 482 lblkno = args->rmtblkno;
483 blkcnt = args->rmtblkcnt;
388 valuelen = args->valuelen; 484 valuelen = args->valuelen;
389 while (valuelen > 0) { 485 while (valuelen > 0) {
390 int byte_cnt; 486 struct xfs_buf *bp;
391 char *buf; 487 xfs_daddr_t dblkno;
488 int dblkcnt;
489
490 ASSERT(blkcnt > 0);
392 491
393 /*
394 * Try to remember where we decided to put the value.
395 */
396 xfs_bmap_init(args->flist, args->firstblock); 492 xfs_bmap_init(args->flist, args->firstblock);
397 nmap = 1; 493 nmap = 1;
398 error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, 494 error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
399 args->rmtblkcnt, &map, &nmap, 495 blkcnt, &map, &nmap,
400 XFS_BMAPI_ATTRFORK); 496 XFS_BMAPI_ATTRFORK);
401 if (error) 497 if (error)
402 return(error); 498 return(error);
@@ -405,41 +501,27 @@ xfs_attr_rmtval_set(
405 (map.br_startblock != HOLESTARTBLOCK)); 501 (map.br_startblock != HOLESTARTBLOCK));
406 502
407 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), 503 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
408 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); 504 dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
409 505
410 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); 506 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
411 if (!bp) 507 if (!bp)
412 return ENOMEM; 508 return ENOMEM;
413 bp->b_ops = &xfs_attr3_rmt_buf_ops; 509 bp->b_ops = &xfs_attr3_rmt_buf_ops;
414 510
415 byte_cnt = BBTOB(bp->b_length); 511 xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
416 byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); 512 &valuelen, &src);
417 if (valuelen < byte_cnt)
418 byte_cnt = valuelen;
419
420 buf = bp->b_addr;
421 buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset,
422 byte_cnt, bp);
423 memcpy(buf, src, byte_cnt);
424
425 if (byte_cnt < BBTOB(bp->b_length))
426 xfs_buf_zero(bp, byte_cnt,
427 BBTOB(bp->b_length) - byte_cnt);
428 513
429 error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ 514 error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
430 xfs_buf_relse(bp); 515 xfs_buf_relse(bp);
431 if (error) 516 if (error)
432 return error; 517 return error;
433 518
434 src += byte_cnt;
435 valuelen -= byte_cnt;
436 offset += byte_cnt;
437 hdrcnt--;
438 519
520 /* roll attribute extent map forwards */
439 lblkno += map.br_blockcount; 521 lblkno += map.br_blockcount;
522 blkcnt -= map.br_blockcount;
440 } 523 }
441 ASSERT(valuelen == 0); 524 ASSERT(valuelen == 0);
442 ASSERT(hdrcnt == 0);
443 return 0; 525 return 0;
444} 526}
445 527
@@ -448,33 +530,40 @@ xfs_attr_rmtval_set(
448 * out-of-line buffer that it is stored on. 530 * out-of-line buffer that it is stored on.
449 */ 531 */
450int 532int
451xfs_attr_rmtval_remove(xfs_da_args_t *args) 533xfs_attr_rmtval_remove(
534 struct xfs_da_args *args)
452{ 535{
453 xfs_mount_t *mp; 536 struct xfs_mount *mp = args->dp->i_mount;
454 xfs_bmbt_irec_t map; 537 xfs_dablk_t lblkno;
455 xfs_buf_t *bp; 538 int blkcnt;
456 xfs_daddr_t dblkno; 539 int error;
457 xfs_dablk_t lblkno; 540 int done;
458 int valuelen, blkcnt, nmap, error, done, committed;
459 541
460 trace_xfs_attr_rmtval_remove(args); 542 trace_xfs_attr_rmtval_remove(args);
461 543
462 mp = args->dp->i_mount;
463
464 /* 544 /*
465 * Roll through the "value", invalidating the attribute value's 545 * Roll through the "value", invalidating the attribute value's blocks.
466 * blocks. 546 * Note that args->rmtblkcnt is the minimum number of data blocks we'll
547 * see for a CRC enabled remote attribute. Each extent will have a
548 * header, and so we may have more blocks than we realise here. If we
549 * fail to map the blocks correctly, we'll have problems with the buffer
550 * lookups.
467 */ 551 */
468 lblkno = args->rmtblkno; 552 lblkno = args->rmtblkno;
469 valuelen = args->rmtblkcnt; 553 blkcnt = args->rmtblkcnt;
470 while (valuelen > 0) { 554 while (blkcnt > 0) {
555 struct xfs_bmbt_irec map;
556 struct xfs_buf *bp;
557 xfs_daddr_t dblkno;
558 int dblkcnt;
559 int nmap;
560
471 /* 561 /*
472 * Try to remember where we decided to put the value. 562 * Try to remember where we decided to put the value.
473 */ 563 */
474 nmap = 1; 564 nmap = 1;
475 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, 565 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
476 args->rmtblkcnt, &map, &nmap, 566 blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
477 XFS_BMAPI_ATTRFORK);
478 if (error) 567 if (error)
479 return(error); 568 return(error);
480 ASSERT(nmap == 1); 569 ASSERT(nmap == 1);
@@ -482,21 +571,20 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
482 (map.br_startblock != HOLESTARTBLOCK)); 571 (map.br_startblock != HOLESTARTBLOCK));
483 572
484 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), 573 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
485 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); 574 dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
486 575
487 /* 576 /*
488 * If the "remote" value is in the cache, remove it. 577 * If the "remote" value is in the cache, remove it.
489 */ 578 */
490 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); 579 bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
491 if (bp) { 580 if (bp) {
492 xfs_buf_stale(bp); 581 xfs_buf_stale(bp);
493 xfs_buf_relse(bp); 582 xfs_buf_relse(bp);
494 bp = NULL; 583 bp = NULL;
495 } 584 }
496 585
497 valuelen -= map.br_blockcount;
498
499 lblkno += map.br_blockcount; 586 lblkno += map.br_blockcount;
587 blkcnt -= map.br_blockcount;
500 } 588 }
501 589
502 /* 590 /*
@@ -506,6 +594,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
506 blkcnt = args->rmtblkcnt; 594 blkcnt = args->rmtblkcnt;
507 done = 0; 595 done = 0;
508 while (!done) { 596 while (!done) {
597 int committed;
598
509 xfs_bmap_init(args->flist, args->firstblock); 599 xfs_bmap_init(args->flist, args->firstblock);
510 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, 600 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
511 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 601 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h
index c7cca60a062a..92a8fd7977cc 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/xfs_attr_remote.h
@@ -20,6 +20,14 @@
20 20
21#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ 21#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
22 22
23/*
24 * There is one of these headers per filesystem block in a remote attribute.
25 * This is done to ensure there is a 1:1 mapping between the attribute value
26 * length and the number of blocks needed to store the attribute. This makes the
27 * verification of a buffer a little more complex, but greatly simplifies the
28 * allocation, reading and writing of these attributes as we don't have to guess
29 * the number of blocks needed to store the attribute data.
30 */
23struct xfs_attr3_rmt_hdr { 31struct xfs_attr3_rmt_hdr {
24 __be32 rm_magic; 32 __be32 rm_magic;
25 __be32 rm_offset; 33 __be32 rm_offset;
@@ -39,6 +47,8 @@ struct xfs_attr3_rmt_hdr {
39 47
40extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; 48extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
41 49
50int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
51
42int xfs_attr_rmtval_get(struct xfs_da_args *args); 52int xfs_attr_rmtval_get(struct xfs_da_args *args);
43int xfs_attr_rmtval_set(struct xfs_da_args *args); 53int xfs_attr_rmtval_set(struct xfs_da_args *args);
44int xfs_attr_rmtval_remove(struct xfs_da_args *args); 54int xfs_attr_rmtval_remove(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 8804b8a3c310..0903960410a2 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -2544,7 +2544,17 @@ xfs_btree_new_iroot(
2544 if (error) 2544 if (error)
2545 goto error0; 2545 goto error0;
2546 2546
2547 /*
2548 * we can't just memcpy() the root in for CRC enabled btree blocks.
 2549 * In that case we also have to ensure the blkno remains correct.
2550 */
2547 memcpy(cblock, block, xfs_btree_block_len(cur)); 2551 memcpy(cblock, block, xfs_btree_block_len(cur));
2552 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2553 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2554 cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2555 else
2556 cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2557 }
2548 2558
2549 be16_add_cpu(&block->bb_level, 1); 2559 be16_add_cpu(&block->bb_level, 1);
2550 xfs_btree_set_numrecs(block, 1); 2560 xfs_btree_set_numrecs(block, 1);
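
The fixup above is needed because memcpy() copies the root block wholesale, including the self-describing bb_blkno field that v5 (CRC) btree blocks carry; left unpatched, the copy would claim to live at the old root's disk address and the write verifier would reject it. The pattern, reduced to a toy with invented struct and field names:

    #include <stdint.h>
    #include <string.h>

    struct blk {
        uint32_t magic;
        uint64_t blkno;     /* self-describing location, like bb_blkno */
        char records[64];
    };

    /* clone a block to a new disk address, keeping it self-consistent */
    static void clone_block(struct blk *child, const struct blk *root,
                            uint64_t child_daddr, int has_crc)
    {
        memcpy(child, root, sizeof(*child));
        if (has_crc)
            child->blkno = child_daddr; /* re-stamp the location */
    }

    int main(void)
    {
        struct blk root = { 0xdeadbeefu, 7, { 0 } }, child;

        clone_block(&child, &root, 42, 1);
        return child.blkno == 42 ? 0 : 1;
    }
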
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 82b70bda9f47..1b2472a46e46 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -513,6 +513,7 @@ _xfs_buf_find(
513 xfs_alert(btp->bt_mount, 513 xfs_alert(btp->bt_mount,
514 "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", 514 "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
515 __func__, blkno, eofs); 515 __func__, blkno, eofs);
516 WARN_ON(1);
516 return NULL; 517 return NULL;
517 } 518 }
518 519
@@ -1649,7 +1650,7 @@ xfs_alloc_buftarg(
1649{ 1650{
1650 xfs_buftarg_t *btp; 1651 xfs_buftarg_t *btp;
1651 1652
1652 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1653 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
1653 1654
1654 btp->bt_mount = mp; 1655 btp->bt_mount = mp;
1655 btp->bt_dev = bdev->bd_dev; 1656 btp->bt_dev = bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index cf263476d6b4..4ec431777048 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -262,12 +262,7 @@ xfs_buf_item_format_segment(
262 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 262 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
263 vecp->i_len = nbits * XFS_BLF_CHUNK; 263 vecp->i_len = nbits * XFS_BLF_CHUNK;
264 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 264 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
265/* 265 nvecs++;
266 * You would think we need to bump the nvecs here too, but we do not
267 * this number is used by recovery, and it gets confused by the boundary
268 * split here
269 * nvecs++;
270 */
271 vecp++; 266 vecp++;
272 first_bit = next_bit; 267 first_bit = next_bit;
273 last_bit = next_bit; 268 last_bit = next_bit;
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9b26a99ebfe9..0b8b2a13cd24 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -270,6 +270,7 @@ xfs_da3_node_read_verify(
270 break; 270 break;
271 return; 271 return;
272 case XFS_ATTR_LEAF_MAGIC: 272 case XFS_ATTR_LEAF_MAGIC:
273 case XFS_ATTR3_LEAF_MAGIC:
273 bp->b_ops = &xfs_attr3_leaf_buf_ops; 274 bp->b_ops = &xfs_attr3_leaf_buf_ops;
274 bp->b_ops->verify_read(bp); 275 bp->b_ops->verify_read(bp);
275 return; 276 return;
@@ -2464,7 +2465,8 @@ xfs_buf_map_from_irec(
2464 ASSERT(nirecs >= 1); 2465 ASSERT(nirecs >= 1);
2465 2466
2466 if (nirecs > 1) { 2467 if (nirecs > 1) {
2467 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); 2468 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
2469 KM_SLEEP | KM_NOFS);
2468 if (!map) 2470 if (!map)
2469 return ENOMEM; 2471 return ENOMEM;
2470 *mapp = map; 2472 *mapp = map;
@@ -2520,7 +2522,8 @@ xfs_dabuf_map(
2520 * Optimize the one-block case. 2522 * Optimize the one-block case.
2521 */ 2523 */
2522 if (nfsb != 1) 2524 if (nfsb != 1)
2523 irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); 2525 irecs = kmem_zalloc(sizeof(irec) * nfsb,
2526 KM_SLEEP | KM_NOFS);
2524 2527
2525 nirecs = nfsb; 2528 nirecs = nfsb;
2526 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, 2529 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f852b082a084..c407e1ccff43 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -219,6 +219,14 @@ xfs_swap_extents(
219 int taforkblks = 0; 219 int taforkblks = 0;
220 __uint64_t tmp; 220 __uint64_t tmp;
221 221
222 /*
223 * We have no way of updating owner information in the BMBT blocks for
 224 * each inode on CRC enabled filesystems, so to avoid corrupting
 225 * this metadata we simply don't allow extent swaps to occur.
226 */
227 if (xfs_sb_version_hascrc(&mp->m_sb))
228 return XFS_ERROR(EINVAL);
229
222 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 230 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
223 if (!tempifp) { 231 if (!tempifp) {
224 error = XFS_ERROR(ENOMEM); 232 error = XFS_ERROR(ENOMEM);
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index b26a50f9921d..8f023dee404d 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -368,10 +368,8 @@ xfs_dir_removename(
368int 368int
369xfs_readdir( 369xfs_readdir(
370 xfs_inode_t *dp, 370 xfs_inode_t *dp,
371 void *dirent, 371 struct dir_context *ctx,
372 size_t bufsize, 372 size_t bufsize)
373 xfs_off_t *offset,
374 filldir_t filldir)
375{ 373{
376 int rval; /* return value */ 374 int rval; /* return value */
377 int v; /* type-checking value */ 375 int v; /* type-checking value */
@@ -385,14 +383,13 @@ xfs_readdir(
385 XFS_STATS_INC(xs_dir_getdents); 383 XFS_STATS_INC(xs_dir_getdents);
386 384
387 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 385 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
388 rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); 386 rval = xfs_dir2_sf_getdents(dp, ctx);
389 else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) 387 else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
390 ; 388 ;
391 else if (v) 389 else if (v)
392 rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); 390 rval = xfs_dir2_block_getdents(dp, ctx);
393 else 391 else
394 rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, 392 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
395 filldir);
396 return rval; 393 return rval;
397} 394}
398 395
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e59f5fc816fe..09aea0247d96 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -569,9 +569,7 @@ xfs_dir2_block_addname(
569int /* error */ 569int /* error */
570xfs_dir2_block_getdents( 570xfs_dir2_block_getdents(
571 xfs_inode_t *dp, /* incore inode */ 571 xfs_inode_t *dp, /* incore inode */
572 void *dirent, 572 struct dir_context *ctx)
573 xfs_off_t *offset,
574 filldir_t filldir)
575{ 573{
576 xfs_dir2_data_hdr_t *hdr; /* block header */ 574 xfs_dir2_data_hdr_t *hdr; /* block header */
577 struct xfs_buf *bp; /* buffer for block */ 575 struct xfs_buf *bp; /* buffer for block */
@@ -589,7 +587,7 @@ xfs_dir2_block_getdents(
589 /* 587 /*
590 * If the block number in the offset is out of range, we're done. 588 * If the block number in the offset is out of range, we're done.
591 */ 589 */
592 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) 590 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
593 return 0; 591 return 0;
594 592
595 error = xfs_dir3_block_read(NULL, dp, &bp); 593 error = xfs_dir3_block_read(NULL, dp, &bp);
@@ -600,7 +598,7 @@ xfs_dir2_block_getdents(
600 * Extract the byte offset we start at from the seek pointer. 598 * Extract the byte offset we start at from the seek pointer.
601 * We'll skip entries before this. 599 * We'll skip entries before this.
602 */ 600 */
603 wantoff = xfs_dir2_dataptr_to_off(mp, *offset); 601 wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
604 hdr = bp->b_addr; 602 hdr = bp->b_addr;
605 xfs_dir3_data_check(dp, bp); 603 xfs_dir3_data_check(dp, bp);
606 /* 604 /*
@@ -639,13 +637,12 @@ xfs_dir2_block_getdents(
639 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 637 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
640 (char *)dep - (char *)hdr); 638 (char *)dep - (char *)hdr);
641 639
640 ctx->pos = cook & 0x7fffffff;
642 /* 641 /*
643 * If it didn't fit, set the final offset to here & return. 642 * If it didn't fit, set the final offset to here & return.
644 */ 643 */
645 if (filldir(dirent, (char *)dep->name, dep->namelen, 644 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
646 cook & 0x7fffffff, be64_to_cpu(dep->inumber), 645 be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
647 DT_UNKNOWN)) {
648 *offset = cook & 0x7fffffff;
649 xfs_trans_brelse(NULL, bp); 646 xfs_trans_brelse(NULL, bp);
650 return 0; 647 return 0;
651 } 648 }
@@ -655,7 +652,7 @@ xfs_dir2_block_getdents(
655 * Reached the end of the block. 652 * Reached the end of the block.
656 * Set the offset to a non-existent block 1 and return. 653 * Set the offset to a non-existent block 1 and return.
657 */ 654 */
658 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 655 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
659 0x7fffffff; 656 0x7fffffff;
660 xfs_trans_brelse(NULL, bp); 657 xfs_trans_brelse(NULL, bp);
661 return 0; 658 return 0;
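
This is the first of several getdents conversions in the series, and they all follow the same shape: the old filldir callback took the cookie as an argument and the caller recorded *offset only when the buffer filled, whereas the new dir_context model sets ctx->pos before calling dir_emit(), so a full buffer can simply return with the position already correct. dir_emit() and struct dir_context are the real VFS interfaces from linux/fs.h; the toy model below only sketches the contract:

    #include <stdbool.h>
    #include <stdio.h>

    /* simplified stand-ins for the linux/fs.h definitions */
    struct dir_context { long long pos; };

    static bool dir_emit(struct dir_context *ctx, const char *name,
                         long long ino)
    {
        /* the real helper copies into the user buffer and returns
         * false once that buffer is full */
        return printf("pos=%lld %s (ino %lld)\n", ctx->pos, name, ino) > 0;
    }

    static int toy_getdents(struct dir_context *ctx)
    {
        static const char *names[] = { ".", "..", "a", "b" };
        long long i;

        for (i = 0; i < 4; i++) {
            if (ctx->pos > i)
                continue;      /* skip entries already returned */
            ctx->pos = i;      /* set the cookie BEFORE emitting */
            if (!dir_emit(ctx, names[i], 100 + i))
                return 0;      /* buffer full; pos is already right */
        }
        ctx->pos = 4; /* past-the-end cookie, like the block + 1 trick above */
        return 0;
    }

    int main(void)
    {
        struct dir_context ctx = { 0 };
        return toy_getdents(&ctx);
    }
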
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index a3b1bd841a80..7826782b8d78 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -266,6 +266,7 @@ struct xfs_dir3_blk_hdr {
266struct xfs_dir3_data_hdr { 266struct xfs_dir3_data_hdr {
267 struct xfs_dir3_blk_hdr hdr; 267 struct xfs_dir3_blk_hdr hdr;
268 xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; 268 xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT];
269 __be32 pad; /* 64 bit alignment */
269}; 270};
270 271
271#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) 272#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc)
@@ -477,7 +478,7 @@ struct xfs_dir3_leaf_hdr {
477 struct xfs_da3_blkinfo info; /* header for da routines */ 478 struct xfs_da3_blkinfo info; /* header for da routines */
478 __be16 count; /* count of entries */ 479 __be16 count; /* count of entries */
479 __be16 stale; /* count of stale entries */ 480 __be16 stale; /* count of stale entries */
480 __be32 pad; 481 __be32 pad; /* 64 bit alignment */
481}; 482};
482 483
483struct xfs_dir3_icleaf_hdr { 484struct xfs_dir3_icleaf_hdr {
@@ -715,6 +716,7 @@ struct xfs_dir3_free_hdr {
715 __be32 firstdb; /* db of first entry */ 716 __be32 firstdb; /* db of first entry */
716 __be32 nvalid; /* count of valid entries */ 717 __be32 nvalid; /* count of valid entries */
717 __be32 nused; /* count of used entries */ 718 __be32 nused; /* count of used entries */
719 __be32 pad; /* 64 bit alignment */
718}; 720};
719 721
720struct xfs_dir3_free { 722struct xfs_dir3_free {
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 721ba2fe8e54..e0cc1243a8aa 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1300,10 +1300,8 @@ out:
1300int /* error */ 1300int /* error */
1301xfs_dir2_leaf_getdents( 1301xfs_dir2_leaf_getdents(
1302 xfs_inode_t *dp, /* incore directory inode */ 1302 xfs_inode_t *dp, /* incore directory inode */
1303 void *dirent, 1303 struct dir_context *ctx,
1304 size_t bufsize, 1304 size_t bufsize)
1305 xfs_off_t *offset,
1306 filldir_t filldir)
1307{ 1305{
1308 struct xfs_buf *bp = NULL; /* data block buffer */ 1306 struct xfs_buf *bp = NULL; /* data block buffer */
1309 xfs_dir2_data_hdr_t *hdr; /* data block header */ 1307 xfs_dir2_data_hdr_t *hdr; /* data block header */
@@ -1322,7 +1320,7 @@ xfs_dir2_leaf_getdents(
1322 * If the offset is at or past the largest allowed value, 1320 * If the offset is at or past the largest allowed value,
1323 * give up right away. 1321 * give up right away.
1324 */ 1322 */
1325 if (*offset >= XFS_DIR2_MAX_DATAPTR) 1323 if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
1326 return 0; 1324 return 0;
1327 1325
1328 mp = dp->i_mount; 1326 mp = dp->i_mount;
@@ -1336,14 +1334,14 @@ xfs_dir2_leaf_getdents(
1336 mp->m_sb.sb_blocksize); 1334 mp->m_sb.sb_blocksize);
1337 map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + 1335 map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
1338 (length * sizeof(struct xfs_bmbt_irec)), 1336 (length * sizeof(struct xfs_bmbt_irec)),
1339 KM_SLEEP); 1337 KM_SLEEP | KM_NOFS);
1340 map_info->map_size = length; 1338 map_info->map_size = length;
1341 1339
1342 /* 1340 /*
1343 * Inside the loop we keep the main offset value as a byte offset 1341 * Inside the loop we keep the main offset value as a byte offset
1344 * in the directory file. 1342 * in the directory file.
1345 */ 1343 */
1346 curoff = xfs_dir2_dataptr_to_byte(mp, *offset); 1344 curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
1347 1345
1348 /* 1346 /*
1349 * Force this conversion through db so we truncate the offset 1347 * Force this conversion through db so we truncate the offset
@@ -1444,8 +1442,8 @@ xfs_dir2_leaf_getdents(
1444 dep = (xfs_dir2_data_entry_t *)ptr; 1442 dep = (xfs_dir2_data_entry_t *)ptr;
1445 length = xfs_dir2_data_entsize(dep->namelen); 1443 length = xfs_dir2_data_entsize(dep->namelen);
1446 1444
1447 if (filldir(dirent, (char *)dep->name, dep->namelen, 1445 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
1448 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1446 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
1449 be64_to_cpu(dep->inumber), DT_UNKNOWN)) 1447 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1450 break; 1448 break;
1451 1449
@@ -1462,9 +1460,9 @@ xfs_dir2_leaf_getdents(
1462 * All done. Set output offset value to current offset. 1460 * All done. Set output offset value to current offset.
1463 */ 1461 */
1464 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) 1462 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
1465 *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; 1463 ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
1466 else 1464 else
1467 *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; 1465 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
1468 kmem_free(map_info); 1466 kmem_free(map_info);
1469 if (bp) 1467 if (bp)
1470 xfs_trans_brelse(NULL, bp); 1468 xfs_trans_brelse(NULL, bp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 5246de4912d4..2226a00acd15 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -263,18 +263,19 @@ xfs_dir3_free_get_buf(
263 * Initialize the new block to be empty, and remember 263 * Initialize the new block to be empty, and remember
264 * its first slot as our empty slot. 264 * its first slot as our empty slot.
265 */ 265 */
266 hdr.magic = XFS_DIR2_FREE_MAGIC; 266 memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
267 hdr.firstdb = 0; 267 memset(&hdr, 0, sizeof(hdr));
268 hdr.nused = 0; 268
269 hdr.nvalid = 0;
270 if (xfs_sb_version_hascrc(&mp->m_sb)) { 269 if (xfs_sb_version_hascrc(&mp->m_sb)) {
271 struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; 270 struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
272 271
273 hdr.magic = XFS_DIR3_FREE_MAGIC; 272 hdr.magic = XFS_DIR3_FREE_MAGIC;
273
274 hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); 274 hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
275 hdr3->hdr.owner = cpu_to_be64(dp->i_ino); 275 hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
276 uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); 276 uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
277 } 277 } else
278 hdr.magic = XFS_DIR2_FREE_MAGIC;
278 xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); 279 xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr);
279 *bpp = bp; 280 *bpp = bp;
280 return 0; 281 return 0;
@@ -1921,8 +1922,6 @@ xfs_dir2_node_addname_int(
1921 */ 1922 */
1922 freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * 1923 freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
1923 xfs_dir3_free_max_bests(mp); 1924 xfs_dir3_free_max_bests(mp);
1924 free->hdr.nvalid = 0;
1925 free->hdr.nused = 0;
1926 } else { 1925 } else {
1927 free = fbp->b_addr; 1926 free = fbp->b_addr;
1928 bests = xfs_dir3_free_bests_p(mp, free); 1927 bests = xfs_dir3_free_bests_p(mp, free);
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 7cf573c88aad..0511cda4a712 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -33,8 +33,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
33extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; 33extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
34 34
35extern int xfs_dir2_block_addname(struct xfs_da_args *args); 35extern int xfs_dir2_block_addname(struct xfs_da_args *args);
36extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, 36extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
37 xfs_off_t *offset, filldir_t filldir); 37 struct dir_context *ctx);
38extern int xfs_dir2_block_lookup(struct xfs_da_args *args); 38extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
39extern int xfs_dir2_block_removename(struct xfs_da_args *args); 39extern int xfs_dir2_block_removename(struct xfs_da_args *args);
40extern int xfs_dir2_block_replace(struct xfs_da_args *args); 40extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -91,8 +91,8 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
91extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, 91extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
92 struct xfs_dir2_leaf_entry *ents, int *indexp, 92 struct xfs_dir2_leaf_entry *ents, int *indexp,
93 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); 93 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
94extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, 94extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
95 size_t bufsize, xfs_off_t *offset, filldir_t filldir); 95 size_t bufsize);
96extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, 96extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
97 struct xfs_buf **bpp, __uint16_t magic); 97 struct xfs_buf **bpp, __uint16_t magic);
98extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, 98extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
@@ -153,8 +153,7 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
153 int size, xfs_dir2_sf_hdr_t *sfhp); 153 int size, xfs_dir2_sf_hdr_t *sfhp);
154extern int xfs_dir2_sf_addname(struct xfs_da_args *args); 154extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
155extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); 155extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
156extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, 156extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
157 xfs_off_t *offset, filldir_t filldir);
158extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); 157extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
159extern int xfs_dir2_sf_removename(struct xfs_da_args *args); 158extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
160extern int xfs_dir2_sf_replace(struct xfs_da_args *args); 159extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 6157424dbf8f..97676a347da1 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -768,9 +768,7 @@ xfs_dir2_sf_create(
768int /* error */ 768int /* error */
769xfs_dir2_sf_getdents( 769xfs_dir2_sf_getdents(
770 xfs_inode_t *dp, /* incore directory inode */ 770 xfs_inode_t *dp, /* incore directory inode */
771 void *dirent, 771 struct dir_context *ctx)
772 xfs_off_t *offset,
773 filldir_t filldir)
774{ 772{
775 int i; /* shortform entry number */ 773 int i; /* shortform entry number */
776 xfs_mount_t *mp; /* filesystem mount point */ 774 xfs_mount_t *mp; /* filesystem mount point */
@@ -802,7 +800,7 @@ xfs_dir2_sf_getdents(
802 /* 800 /*
803 * If the block number in the offset is out of range, we're done. 801 * If the block number in the offset is out of range, we're done.
804 */ 802 */
805 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) 803 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
806 return 0; 804 return 0;
807 805
808 /* 806 /*
@@ -819,22 +817,20 @@ xfs_dir2_sf_getdents(
819 /* 817 /*
820 * Put . entry unless we're starting past it. 818 * Put . entry unless we're starting past it.
821 */ 819 */
822 if (*offset <= dot_offset) { 820 if (ctx->pos <= dot_offset) {
823 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { 821 ctx->pos = dot_offset & 0x7fffffff;
824 *offset = dot_offset & 0x7fffffff; 822 if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
825 return 0; 823 return 0;
826 }
827 } 824 }
828 825
829 /* 826 /*
830 * Put .. entry unless we're starting past it. 827 * Put .. entry unless we're starting past it.
831 */ 828 */
832 if (*offset <= dotdot_offset) { 829 if (ctx->pos <= dotdot_offset) {
833 ino = xfs_dir2_sf_get_parent_ino(sfp); 830 ino = xfs_dir2_sf_get_parent_ino(sfp);
834 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { 831 ctx->pos = dotdot_offset & 0x7fffffff;
835 *offset = dotdot_offset & 0x7fffffff; 832 if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
836 return 0; 833 return 0;
837 }
838 } 834 }
839 835
840 /* 836 /*
@@ -845,21 +841,20 @@ xfs_dir2_sf_getdents(
845 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 841 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
846 xfs_dir2_sf_get_offset(sfep)); 842 xfs_dir2_sf_get_offset(sfep));
847 843
848 if (*offset > off) { 844 if (ctx->pos > off) {
849 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 845 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
850 continue; 846 continue;
851 } 847 }
852 848
853 ino = xfs_dir2_sfe_get_ino(sfp, sfep); 849 ino = xfs_dir2_sfe_get_ino(sfp, sfep);
854 if (filldir(dirent, (char *)sfep->name, sfep->namelen, 850 ctx->pos = off & 0x7fffffff;
855 off & 0x7fffffff, ino, DT_UNKNOWN)) { 851 if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
856 *offset = off & 0x7fffffff; 852 ino, DT_UNKNOWN))
857 return 0; 853 return 0;
858 }
859 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 854 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
860 } 855 }
861 856
862 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 857 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
863 0x7fffffff; 858 0x7fffffff;
864 return 0; 859 return 0;
865} 860}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a41f8bf1da37..044e97a33c8d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -249,8 +249,11 @@ xfs_qm_init_dquot_blk(
249 d->dd_diskdq.d_version = XFS_DQUOT_VERSION; 249 d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
250 d->dd_diskdq.d_id = cpu_to_be32(curid); 250 d->dd_diskdq.d_id = cpu_to_be32(curid);
251 d->dd_diskdq.d_flags = type; 251 d->dd_diskdq.d_flags = type;
252 if (xfs_sb_version_hascrc(&mp->m_sb)) 252 if (xfs_sb_version_hascrc(&mp->m_sb)) {
253 uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); 253 uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
254 xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
255 XFS_DQUOT_CRC_OFF);
256 }
254 } 257 }
255 258
256 xfs_trans_dquot_buf(tp, bp, 259 xfs_trans_dquot_buf(tp, bp,
@@ -286,23 +289,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
286 dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; 289 dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
287} 290}
288 291
289STATIC void
290xfs_dquot_buf_calc_crc(
291 struct xfs_mount *mp,
292 struct xfs_buf *bp)
293{
294 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
295 int i;
296
297 if (!xfs_sb_version_hascrc(&mp->m_sb))
298 return;
299
300 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) {
301 xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
302 offsetof(struct xfs_dqblk, dd_crc));
303 }
304}
305
306STATIC bool 292STATIC bool
307xfs_dquot_buf_verify_crc( 293xfs_dquot_buf_verify_crc(
308 struct xfs_mount *mp, 294 struct xfs_mount *mp,
@@ -328,12 +314,11 @@ xfs_dquot_buf_verify_crc(
328 314
329 for (i = 0; i < ndquots; i++, d++) { 315 for (i = 0; i < ndquots; i++, d++) {
330 if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), 316 if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
331 offsetof(struct xfs_dqblk, dd_crc))) 317 XFS_DQUOT_CRC_OFF))
332 return false; 318 return false;
333 if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) 319 if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
334 return false; 320 return false;
335 } 321 }
336
337 return true; 322 return true;
338} 323}
339 324
@@ -393,6 +378,11 @@ xfs_dquot_buf_read_verify(
393 } 378 }
394} 379}
395 380
381/*
382 * we don't calculate the CRC here as that is done when the dquot is flushed to
383 * the buffer after the update is done. This ensures that the dquot in the
384 * buffer always has an up-to-date CRC value.
385 */
396void 386void
397xfs_dquot_buf_write_verify( 387xfs_dquot_buf_write_verify(
398 struct xfs_buf *bp) 388 struct xfs_buf *bp)
@@ -404,7 +394,6 @@ xfs_dquot_buf_write_verify(
404 xfs_buf_ioerror(bp, EFSCORRUPTED); 394 xfs_buf_ioerror(bp, EFSCORRUPTED);
405 return; 395 return;
406 } 396 }
407 xfs_dquot_buf_calc_crc(mp, bp);
408} 397}
409 398
410const struct xfs_buf_ops xfs_dquot_buf_ops = { 399const struct xfs_buf_ops xfs_dquot_buf_ops = {
@@ -1151,11 +1140,17 @@ xfs_qm_dqflush(
1151 * copy the lsn into the on-disk dquot now while we have the in memory 1140 * copy the lsn into the on-disk dquot now while we have the in memory
1152 * dquot here. This can't be done later in the write verifier as we 1141 * dquot here. This can't be done later in the write verifier as we
1153 * can't get access to the log item at that point in time. 1142 * can't get access to the log item at that point in time.
1143 *
1144 * We also calculate the CRC here so that the on-disk dquot in the
1145 * buffer always has a valid CRC. This ensures there is no possibility
1146 * of a dquot without an up-to-date CRC getting to disk.
1154 */ 1147 */
1155 if (xfs_sb_version_hascrc(&mp->m_sb)) { 1148 if (xfs_sb_version_hascrc(&mp->m_sb)) {
1156 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; 1149 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
1157 1150
1158 dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); 1151 dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
1152 xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
1153 XFS_DQUOT_CRC_OFF);
1159 } 1154 }
1160 1155
1161 /* 1156 /*
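
The dquot change inverts the usual verifier pattern: xfs_qm_dqflush() stamps the LSN into the on-disk dquot at flush time, so a CRC computed any earlier would go stale, and the checksum is therefore taken immediately after the LSN update instead of in the write verifier. The ordering, in a self-contained toy where the checksum is only a placeholder for the kernel's crc32c-based xfs_update_cksum():

    #include <stddef.h>
    #include <stdint.h>

    struct rec {
        uint64_t lsn;  /* stamped at flush time, like dd_lsn */
        uint32_t crc;
        char body[52];
    };

    /* placeholder checksum over the record with the CRC field zeroed */
    static uint32_t checksum(const struct rec *r)
    {
        struct rec tmp = *r;
        const unsigned char *p = (const unsigned char *)&tmp;
        uint32_t sum = 0;
        size_t i;

        tmp.crc = 0;
        for (i = 0; i < sizeof(tmp); i++)
            sum = sum * 31 + p[i];
        return sum;
    }

    static void flush_rec(struct rec *r, uint64_t lsn)
    {
        r->lsn = lsn;          /* 1: the last metadata update */
        r->crc = checksum(r);  /* 2: CRC over the final bytes */
        /* 3: only now may the buffer go to disk */
    }

    int main(void)
    {
        struct rec r = { 0, 0, { 0 } };

        flush_rec(&r, 1234);
        return r.crc ? 0 : 1;
    }
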
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index c0f375087efc..452920a3f03f 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -305,11 +305,12 @@ xfs_efi_release(xfs_efi_log_item_t *efip,
305{ 305{
306 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); 306 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { 307 if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
308 __xfs_efi_release(efip);
309
310 /* recovery needs us to drop the EFI reference, too */ 308 /* recovery needs us to drop the EFI reference, too */
311 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) 309 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
312 __xfs_efi_release(efip); 310 __xfs_efi_release(efip);
311
312 __xfs_efi_release(efip);
313 /* efip may now have been freed, do not reference it again. */
313 } 314 }
314} 315}
315 316
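
The reordering matters because the final __xfs_efi_release() can free the item, and the old code then went on to test efi_flags against freed memory whenever recovery held an extra reference. The same discipline in a generic refcount sketch, with all names invented:

    #include <stdbool.h>
    #include <stdlib.h>

    struct item {
        int refcount;
        bool recovered; /* extra reference held by recovery */
    };

    static void item_put(struct item *it)
    {
        if (--it->refcount == 0)
            free(it); /* 'it' must not be touched after this */
    }

    static void item_release(struct item *it)
    {
        /* drop the conditional reference while 'it' is still valid ... */
        if (it->recovered)
            item_put(it);
        /* ... and make the possibly-final drop the very last access */
        item_put(it);
    }

    int main(void)
    {
        struct item *it = malloc(sizeof(*it));

        if (!it)
            return 1;
        it->refcount = 2;
        it->recovered = true;
        item_release(it);
        return 0;
    }
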
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a5f2042aec8b..de3dc98f4e8f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -906,11 +906,10 @@ xfs_file_release(
906 906
907STATIC int 907STATIC int
908xfs_file_readdir( 908xfs_file_readdir(
909 struct file *filp, 909 struct file *file,
910 void *dirent, 910 struct dir_context *ctx)
911 filldir_t filldir)
912{ 911{
913 struct inode *inode = file_inode(filp); 912 struct inode *inode = file_inode(file);
914 xfs_inode_t *ip = XFS_I(inode); 913 xfs_inode_t *ip = XFS_I(inode);
915 int error; 914 int error;
916 size_t bufsize; 915 size_t bufsize;
@@ -929,8 +928,7 @@ xfs_file_readdir(
929 */ 928 */
930 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); 929 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
931 930
932 error = xfs_readdir(ip, dirent, bufsize, 931 error = xfs_readdir(ip, ctx, bufsize);
933 (xfs_off_t *)&filp->f_pos, filldir);
934 if (error) 932 if (error)
935 return -error; 933 return -error;
936 return 0; 934 return 0;
@@ -1270,8 +1268,7 @@ xfs_seek_data(
1270 } 1268 }
1271 1269
1272out: 1270out:
1273 if (offset != file->f_pos) 1271 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1274 file->f_pos = offset;
1275 1272
1276out_unlock: 1273out_unlock:
1277 xfs_iunlock_map_shared(ip, lock); 1274 xfs_iunlock_map_shared(ip, lock);
@@ -1379,8 +1376,7 @@ out:
1379 * situation in particular. 1376 * situation in particular.
1380 */ 1377 */
1381 offset = min_t(loff_t, offset, isize); 1378 offset = min_t(loff_t, offset, isize);
1382 if (offset != file->f_pos) 1379 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1383 file->f_pos = offset;
1384 1380
1385out_unlock: 1381out_unlock:
1386 xfs_iunlock_map_shared(ip, lock); 1382 xfs_iunlock_map_shared(ip, lock);
@@ -1432,7 +1428,7 @@ const struct file_operations xfs_file_operations = {
1432const struct file_operations xfs_dir_file_operations = { 1428const struct file_operations xfs_dir_file_operations = {
1433 .open = xfs_dir_open, 1429 .open = xfs_dir_open,
1434 .read = generic_read_dir, 1430 .read = generic_read_dir,
1435 .readdir = xfs_file_readdir, 1431 .iterate = xfs_file_readdir,
1436 .llseek = generic_file_llseek, 1432 .llseek = generic_file_llseek,
1437 .unlocked_ioctl = xfs_file_ioctl, 1433 .unlocked_ioctl = xfs_file_ioctl,
1438#ifdef CONFIG_COMPAT 1434#ifdef CONFIG_COMPAT
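
Both seek paths now funnel through vfs_setpos(), the VFS helper that validates an offset against the filesystem's maximum before caching it in f_pos, replacing the open-coded assignment. A simplified userspace model of its contract, assuming the usual negative-errno convention (the real helper also handles the unmaximized-seek case):

    #include <errno.h>

    typedef long long loff_t;

    /* simplified model of vfs_setpos(file, offset, maxsize) */
    static loff_t setpos(loff_t *f_pos, loff_t offset, loff_t maxsize)
    {
        if (offset < 0 || offset > maxsize)
            return (loff_t)-EINVAL;  /* reject out-of-range seeks */
        if (offset != *f_pos)
            *f_pos = offset;         /* cache the new position */
        return offset;
    }

    int main(void)
    {
        loff_t pos = 0;
        return setpos(&pos, 4096, 1 << 20) == 4096 ? 0 : 1;
    }
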
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 6dda3f949b04..d04695545397 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
236#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ 236#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
237#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ 237#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
238#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ 238#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
239#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
239 240
240 241
241/* 242/*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 87595b211da1..3c3644ea825b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -99,7 +99,9 @@ xfs_fs_geometry(
99 (xfs_sb_version_hasattr2(&mp->m_sb) ? 99 (xfs_sb_version_hasattr2(&mp->m_sb) ?
100 XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | 100 XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
101 (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? 101 (xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
102 XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); 102 XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
103 (xfs_sb_version_hascrc(&mp->m_sb) ?
104 XFS_FSOP_GEOM_FLAGS_V5SB : 0);
103 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? 105 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
104 mp->m_sb.sb_logsectsize : BBSIZE; 106 mp->m_sb.sb_logsectsize : BBSIZE;
105 geo->rtsectsize = mp->m_sb.sb_blocksize; 107 geo->rtsectsize = mp->m_sb.sb_blocksize;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index efbe1accb6ca..7f7be5f98f52 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1638,6 +1638,10 @@ xfs_iunlink(
1638 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1638 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1639 offset = ip->i_imap.im_boffset + 1639 offset = ip->i_imap.im_boffset +
1640 offsetof(xfs_dinode_t, di_next_unlinked); 1640 offsetof(xfs_dinode_t, di_next_unlinked);
1641
1642 /* need to recalc the inode CRC if appropriate */
1643 xfs_dinode_calc_crc(mp, dip);
1644
1641 xfs_trans_inode_buf(tp, ibp); 1645 xfs_trans_inode_buf(tp, ibp);
1642 xfs_trans_log_buf(tp, ibp, offset, 1646 xfs_trans_log_buf(tp, ibp, offset,
1643 (offset + sizeof(xfs_agino_t) - 1)); 1647 (offset + sizeof(xfs_agino_t) - 1));
@@ -1723,6 +1727,10 @@ xfs_iunlink_remove(
1723 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1727 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1724 offset = ip->i_imap.im_boffset + 1728 offset = ip->i_imap.im_boffset +
1725 offsetof(xfs_dinode_t, di_next_unlinked); 1729 offsetof(xfs_dinode_t, di_next_unlinked);
1730
1731 /* need to recalc the inode CRC if appropriate */
1732 xfs_dinode_calc_crc(mp, dip);
1733
1726 xfs_trans_inode_buf(tp, ibp); 1734 xfs_trans_inode_buf(tp, ibp);
1727 xfs_trans_log_buf(tp, ibp, offset, 1735 xfs_trans_log_buf(tp, ibp, offset,
1728 (offset + sizeof(xfs_agino_t) - 1)); 1736 (offset + sizeof(xfs_agino_t) - 1));
@@ -1796,6 +1804,10 @@ xfs_iunlink_remove(
1796 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1804 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1797 offset = ip->i_imap.im_boffset + 1805 offset = ip->i_imap.im_boffset +
1798 offsetof(xfs_dinode_t, di_next_unlinked); 1806 offsetof(xfs_dinode_t, di_next_unlinked);
1807
1808 /* need to recalc the inode CRC if appropriate */
1809 xfs_dinode_calc_crc(mp, dip);
1810
1799 xfs_trans_inode_buf(tp, ibp); 1811 xfs_trans_inode_buf(tp, ibp);
1800 xfs_trans_log_buf(tp, ibp, offset, 1812 xfs_trans_log_buf(tp, ibp, offset,
1801 (offset + sizeof(xfs_agino_t) - 1)); 1813 (offset + sizeof(xfs_agino_t) - 1));
@@ -1809,6 +1821,10 @@ xfs_iunlink_remove(
1809 last_dip->di_next_unlinked = cpu_to_be32(next_agino); 1821 last_dip->di_next_unlinked = cpu_to_be32(next_agino);
1810 ASSERT(next_agino != 0); 1822 ASSERT(next_agino != 0);
1811 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); 1823 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
1824
1825 /* need to recalc the inode CRC if appropriate */
1826 xfs_dinode_calc_crc(mp, last_dip);
1827
1812 xfs_trans_inode_buf(tp, last_ibp); 1828 xfs_trans_inode_buf(tp, last_ibp);
1813 xfs_trans_log_buf(tp, last_ibp, offset, 1829 xfs_trans_log_buf(tp, last_ibp, offset,
1814 (offset + sizeof(xfs_agino_t) - 1)); 1830 (offset + sizeof(xfs_agino_t) - 1));
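All four hunks above repeat the same three-step sequence: update di_next_unlinked, recalculate the inode CRC (xfs_dinode_calc_crc() is a no-op on non-CRC filesystems), then log just the modified range. A hypothetical helper capturing that sequence is sketched below; it is not part of the patch, only a restatement of the pattern.

/* Hypothetical consolidation of the repeated sequence above. */
static void
xfs_iunlink_log_dinode(
        struct xfs_trans        *tp,
        struct xfs_mount        *mp,
        struct xfs_buf          *ibp,
        struct xfs_dinode       *dip,
        int                     offset)
{
        /* need to recalc the inode CRC if appropriate */
        xfs_dinode_calc_crc(mp, dip);

        xfs_trans_inode_buf(tp, ibp);
        xfs_trans_log_buf(tp, ibp, offset,
                          offset + sizeof(xfs_agino_t) - 1);
}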
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d82efaa2ac73..ca9ecaa81112 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -455,6 +455,28 @@ xfs_vn_getattr(
455 return 0; 455 return 0;
456} 456}
457 457
458static void
459xfs_setattr_mode(
460 struct xfs_trans *tp,
461 struct xfs_inode *ip,
462 struct iattr *iattr)
463{
464 struct inode *inode = VFS_I(ip);
465 umode_t mode = iattr->ia_mode;
466
467 ASSERT(tp);
468 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
469
470 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
471 mode &= ~S_ISGID;
472
473 ip->i_d.di_mode &= S_IFMT;
474 ip->i_d.di_mode |= mode & ~S_IFMT;
475
476 inode->i_mode &= S_IFMT;
477 inode->i_mode |= mode & ~S_IFMT;
478}
479
458int 480int
459xfs_setattr_nonsize( 481xfs_setattr_nonsize(
460 struct xfs_inode *ip, 482 struct xfs_inode *ip,
@@ -606,18 +628,8 @@ xfs_setattr_nonsize(
606 /* 628 /*
607 * Change file access modes. 629 * Change file access modes.
608 */ 630 */
609 if (mask & ATTR_MODE) { 631 if (mask & ATTR_MODE)
610 umode_t mode = iattr->ia_mode; 632 xfs_setattr_mode(tp, ip, iattr);
611
612 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
613 mode &= ~S_ISGID;
614
615 ip->i_d.di_mode &= S_IFMT;
616 ip->i_d.di_mode |= mode & ~S_IFMT;
617
618 inode->i_mode &= S_IFMT;
619 inode->i_mode |= mode & ~S_IFMT;
620 }
621 633
622 /* 634 /*
623 * Change file access or modified times. 635 * Change file access or modified times.
@@ -714,9 +726,8 @@ xfs_setattr_size(
714 return XFS_ERROR(error); 726 return XFS_ERROR(error);
715 727
716 ASSERT(S_ISREG(ip->i_d.di_mode)); 728 ASSERT(S_ISREG(ip->i_d.di_mode));
717 ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| 729 ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
718 ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| 730 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
719 ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
720 731
721 if (!(flags & XFS_ATTR_NOLOCK)) { 732 if (!(flags & XFS_ATTR_NOLOCK)) {
722 lock_flags |= XFS_IOLOCK_EXCL; 733 lock_flags |= XFS_IOLOCK_EXCL;
@@ -860,6 +871,12 @@ xfs_setattr_size(
860 xfs_inode_clear_eofblocks_tag(ip); 871 xfs_inode_clear_eofblocks_tag(ip);
861 } 872 }
862 873
874 /*
875 * Change file access modes.
876 */
877 if (mask & ATTR_MODE)
878 xfs_setattr_mode(tp, ip, iattr);
879
863 if (mask & ATTR_CTIME) { 880 if (mask & ATTR_CTIME) {
864 inode->i_ctime = iattr->ia_ctime; 881 inode->i_ctime = iattr->ia_ctime;
865 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; 882 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
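xfs_setattr_mode() updates both the on-disk and VFS copies of the mode with the same mask arithmetic: keep the S_IFMT file-type bits, replace everything else, and strip S_ISGID when the caller is neither in the owning group nor CAP_FSETID-capable. A small userspace sketch of the merge arithmetic, for illustration only:

#include <stdio.h>
#include <sys/stat.h>

/* keep the file-type bits, take permissions/setuid/sticky from new_mode */
static unsigned int merge_mode(unsigned int disk_mode, unsigned int new_mode)
{
        disk_mode &= S_IFMT;
        disk_mode |= new_mode & ~S_IFMT;
        return disk_mode;
}

int main(void)
{
        unsigned int m = S_IFREG | 0644;

        m = merge_mode(m, 02755);       /* chmod 2755 */
        printf("%o\n", m);              /* 102755: still a regular file */
        return 0;
}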
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index e3d0b85d852b..d0833b54e55d 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -139,7 +139,7 @@ xlog_cil_prepare_log_vecs(
139 139
140 new_lv = kmem_zalloc(sizeof(*new_lv) + 140 new_lv = kmem_zalloc(sizeof(*new_lv) +
141 niovecs * sizeof(struct xfs_log_iovec), 141 niovecs * sizeof(struct xfs_log_iovec),
142 KM_SLEEP); 142 KM_SLEEP|KM_NOFS);
143 143
144 /* The allocated iovec region lies beyond the log vector. */ 144 /* The allocated iovec region lies beyond the log vector. */
145 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; 145 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 93f03ec17eec..7cf5e4eafe28 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1599,10 +1599,43 @@ xlog_recover_add_to_trans(
1599} 1599}
1600 1600
1601/* 1601/*
1602 * Sort the log items in the transaction. Cancelled buffers need 1602 * Sort the log items in the transaction.
1603 * to be put first so they are processed before any items that might 1603 *
1604 * modify the buffers. If they are cancelled, then the modifications 1604 * The ordering constraints are defined by the inode allocation and unlink
1605 * don't need to be replayed. 1605 * behaviour. The rules are:
1606 *
1607 * 1. Every item is only logged once in a given transaction. Hence it
1608 * represents the last logged state of the item. Hence ordering is
1609 * dependent on the order in which operations need to be performed so
1610 * required initial conditions are always met.
1611 *
1612 * 2. Cancelled buffers are recorded in pass 1 in a separate table and
1613 * there's nothing to replay from them so we can simply cull them
1614 * from the transaction. However, we can't do that until after we've
1615 * replayed all the other items because they may be dependent on the
1616 * cancelled buffer and replaying the cancelled buffer can remove it
1617 * from the cancelled buffer table. Hence they have to be done last.
1618 *
1619 * 3. Inode allocation buffers must be replayed before inode items that
1620 * read the buffer and replay changes into it.
1621 *
1622 * 4. Inode unlink buffers must be replayed after inode items are replayed.
1623 * This ensures that inodes are completely flushed to the inode buffer
1624 * in a "free" state before we remove the unlinked inode list pointer.
1625 *
1626 * Hence the ordering needs to be inode allocation buffers first, inode items
1627 * second, inode unlink buffers third and cancelled buffers last.
1628 *
1629 * But there's a problem with that - we can't tell an inode allocation buffer
1630 * apart from a regular buffer, so we can't separate them. We can, however,
1631 * tell an inode unlink buffer from the others, and so we can separate them out
1632 * from all the other buffers and move them to last.
1633 *
1634 * Hence, 4 lists, in order from head to tail:
1635 * - buffer_list for all buffers except cancelled/inode unlink buffers
1636 * - item_list for all non-buffer items
1637 * - inode_buffer_list for inode unlink buffers
1638 * - cancel_list for the cancelled buffers
1606 */ 1639 */
1607STATIC int 1640STATIC int
1608xlog_recover_reorder_trans( 1641xlog_recover_reorder_trans(
@@ -1612,6 +1645,10 @@ xlog_recover_reorder_trans(
1612{ 1645{
1613 xlog_recover_item_t *item, *n; 1646 xlog_recover_item_t *item, *n;
1614 LIST_HEAD(sort_list); 1647 LIST_HEAD(sort_list);
1648 LIST_HEAD(cancel_list);
1649 LIST_HEAD(buffer_list);
1650 LIST_HEAD(inode_buffer_list);
1651 LIST_HEAD(inode_list);
1615 1652
1616 list_splice_init(&trans->r_itemq, &sort_list); 1653 list_splice_init(&trans->r_itemq, &sort_list);
1617 list_for_each_entry_safe(item, n, &sort_list, ri_list) { 1654 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
@@ -1619,12 +1656,18 @@ xlog_recover_reorder_trans(
1619 1656
1620 switch (ITEM_TYPE(item)) { 1657 switch (ITEM_TYPE(item)) {
1621 case XFS_LI_BUF: 1658 case XFS_LI_BUF:
1622 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { 1659 if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1623 trace_xfs_log_recover_item_reorder_head(log, 1660 trace_xfs_log_recover_item_reorder_head(log,
1624 trans, item, pass); 1661 trans, item, pass);
1625 list_move(&item->ri_list, &trans->r_itemq); 1662 list_move(&item->ri_list, &cancel_list);
1663 break;
1664 }
1665 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1666 list_move(&item->ri_list, &inode_buffer_list);
1626 break; 1667 break;
1627 } 1668 }
1669 list_move_tail(&item->ri_list, &buffer_list);
1670 break;
1628 case XFS_LI_INODE: 1671 case XFS_LI_INODE:
1629 case XFS_LI_DQUOT: 1672 case XFS_LI_DQUOT:
1630 case XFS_LI_QUOTAOFF: 1673 case XFS_LI_QUOTAOFF:
@@ -1632,7 +1675,7 @@ xlog_recover_reorder_trans(
1632 case XFS_LI_EFI: 1675 case XFS_LI_EFI:
1633 trace_xfs_log_recover_item_reorder_tail(log, 1676 trace_xfs_log_recover_item_reorder_tail(log,
1634 trans, item, pass); 1677 trans, item, pass);
1635 list_move_tail(&item->ri_list, &trans->r_itemq); 1678 list_move_tail(&item->ri_list, &inode_list);
1636 break; 1679 break;
1637 default: 1680 default:
1638 xfs_warn(log->l_mp, 1681 xfs_warn(log->l_mp,
@@ -1643,6 +1686,14 @@ xlog_recover_reorder_trans(
1643 } 1686 }
1644 } 1687 }
1645 ASSERT(list_empty(&sort_list)); 1688 ASSERT(list_empty(&sort_list));
1689 if (!list_empty(&buffer_list))
1690 list_splice(&buffer_list, &trans->r_itemq);
1691 if (!list_empty(&inode_list))
1692 list_splice_tail(&inode_list, &trans->r_itemq);
1693 if (!list_empty(&inode_buffer_list))
1694 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1695 if (!list_empty(&cancel_list))
1696 list_splice_tail(&cancel_list, &trans->r_itemq);
1646 return 0; 1697 return 0;
1647} 1698}
1648 1699
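The comment block above describes a bucket sort: classify each log item into one of four lists, then splice the lists back head-to-tail in replay order. A self-contained userspace sketch of the same idea, with illustrative item names rather than the kernel's list_head machinery:

#include <stdio.h>

enum kind { BUF, INODE_ITEM, INODE_UNLINK_BUF, CANCEL_BUF, NKINDS };

struct item {
        enum kind        kind;
        const char      *name;
        struct item     *next;
};

int main(void)
{
        struct item items[] = {
                { CANCEL_BUF,       "cancelled buf", NULL },
                { INODE_ITEM,       "inode item",    NULL },
                { BUF,              "alloc buf",     NULL },
                { INODE_UNLINK_BUF, "unlink buf",    NULL },
        };
        /* one bucket per class, replayed head-to-tail in this order */
        static const enum kind order[NKINDS] =
                { BUF, INODE_ITEM, INODE_UNLINK_BUF, CANCEL_BUF };
        struct item *head[NKINDS] = { NULL }, *tail[NKINDS] = { NULL };
        size_t i;
        int k;

        /* stable partition: like list_move_tail() per bucket */
        for (i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
                struct item *it = &items[i];

                if (tail[it->kind])
                        tail[it->kind]->next = it;
                else
                        head[it->kind] = it;
                tail[it->kind] = it;
        }
        /* splice buckets back in the required replay order */
        for (k = 0; k < NKINDS; k++)
                for (struct item *it = head[order[k]]; it; it = it->next)
                        printf("replay: %s\n", it->name);
        return 0;
}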
@@ -1794,7 +1845,13 @@ xlog_recover_do_inode_buffer(
1794 xfs_agino_t *buffer_nextp; 1845 xfs_agino_t *buffer_nextp;
1795 1846
1796 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1847 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1797 bp->b_ops = &xfs_inode_buf_ops; 1848
1849 /*
1850 * Post recovery validation only works properly on CRC enabled
1851 * filesystems.
1852 */
1853 if (xfs_sb_version_hascrc(&mp->m_sb))
1854 bp->b_ops = &xfs_inode_buf_ops;
1798 1855
1799 inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; 1856 inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
1800 for (i = 0; i < inodes_per_buf; i++) { 1857 for (i = 0; i < inodes_per_buf; i++) {
@@ -1861,6 +1918,15 @@ xlog_recover_do_inode_buffer(
1861 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, 1918 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1862 next_unlinked_offset); 1919 next_unlinked_offset);
1863 *buffer_nextp = *logged_nextp; 1920 *buffer_nextp = *logged_nextp;
1921
1922 /*
1923 * If necessary, recalculate the CRC in the on-disk inode. We
1924 * have to leave the inode in a consistent state for whoever
1925 * reads it next....
1926 */
1927 xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
1928 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
1929
1864 } 1930 }
1865 1931
1866 return 0; 1932 return 0;
@@ -2097,6 +2163,17 @@ xlog_recover_do_reg_buffer(
2097 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); 2163 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2098 2164
2099 /* 2165 /*
2166 * The dirty regions logged in the buffer, even though
2167 * contiguous, may span multiple chunks. This is because the
2168 * dirty region may span a physical page boundary in a buffer
2169 * and hence be split into two separate vectors for writing into
2170 * the log. Hence we need to trim nbits back to the length of
2171 * the current region being copied out of the log.
2172 */
2173 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2174 nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2175
2176 /*
2100 * Do a sanity check if this is a dquot buffer. Just checking 2177 * Do a sanity check if this is a dquot buffer. Just checking
2101 * the first dquot in the buffer should do. XXXThis is 2178 * the first dquot in the buffer should do. XXXThis is
2102 * probably a good thing to do for other buf types also. 2179 * probably a good thing to do for other buf types also.
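The nbits trim is plain chunk arithmetic: buffer log ranges are measured in 128-byte chunks (XFS_BLF_SHIFT is assumed here to be 7, matching a 128-byte XFS_BLF_CHUNK), so a log vector shorter than nbits << XFS_BLF_SHIFT bytes can only supply i_len >> XFS_BLF_SHIFT chunks. A worked example:

#include <stdio.h>

#define BLF_SHIFT 7     /* 128-byte chunks; assumed value */

int main(void)
{
        int nbits = 32;         /* contiguous dirty chunks per bitmap */
        int i_len = 2048;       /* bytes carried by this log vector  */

        /* copy only what this vector holds; the rest comes later */
        if (i_len < (nbits << BLF_SHIFT))
                nbits = i_len >> BLF_SHIFT;
        printf("copy %d chunks (%d bytes)\n", nbits, nbits << BLF_SHIFT);
        return 0;
}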
@@ -2134,7 +2211,16 @@ xlog_recover_do_reg_buffer(
2134 /* Shouldn't be any more regions */ 2211 /* Shouldn't be any more regions */
2135 ASSERT(i == item->ri_total); 2212 ASSERT(i == item->ri_total);
2136 2213
2137 xlog_recovery_validate_buf_type(mp, bp, buf_f); 2214 /*
2215 * We can only do post recovery validation on items on CRC enabled
2216 * filesystems as we need to know when the buffer was written to be able
2217 * to determine if we should have replayed the item. If we replay old
2218 * metadata over a newer buffer, then it will enter a temporarily
2219 * inconsistent state resulting in verification failures. Hence for now
2220 * just avoid the verification stage for non-crc filesystems.
2221 */
2222 if (xfs_sb_version_hascrc(&mp->m_sb))
2223 xlog_recovery_validate_buf_type(mp, bp, buf_f);
2138} 2224}
2139 2225
2140/* 2226/*
@@ -2255,6 +2341,12 @@ xfs_qm_dqcheck(
2255 d->dd_diskdq.d_flags = type; 2341 d->dd_diskdq.d_flags = type;
2256 d->dd_diskdq.d_id = cpu_to_be32(id); 2342 d->dd_diskdq.d_id = cpu_to_be32(id);
2257 2343
2344 if (xfs_sb_version_hascrc(&mp->m_sb)) {
2345 uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
2346 xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
2347 XFS_DQUOT_CRC_OFF);
2348 }
2349
2258 return errs; 2350 return errs;
2259} 2351}
2260 2352
@@ -2782,6 +2874,10 @@ xlog_recover_dquot_pass2(
2782 } 2874 }
2783 2875
2784 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2876 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2877 if (xfs_sb_version_hascrc(&mp->m_sb)) {
2878 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
2879 XFS_DQUOT_CRC_OFF);
2880 }
2785 2881
2786 ASSERT(dq_f->qlf_size == 2); 2882 ASSERT(dq_f->qlf_size == 2);
2787 ASSERT(bp->b_target->bt_mount == mp); 2883 ASSERT(bp->b_target->bt_mount == mp);
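Both dquot hunks recompute the checksum over the full struct xfs_dqblk, storing the result at a fixed offset (XFS_DQUOT_CRC_OFF, defined later in this patch via offsetof()). The sketch below illustrates that store-at-offset pattern in userspace; the struct layout and the bitwise crc32c are stand-ins, not the kernel's implementation:

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct dqblk {
        char     core[104];     /* illustrative payload */
        uint32_t dd_crc;        /* checksum field       */
};
#define DQUOT_CRC_OFF offsetof(struct dqblk, dd_crc)

/* bitwise CRC-32C, stand-in for the kernel's crc32c() */
static uint32_t crc32c(const uint8_t *p, size_t len)
{
        uint32_t crc = ~0u;

        while (len--) {
                crc ^= *p++;
                for (int k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
        }
        return ~crc;
}

/* checksum the block with the CRC field zeroed, store at crc_off */
static void update_cksum(void *buf, size_t len, size_t crc_off)
{
        uint32_t crc;

        memset((char *)buf + crc_off, 0, sizeof(uint32_t));
        crc = crc32c(buf, len);
        memcpy((char *)buf + crc_off, &crc, sizeof(crc));
}

int main(void)
{
        struct dqblk d = { .core = "quota" };

        update_cksum(&d, sizeof(d), DQUOT_CRC_OFF);
        printf("crc=%08x\n", d.dd_crc);
        return 0;
}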
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f6bfbd734669..e8e310c05097 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -314,7 +314,8 @@ STATIC int
314xfs_mount_validate_sb( 314xfs_mount_validate_sb(
315 xfs_mount_t *mp, 315 xfs_mount_t *mp,
316 xfs_sb_t *sbp, 316 xfs_sb_t *sbp,
317 bool check_inprogress) 317 bool check_inprogress,
318 bool check_version)
318{ 319{
319 320
320 /* 321 /*
@@ -337,9 +338,10 @@ xfs_mount_validate_sb(
337 338
338 /* 339 /*
339 * Version 5 superblock feature mask validation. Reject combinations the 340 * Version 5 superblock feature mask validation. Reject combinations the
340 * kernel cannot support up front before checking anything else. 341 * kernel cannot support up front before checking anything else. For
342 * write validation, we don't need to check feature masks.
341 */ 343 */
342 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { 344 if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
343 xfs_alert(mp, 345 xfs_alert(mp,
344"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" 346"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
345"Use of these features in this kernel is at your own risk!"); 347"Use of these features in this kernel is at your own risk!");
@@ -675,7 +677,8 @@ xfs_sb_to_disk(
675 677
676static int 678static int
677xfs_sb_verify( 679xfs_sb_verify(
678 struct xfs_buf *bp) 680 struct xfs_buf *bp,
681 bool check_version)
679{ 682{
680 struct xfs_mount *mp = bp->b_target->bt_mount; 683 struct xfs_mount *mp = bp->b_target->bt_mount;
681 struct xfs_sb sb; 684 struct xfs_sb sb;
@@ -686,7 +689,8 @@ xfs_sb_verify(
686 * Only check the in progress field for the primary superblock as 689 * Only check the in progress field for the primary superblock as
687 * mkfs.xfs doesn't clear it from secondary superblocks. 690 * mkfs.xfs doesn't clear it from secondary superblocks.
688 */ 691 */
689 return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); 692 return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
693 check_version);
690} 694}
691 695
692/* 696/*
@@ -719,7 +723,7 @@ xfs_sb_read_verify(
719 goto out_error; 723 goto out_error;
720 } 724 }
721 } 725 }
722 error = xfs_sb_verify(bp); 726 error = xfs_sb_verify(bp, true);
723 727
724out_error: 728out_error:
725 if (error) { 729 if (error) {
@@ -758,7 +762,7 @@ xfs_sb_write_verify(
758 struct xfs_buf_log_item *bip = bp->b_fspriv; 762 struct xfs_buf_log_item *bip = bp->b_fspriv;
759 int error; 763 int error;
760 764
761 error = xfs_sb_verify(bp); 765 error = xfs_sb_verify(bp, false);
762 if (error) { 766 if (error) {
763 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 767 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
764 xfs_buf_ioerror(bp, error); 768 xfs_buf_ioerror(bp, error);
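The check_version flag splits superblock validation into a strict read path and a relaxed write path: unknown-feature rejection only makes sense when first reading the superblock, since by write time it has already been accepted. In miniature, with illustrative names rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct sb { int version; unsigned feature_mask; };

static int validate_sb(const struct sb *sb, bool check_version)
{
        /* feature-mask gating only on the read side */
        if (check_version && sb->version == 5 &&
            (sb->feature_mask & ~0x7u))         /* unknown feature bits */
                return -1;
        /* ... structural checks common to read and write ... */
        return 0;
}

int main(void)
{
        struct sb sb = { .version = 5, .feature_mask = 0x10 };

        printf("read:  %s\n", validate_sb(&sb, true)  ? "reject" : "ok");
        printf("write: %s\n", validate_sb(&sb, false) ? "reject" : "ok");
        return 0;
}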
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f41702b43003..b75c9bb6e71e 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -41,6 +41,7 @@
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h" 43#include "xfs_icache.h"
44#include "xfs_cksum.h"
44 45
45/* 46/*
46 * The global quota manager. There is only one of these for the entire 47 * The global quota manager. There is only one of these for the entire
@@ -839,7 +840,7 @@ xfs_qm_reset_dqcounts(
839 xfs_dqid_t id, 840 xfs_dqid_t id,
840 uint type) 841 uint type)
841{ 842{
842 xfs_disk_dquot_t *ddq; 843 struct xfs_dqblk *dqb;
843 int j; 844 int j;
844 845
845 trace_xfs_reset_dqcounts(bp, _RET_IP_); 846 trace_xfs_reset_dqcounts(bp, _RET_IP_);
@@ -853,8 +854,12 @@ xfs_qm_reset_dqcounts(
853 do_div(j, sizeof(xfs_dqblk_t)); 854 do_div(j, sizeof(xfs_dqblk_t));
854 ASSERT(mp->m_quotainfo->qi_dqperchunk == j); 855 ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
855#endif 856#endif
856 ddq = bp->b_addr; 857 dqb = bp->b_addr;
857 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { 858 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
859 struct xfs_disk_dquot *ddq;
860
861 ddq = (struct xfs_disk_dquot *)&dqb[j];
862
858 /* 863 /*
859 * Do a sanity check, and if needed, repair the dqblk. Don't 864 * Do a sanity check, and if needed, repair the dqblk. Don't
860 * output any warnings because it's perfectly possible to 865 * output any warnings because it's perfectly possible to
@@ -871,7 +876,12 @@ xfs_qm_reset_dqcounts(
871 ddq->d_bwarns = 0; 876 ddq->d_bwarns = 0;
872 ddq->d_iwarns = 0; 877 ddq->d_iwarns = 0;
873 ddq->d_rtbwarns = 0; 878 ddq->d_rtbwarns = 0;
874 ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); 879
880 if (xfs_sb_version_hascrc(&mp->m_sb)) {
881 xfs_update_cksum((char *)&dqb[j],
882 sizeof(struct xfs_dqblk),
883 XFS_DQUOT_CRC_OFF);
884 }
875 } 885 }
876} 886}
877 887
@@ -907,19 +917,29 @@ xfs_qm_dqiter_bufs(
907 XFS_FSB_TO_DADDR(mp, bno), 917 XFS_FSB_TO_DADDR(mp, bno),
908 mp->m_quotainfo->qi_dqchunklen, 0, &bp, 918 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
909 &xfs_dquot_buf_ops); 919 &xfs_dquot_buf_ops);
910 if (error)
911 break;
912 920
913 /* 921 /*
914 * XXX(hch): need to figure out if it makes sense to validate 922 * CRC and validation errors will return an EFSCORRUPTED here. If
915 * the CRC here. 923 * this occurs, re-read without CRC validation so that we can
924 * repair the damage via xfs_qm_reset_dqcounts(). This process
925 * will leave a trace in the log indicating corruption has
926 * been detected.
916 */ 927 */
928 if (error == EFSCORRUPTED) {
929 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
930 XFS_FSB_TO_DADDR(mp, bno),
931 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
932 NULL);
933 }
934
935 if (error)
936 break;
937
917 xfs_qm_reset_dqcounts(mp, bp, firstid, type); 938 xfs_qm_reset_dqcounts(mp, bp, firstid, type);
918 xfs_buf_delwri_queue(bp, buffer_list); 939 xfs_buf_delwri_queue(bp, buffer_list);
919 xfs_buf_relse(bp); 940 xfs_buf_relse(bp);
920 /* 941
921 * goto the next block. 942 /* goto the next block. */
922 */
923 bno++; 943 bno++;
924 firstid += mp->m_quotainfo->qi_dqperchunk; 944 firstid += mp->m_quotainfo->qi_dqperchunk;
925 } 945 }
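The quotacheck path deliberately retries a failed verified read without a verifier so that xfs_qm_reset_dqcounts() can repair and re-checksum the blocks. The retry shape, reduced to a runnable toy (read_block() is a stand-in; 117 matches the errno value XFS maps EFSCORRUPTED to via EUCLEAN):

#include <stdio.h>

#define EFSCORRUPTED 117

/* stand-in read: the verified variant reports corruption */
static int read_block(int *blk, int verify)
{
        *blk = 42;
        return verify ? EFSCORRUPTED : 0;
}

static int read_for_repair(int *blk)
{
        int error = read_block(blk, 1);

        if (error == EFSCORRUPTED)              /* CRC/verifier failure */
                error = read_block(blk, 0);     /* raw re-read          */
        return error;   /* caller repairs and re-checksums the block */
}

int main(void)
{
        int blk;

        printf("read_for_repair -> %d\n", read_for_repair(&blk));
        return 0;
}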
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index c41190cad6e9..6cdf6ffc36a1 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -489,31 +489,36 @@ xfs_qm_scall_setqlim(
489 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) 489 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
490 return 0; 490 return 0;
491 491
492 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
493 error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
494 0, 0, XFS_DEFAULT_LOG_COUNT);
495 if (error) {
496 xfs_trans_cancel(tp, 0);
497 return (error);
498 }
499
500 /* 492 /*
501 * We don't want to race with a quotaoff so take the quotaoff lock. 493 * We don't want to race with a quotaoff so take the quotaoff lock.
502 * (We don't hold an inode lock, so there's nothing else to stop 494 * We don't hold an inode lock, so there's nothing else to stop
503 * a quotaoff from happening). (XXXThis doesn't currently happen 495 * a quotaoff from happening.
504 * because we take the vfslock before calling xfs_qm_sysent).
505 */ 496 */
506 mutex_lock(&q->qi_quotaofflock); 497 mutex_lock(&q->qi_quotaofflock);
507 498
508 /* 499 /*
509 * Get the dquot (locked), and join it to the transaction. 500 * Get the dquot (locked) before we start, as we need to do a
510 * Allocate the dquot if this doesn't exist. 501 * transaction to allocate it if it doesn't exist. Once we have the
502 * dquot, unlock it so we can start the next transaction safely. We hold
503 * a reference to the dquot, so it's safe to do this unlock/lock without
504 * it being reclaimed in the meantime.
511 */ 505 */
512 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { 506 error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
513 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 507 if (error) {
514 ASSERT(error != ENOENT); 508 ASSERT(error != ENOENT);
515 goto out_unlock; 509 goto out_unlock;
516 } 510 }
511 xfs_dqunlock(dqp);
512
513 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
514 error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
515 0, 0, XFS_DEFAULT_LOG_COUNT);
516 if (error) {
517 xfs_trans_cancel(tp, 0);
518 goto out_rele;
519 }
520
521 xfs_dqlock(dqp);
517 xfs_trans_dqjoin(tp, dqp); 522 xfs_trans_dqjoin(tp, dqp);
518 ddq = &dqp->q_core; 523 ddq = &dqp->q_core;
519 524
@@ -621,9 +626,10 @@ xfs_qm_scall_setqlim(
621 xfs_trans_log_dquot(tp, dqp); 626 xfs_trans_log_dquot(tp, dqp);
622 627
623 error = xfs_trans_commit(tp, 0); 628 error = xfs_trans_commit(tp, 0);
624 xfs_qm_dqrele(dqp);
625 629
626 out_unlock: 630out_rele:
631 xfs_qm_dqrele(dqp);
632out_unlock:
627 mutex_unlock(&q->qi_quotaofflock); 633 mutex_unlock(&q->qi_quotaofflock);
628 return error; 634 return error;
629} 635}
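The reordering in xfs_qm_scall_setqlim() exists for two reasons: xfs_qm_dqget(..., XFS_QMOPT_DQALLOC, ...) may itself run a transaction to allocate the dquot, so it cannot be called with the setqlim transaction already reserved; and the dquot lock must be dropped across the reservation so we never block on log space while holding it. The shape of the fix as a pthread toy, with pseudolocks that are not the kernel API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dq_lock = PTHREAD_MUTEX_INITIALIZER;

static int reserve_transaction(void)
{
        /* may block on log space; must not hold dq_lock here */
        return 0;
}

int main(void)
{
        pthread_mutex_lock(&dq_lock);   /* dqget() returns it locked    */
        /* a held reference keeps the object alive...                   */
        pthread_mutex_unlock(&dq_lock); /* ...so the lock can be dropped */

        if (reserve_transaction())
                return 1;

        pthread_mutex_lock(&dq_lock);   /* relock, join to transaction  */
        printf("limits updated under lock\n");
        pthread_mutex_unlock(&dq_lock);
        return 0;
}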
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c61e31c7d997..c38068f26c55 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -87,6 +87,8 @@ typedef struct xfs_dqblk {
87 uuid_t dd_uuid; /* location information */ 87 uuid_t dd_uuid; /* location information */
88} xfs_dqblk_t; 88} xfs_dqblk_t;
89 89
90#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
91
90/* 92/*
91 * flags for q_flags field in the dquot. 93 * flags for q_flags field in the dquot.
92 */ 94 */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ea341cea68cb..3033ba5e9762 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1373,6 +1373,17 @@ xfs_finish_flags(
1373 } 1373 }
1374 1374
1375 /* 1375 /*
1376 * V5 filesystems always use attr2 format for attributes.
1377 */
1378 if (xfs_sb_version_hascrc(&mp->m_sb) &&
1379 (mp->m_flags & XFS_MOUNT_NOATTR2)) {
1380 xfs_warn(mp,
1381"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
1382 MNTOPT_NOATTR2, MNTOPT_ATTR2);
1383 return XFS_ERROR(EINVAL);
1384 }
1385
1386 /*
1376 * mkfs'ed attr2 will turn on attr2 mount unless explicitly 1387 * mkfs'ed attr2 will turn on attr2 mount unless explicitly
1377 * told by noattr2 to turn it off 1388 * told by noattr2 to turn it off
1378 */ 1389 */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 5f234389327c..195a403e1522 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -56,16 +56,9 @@ xfs_symlink_blocks(
56 struct xfs_mount *mp, 56 struct xfs_mount *mp,
57 int pathlen) 57 int pathlen)
58{ 58{
59 int fsblocks = 0; 59 int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
60 int len = pathlen;
61 60
62 do { 61 return (pathlen + buflen - 1) / buflen;
63 fsblocks++;
64 len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
65 } while (len > 0);
66
67 ASSERT(fsblocks <= XFS_SYMLINK_MAPS);
68 return fsblocks;
69} 62}
70 63
71static int 64static int
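The rewritten xfs_symlink_blocks() replaces the counting loop with the standard ceiling-division identity ceil(a/b) == (a + b - 1) / b, where buflen is the per-block payload left after the symlink header. A quick check (960 is an illustrative payload size, not a value from the patch):

#include <stdio.h>

static int symlink_blocks(int pathlen, int buflen)
{
        return (pathlen + buflen - 1) / buflen;
}

int main(void)
{
        int buflen = 960;

        printf("%d\n", symlink_blocks(1,   buflen));    /* 1 */
        printf("%d\n", symlink_blocks(960, buflen));    /* 1 */
        printf("%d\n", symlink_blocks(961, buflen));    /* 2 */
        return 0;
}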
@@ -405,7 +398,7 @@ xfs_symlink(
405 if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) 398 if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
406 fs_blocks = 0; 399 fs_blocks = 0;
407 else 400 else
408 fs_blocks = XFS_B_TO_FSB(mp, pathlen); 401 fs_blocks = xfs_symlink_blocks(mp, pathlen);
409 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); 402 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
410 error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, 403 error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
411 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); 404 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
@@ -512,7 +505,7 @@ xfs_symlink(
512 cur_chunk = target_path; 505 cur_chunk = target_path;
513 offset = 0; 506 offset = 0;
514 for (n = 0; n < nmaps; n++) { 507 for (n = 0; n < nmaps; n++) {
515 char *buf; 508 char *buf;
516 509
517 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 510 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
518 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 511 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
@@ -525,9 +518,7 @@ xfs_symlink(
525 bp->b_ops = &xfs_symlink_buf_ops; 518 bp->b_ops = &xfs_symlink_buf_ops;
526 519
527 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); 520 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
528 if (pathlen < byte_cnt) { 521 byte_cnt = min(byte_cnt, pathlen);
529 byte_cnt = pathlen;
530 }
531 522
532 buf = bp->b_addr; 523 buf = bp->b_addr;
533 buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, 524 buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
@@ -542,6 +533,7 @@ xfs_symlink(
542 xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - 533 xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
543 (char *)bp->b_addr); 534 (char *)bp->b_addr);
544 } 535 }
536 ASSERT(pathlen == 0);
545 } 537 }
546 538
547 /* 539 /*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aa4db3307d36..a04701de6bbd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -974,14 +974,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read);
974DEFINE_RW_EVENT(xfs_file_splice_write); 974DEFINE_RW_EVENT(xfs_file_splice_write);
975 975
976DECLARE_EVENT_CLASS(xfs_page_class, 976DECLARE_EVENT_CLASS(xfs_page_class,
977 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), 977 TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
978 TP_ARGS(inode, page, off), 978 unsigned int len),
979 TP_ARGS(inode, page, off, len),
979 TP_STRUCT__entry( 980 TP_STRUCT__entry(
980 __field(dev_t, dev) 981 __field(dev_t, dev)
981 __field(xfs_ino_t, ino) 982 __field(xfs_ino_t, ino)
982 __field(pgoff_t, pgoff) 983 __field(pgoff_t, pgoff)
983 __field(loff_t, size) 984 __field(loff_t, size)
984 __field(unsigned long, offset) 985 __field(unsigned long, offset)
986 __field(unsigned int, length)
985 __field(int, delalloc) 987 __field(int, delalloc)
986 __field(int, unwritten) 988 __field(int, unwritten)
987 ), 989 ),
@@ -995,24 +997,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
995 __entry->pgoff = page_offset(page); 997 __entry->pgoff = page_offset(page);
996 __entry->size = i_size_read(inode); 998 __entry->size = i_size_read(inode);
997 __entry->offset = off; 999 __entry->offset = off;
1000 __entry->length = len;
998 __entry->delalloc = delalloc; 1001 __entry->delalloc = delalloc;
999 __entry->unwritten = unwritten; 1002 __entry->unwritten = unwritten;
1000 ), 1003 ),
1001 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " 1004 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
1002 "delalloc %d unwritten %d", 1005 "length %x delalloc %d unwritten %d",
1003 MAJOR(__entry->dev), MINOR(__entry->dev), 1006 MAJOR(__entry->dev), MINOR(__entry->dev),
1004 __entry->ino, 1007 __entry->ino,
1005 __entry->pgoff, 1008 __entry->pgoff,
1006 __entry->size, 1009 __entry->size,
1007 __entry->offset, 1010 __entry->offset,
1011 __entry->length,
1008 __entry->delalloc, 1012 __entry->delalloc,
1009 __entry->unwritten) 1013 __entry->unwritten)
1010) 1014)
1011 1015
1012#define DEFINE_PAGE_EVENT(name) \ 1016#define DEFINE_PAGE_EVENT(name) \
1013DEFINE_EVENT(xfs_page_class, name, \ 1017DEFINE_EVENT(xfs_page_class, name, \
1014 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ 1018 TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
1015 TP_ARGS(inode, page, off)) 1019 unsigned int len), \
1020 TP_ARGS(inode, page, off, len))
1016DEFINE_PAGE_EVENT(xfs_writepage); 1021DEFINE_PAGE_EVENT(xfs_writepage);
1017DEFINE_PAGE_EVENT(xfs_releasepage); 1022DEFINE_PAGE_EVENT(xfs_releasepage);
1018DEFINE_PAGE_EVENT(xfs_invalidatepage); 1023DEFINE_PAGE_EVENT(xfs_invalidatepage);
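With the extra len argument in TP_PROTO, call sites pass the affected length through alongside the offset; a caller would now look roughly like this sketch, based only on the prototype change above:

        /* illustrative call site; argument names are assumptions */
        trace_xfs_invalidatepage(page->mapping->host, page, offset, length);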
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 1501f4fa51a6..0176bb21f09a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1453,7 +1453,7 @@ xfs_free_file_space(
1453 xfs_mount_t *mp; 1453 xfs_mount_t *mp;
1454 int nimap; 1454 int nimap;
1455 uint resblks; 1455 uint resblks;
1456 uint rounding; 1456 xfs_off_t rounding;
1457 int rt; 1457 int rt;
1458 xfs_fileoff_t startoffset_fsb; 1458 xfs_fileoff_t startoffset_fsb;
1459 xfs_trans_t *tp; 1459 xfs_trans_t *tp;
@@ -1482,7 +1482,7 @@ xfs_free_file_space(
1482 inode_dio_wait(VFS_I(ip)); 1482 inode_dio_wait(VFS_I(ip));
1483 } 1483 }
1484 1484
1485 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1485 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1486 ioffset = offset & ~(rounding - 1); 1486 ioffset = offset & ~(rounding - 1);
1487 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 1487 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1488 ioffset, -1); 1488 ioffset, -1);
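Widening rounding matters because of C's usual arithmetic conversions: with a 32-bit uint, ~(rounding - 1) is computed in 32 bits and zero-extends when combined with the 64-bit offset, so the mask silently clears the upper 32 bits of any offset past 4GiB. A runnable demonstration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int64_t  offset     = 0x100000123LL;    /* > 4GiB into the file */
        uint32_t rounding   = 4096;             /* old type: uint       */
        int64_t  rounding64 = 4096;             /* new type: xfs_off_t  */

        /* the 32-bit mask zero-extends: high 32 bits of offset lost */
        printf("buggy: 0x%llx\n",
               (long long)(offset & ~(rounding - 1)));   /* 0x0 */
        printf("fixed: 0x%llx\n",
               (long long)(offset & ~(rounding64 - 1))); /* 0x100000000 */
        return 0;
}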
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 5163022d9808..38c67c34d73f 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,8 +31,7 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
31 struct xfs_inode *ip); 31 struct xfs_inode *ip);
32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, 32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
33 struct xfs_name *target_name); 33 struct xfs_name *target_name);
34int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, 34int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
35 xfs_off_t *offset, filldir_t filldir);
36int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 35int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
37 const char *target_path, umode_t mode, struct xfs_inode **ipp); 36 const char *target_path, umode_t mode, struct xfs_inode **ipp);
38int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 37int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);