aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c21
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/vfs_inode.c126
-rw-r--r--fs/9p/vfs_super.c39
-rw-r--r--fs/afs/file.c18
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/binfmt_elf.c28
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/inode.c21
-rw-r--r--fs/buffer.c9
-rw-r--r--fs/char_dev.c40
-rw-r--r--fs/compat.c17
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/exec.c67
-rw-r--r--fs/ext2/acl.c8
-rw-r--r--fs/ext2/acl.h4
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/namei.c8
-rw-r--r--fs/ext3/Kconfig32
-rw-r--r--fs/ext3/acl.c8
-rw-r--r--fs/ext3/acl.h4
-rw-r--r--fs/ext3/file.c2
-rw-r--r--fs/ext3/namei.c4
-rw-r--r--fs/ext3/super.c40
-rw-r--r--fs/ext4/acl.c8
-rw-r--r--fs/ext4/acl.h4
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/namei.c4
-rw-r--r--fs/fs-writeback.c1065
-rw-r--r--fs/fuse/inode.c1
-rw-r--r--fs/gfs2/sys.c20
-rw-r--r--fs/hugetlbfs/inode.c21
-rw-r--r--fs/jffs2/acl.c7
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/dir.c2
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/symlink.c2
-rw-r--r--fs/jffs2/wbuf.c10
-rw-r--r--fs/jfs/acl.c7
-rw-r--r--fs/jfs/file.c2
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/locks.c2
-rw-r--r--fs/namei.c110
-rw-r--r--fs/nfs/client.c1
-rw-r--r--fs/nfs/nfs4state.c4
-rw-r--r--fs/nfsd/auth.c4
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/vfs.c3
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c46
-rw-r--r--fs/notify/inotify/inotify_user.c254
-rw-r--r--fs/notify/notification.c11
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c4
-rw-r--r--fs/ocfs2/dcache.c11
-rw-r--r--fs/ocfs2/dlm/dlmfs.c1
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c4
-rw-r--r--fs/ocfs2/ocfs2_lockid.h1
-rw-r--r--fs/ocfs2/quota_global.c10
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/open.c12
-rw-r--r--fs/proc/base.c19
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/select.c1
-rw-r--r--fs/super.c5
-rw-r--r--fs/sync.c20
-rw-r--r--fs/sysfs/dir.c1
-rw-r--r--fs/sysfs/inode.c135
-rw-r--r--fs/sysfs/symlink.c2
-rw-r--r--fs/sysfs/sysfs.h12
-rw-r--r--fs/ubifs/budget.c16
-rw-r--r--fs/ubifs/super.c9
-rw-r--r--fs/xattr.c55
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c16
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c13
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h1
-rw-r--r--fs/xfs/xfs_iget.c113
82 files changed, 1677 insertions, 908 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 332b5ff02fec..f7003cfac63d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -76,7 +76,7 @@ static const match_table_t tokens = {
76 * Return 0 upon success, -ERRNO upon failure. 76 * Return 0 upon success, -ERRNO upon failure.
77 */ 77 */
78 78
79static int v9fs_parse_options(struct v9fs_session_info *v9ses) 79static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
80{ 80{
81 char *options; 81 char *options;
82 substring_t args[MAX_OPT_ARGS]; 82 substring_t args[MAX_OPT_ARGS];
@@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
90 v9ses->debug = 0; 90 v9ses->debug = 0;
91 v9ses->cache = 0; 91 v9ses->cache = 0;
92 92
93 if (!v9ses->options) 93 if (!opts)
94 return 0; 94 return 0;
95 95
96 options = kstrdup(v9ses->options, GFP_KERNEL); 96 options = kstrdup(opts, GFP_KERNEL);
97 if (!options) { 97 if (!options) {
98 P9_DPRINTK(P9_DEBUG_ERROR, 98 P9_DPRINTK(P9_DEBUG_ERROR,
99 "failed to allocate copy of option string\n"); 99 "failed to allocate copy of option string\n");
@@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
206 v9ses->uid = ~0; 206 v9ses->uid = ~0;
207 v9ses->dfltuid = V9FS_DEFUID; 207 v9ses->dfltuid = V9FS_DEFUID;
208 v9ses->dfltgid = V9FS_DEFGID; 208 v9ses->dfltgid = V9FS_DEFGID;
209 if (data) {
210 v9ses->options = kstrdup(data, GFP_KERNEL);
211 if (!v9ses->options) {
212 P9_DPRINTK(P9_DEBUG_ERROR,
213 "failed to allocate copy of option string\n");
214 retval = -ENOMEM;
215 goto error;
216 }
217 }
218 209
219 rc = v9fs_parse_options(v9ses); 210 rc = v9fs_parse_options(v9ses, data);
220 if (rc < 0) { 211 if (rc < 0) {
221 retval = rc; 212 retval = rc;
222 goto error; 213 goto error;
223 } 214 }
224 215
225 v9ses->clnt = p9_client_create(dev_name, v9ses->options); 216 v9ses->clnt = p9_client_create(dev_name, data);
226
227 if (IS_ERR(v9ses->clnt)) { 217 if (IS_ERR(v9ses->clnt)) {
228 retval = PTR_ERR(v9ses->clnt); 218 retval = PTR_ERR(v9ses->clnt);
229 v9ses->clnt = NULL; 219 v9ses->clnt = NULL;
@@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
280 270
281 __putname(v9ses->uname); 271 __putname(v9ses->uname);
282 __putname(v9ses->aname); 272 __putname(v9ses->aname);
283 kfree(v9ses->options);
284} 273}
285 274
286/** 275/**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a7d567192998..38762bf102a9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -85,7 +85,6 @@ struct v9fs_session_info {
85 unsigned int afid; 85 unsigned int afid;
86 unsigned int cache; 86 unsigned int cache;
87 87
88 char *options; /* copy of mount options */
89 char *uname; /* user name to mount as */ 88 char *uname; /* user name to mount as */
90 char *aname; /* name of remote hierarchy being mounted */ 89 char *aname; /* name of remote hierarchy being mounted */
91 unsigned int maxdata; /* max data for client interface */ 90 unsigned int maxdata; /* max data for client interface */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 81f8bbf12f9f..06a223d50a81 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended)
171 171
172/** 172/**
173 * v9fs_blank_wstat - helper function to setup a 9P stat structure 173 * v9fs_blank_wstat - helper function to setup a 9P stat structure
174 * @v9ses: 9P session info (for determining extended mode)
175 * @wstat: structure to initialize 174 * @wstat: structure to initialize
176 * 175 *
177 */ 176 */
@@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
207 206
208struct inode *v9fs_get_inode(struct super_block *sb, int mode) 207struct inode *v9fs_get_inode(struct super_block *sb, int mode)
209{ 208{
209 int err;
210 struct inode *inode; 210 struct inode *inode;
211 struct v9fs_session_info *v9ses = sb->s_fs_info; 211 struct v9fs_session_info *v9ses = sb->s_fs_info;
212 212
213 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); 213 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
214 214
215 inode = new_inode(sb); 215 inode = new_inode(sb);
216 if (inode) { 216 if (!inode) {
217 inode->i_mode = mode;
218 inode->i_uid = current_fsuid();
219 inode->i_gid = current_fsgid();
220 inode->i_blocks = 0;
221 inode->i_rdev = 0;
222 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
223 inode->i_mapping->a_ops = &v9fs_addr_operations;
224
225 switch (mode & S_IFMT) {
226 case S_IFIFO:
227 case S_IFBLK:
228 case S_IFCHR:
229 case S_IFSOCK:
230 if (!v9fs_extended(v9ses)) {
231 P9_DPRINTK(P9_DEBUG_ERROR,
232 "special files without extended mode\n");
233 return ERR_PTR(-EINVAL);
234 }
235 init_special_inode(inode, inode->i_mode,
236 inode->i_rdev);
237 break;
238 case S_IFREG:
239 inode->i_op = &v9fs_file_inode_operations;
240 inode->i_fop = &v9fs_file_operations;
241 break;
242 case S_IFLNK:
243 if (!v9fs_extended(v9ses)) {
244 P9_DPRINTK(P9_DEBUG_ERROR,
245 "extended modes used w/o 9P2000.u\n");
246 return ERR_PTR(-EINVAL);
247 }
248 inode->i_op = &v9fs_symlink_inode_operations;
249 break;
250 case S_IFDIR:
251 inc_nlink(inode);
252 if (v9fs_extended(v9ses))
253 inode->i_op = &v9fs_dir_inode_operations_ext;
254 else
255 inode->i_op = &v9fs_dir_inode_operations;
256 inode->i_fop = &v9fs_dir_operations;
257 break;
258 default:
259 P9_DPRINTK(P9_DEBUG_ERROR,
260 "BAD mode 0x%x S_IFMT 0x%x\n",
261 mode, mode & S_IFMT);
262 return ERR_PTR(-EINVAL);
263 }
264 } else {
265 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); 217 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
266 return ERR_PTR(-ENOMEM); 218 return ERR_PTR(-ENOMEM);
267 } 219 }
220
221 inode->i_mode = mode;
222 inode->i_uid = current_fsuid();
223 inode->i_gid = current_fsgid();
224 inode->i_blocks = 0;
225 inode->i_rdev = 0;
226 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
227 inode->i_mapping->a_ops = &v9fs_addr_operations;
228
229 switch (mode & S_IFMT) {
230 case S_IFIFO:
231 case S_IFBLK:
232 case S_IFCHR:
233 case S_IFSOCK:
234 if (!v9fs_extended(v9ses)) {
235 P9_DPRINTK(P9_DEBUG_ERROR,
236 "special files without extended mode\n");
237 err = -EINVAL;
238 goto error;
239 }
240 init_special_inode(inode, inode->i_mode, inode->i_rdev);
241 break;
242 case S_IFREG:
243 inode->i_op = &v9fs_file_inode_operations;
244 inode->i_fop = &v9fs_file_operations;
245 break;
246 case S_IFLNK:
247 if (!v9fs_extended(v9ses)) {
248 P9_DPRINTK(P9_DEBUG_ERROR,
249 "extended modes used w/o 9P2000.u\n");
250 err = -EINVAL;
251 goto error;
252 }
253 inode->i_op = &v9fs_symlink_inode_operations;
254 break;
255 case S_IFDIR:
256 inc_nlink(inode);
257 if (v9fs_extended(v9ses))
258 inode->i_op = &v9fs_dir_inode_operations_ext;
259 else
260 inode->i_op = &v9fs_dir_inode_operations;
261 inode->i_fop = &v9fs_dir_operations;
262 break;
263 default:
264 P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
265 mode, mode & S_IFMT);
266 err = -EINVAL;
267 goto error;
268 }
269
268 return inode; 270 return inode;
271
272error:
273 iput(inode);
274 return ERR_PTR(err);
269} 275}
270 276
271/* 277/*
@@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
338 344
339 ret = NULL; 345 ret = NULL;
340 st = p9_client_stat(fid); 346 st = p9_client_stat(fid);
341 if (IS_ERR(st)) { 347 if (IS_ERR(st))
342 err = PTR_ERR(st); 348 return ERR_CAST(st);
343 st = NULL;
344 goto error;
345 }
346 349
347 umode = p9mode2unixmode(v9ses, st->mode); 350 umode = p9mode2unixmode(v9ses, st->mode);
348 ret = v9fs_get_inode(sb, umode); 351 ret = v9fs_get_inode(sb, umode);
349 if (IS_ERR(ret)) { 352 if (IS_ERR(ret)) {
350 err = PTR_ERR(ret); 353 err = PTR_ERR(ret);
351 ret = NULL;
352 goto error; 354 goto error;
353 } 355 }
354 356
355 v9fs_stat2inode(st, ret, sb); 357 v9fs_stat2inode(st, ret, sb);
356 ret->i_ino = v9fs_qid2ino(&st->qid); 358 ret->i_ino = v9fs_qid2ino(&st->qid);
359 p9stat_free(st);
357 kfree(st); 360 kfree(st);
358 return ret; 361 return ret;
359 362
360error: 363error:
364 p9stat_free(st);
361 kfree(st); 365 kfree(st);
362 if (ret)
363 iput(ret);
364
365 return ERR_PTR(err); 366 return ERR_PTR(err);
366} 367}
367 368
@@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file)
403 * @v9ses: session information 404 * @v9ses: session information
404 * @dir: directory that dentry is being created in 405 * @dir: directory that dentry is being created in
405 * @dentry: dentry that is being created 406 * @dentry: dentry that is being created
407 * @extension: 9p2000.u extension string to support devices, etc.
406 * @perm: create permissions 408 * @perm: create permissions
407 * @mode: open mode 409 * @mode: open mode
408 * @extension: 9p2000.u extension string to support devices, etc.
409 * 410 *
410 */ 411 */
411static struct p9_fid * 412static struct p9_fid *
@@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
470 dentry->d_op = &v9fs_dentry_operations; 471 dentry->d_op = &v9fs_dentry_operations;
471 472
472 d_instantiate(dentry, inode); 473 d_instantiate(dentry, inode);
473 v9fs_fid_add(dentry, fid); 474 err = v9fs_fid_add(dentry, fid);
475 if (err < 0)
476 goto error;
477
474 return ofid; 478 return ofid;
475 479
476error: 480error:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 38d695d66a0b..8961f1a8f668 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
81 81
82static void 82static void
83v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, 83v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
84 int flags) 84 int flags, void *data)
85{ 85{
86 sb->s_maxbytes = MAX_LFS_FILESIZE; 86 sb->s_maxbytes = MAX_LFS_FILESIZE;
87 sb->s_blocksize_bits = fls(v9ses->maxdata - 1); 87 sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
91 91
92 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 92 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
93 MS_NOATIME; 93 MS_NOATIME;
94
95 save_mount_options(sb, data);
94} 96}
95 97
96/** 98/**
@@ -113,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
113 struct v9fs_session_info *v9ses = NULL; 115 struct v9fs_session_info *v9ses = NULL;
114 struct p9_wstat *st = NULL; 116 struct p9_wstat *st = NULL;
115 int mode = S_IRWXUGO | S_ISVTX; 117 int mode = S_IRWXUGO | S_ISVTX;
116 uid_t uid = current_fsuid();
117 gid_t gid = current_fsgid();
118 struct p9_fid *fid; 118 struct p9_fid *fid;
119 int retval = 0; 119 int retval = 0;
120 120
121 P9_DPRINTK(P9_DEBUG_VFS, " \n"); 121 P9_DPRINTK(P9_DEBUG_VFS, " \n");
122 122
123 st = NULL;
124 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); 123 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
125 if (!v9ses) 124 if (!v9ses)
126 return -ENOMEM; 125 return -ENOMEM;
@@ -142,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
142 retval = PTR_ERR(sb); 141 retval = PTR_ERR(sb);
143 goto free_stat; 142 goto free_stat;
144 } 143 }
145 v9fs_fill_super(sb, v9ses, flags); 144 v9fs_fill_super(sb, v9ses, flags, data);
146 145
147 inode = v9fs_get_inode(sb, S_IFDIR | mode); 146 inode = v9fs_get_inode(sb, S_IFDIR | mode);
148 if (IS_ERR(inode)) { 147 if (IS_ERR(inode)) {
@@ -150,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
150 goto release_sb; 149 goto release_sb;
151 } 150 }
152 151
153 inode->i_uid = uid;
154 inode->i_gid = gid;
155
156 root = d_alloc_root(inode); 152 root = d_alloc_root(inode);
157 if (!root) { 153 if (!root) {
158 iput(inode); 154 iput(inode);
@@ -173,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
173 simple_set_mnt(mnt, sb); 169 simple_set_mnt(mnt, sb);
174 return 0; 170 return 0;
175 171
176release_sb:
177 deactivate_locked_super(sb);
178
179free_stat: 172free_stat:
173 p9stat_free(st);
180 kfree(st); 174 kfree(st);
181 175
182clunk_fid: 176clunk_fid:
@@ -185,7 +179,12 @@ clunk_fid:
185close_session: 179close_session:
186 v9fs_session_close(v9ses); 180 v9fs_session_close(v9ses);
187 kfree(v9ses); 181 kfree(v9ses);
182 return retval;
188 183
184release_sb:
185 p9stat_free(st);
186 kfree(st);
187 deactivate_locked_super(sb);
189 return retval; 188 return retval;
190} 189}
191 190
@@ -207,24 +206,10 @@ static void v9fs_kill_super(struct super_block *s)
207 206
208 v9fs_session_close(v9ses); 207 v9fs_session_close(v9ses);
209 kfree(v9ses); 208 kfree(v9ses);
209 s->s_fs_info = NULL;
210 P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); 210 P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
211} 211}
212 212
213/**
214 * v9fs_show_options - Show mount options in /proc/mounts
215 * @m: seq_file to write to
216 * @mnt: mount descriptor
217 *
218 */
219
220static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
221{
222 struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
223
224 seq_printf(m, "%s", v9ses->options);
225 return 0;
226}
227
228static void 213static void
229v9fs_umount_begin(struct super_block *sb) 214v9fs_umount_begin(struct super_block *sb)
230{ 215{
@@ -237,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb)
237static const struct super_operations v9fs_super_ops = { 222static const struct super_operations v9fs_super_ops = {
238 .statfs = simple_statfs, 223 .statfs = simple_statfs,
239 .clear_inode = v9fs_clear_inode, 224 .clear_inode = v9fs_clear_inode,
240 .show_options = v9fs_show_options, 225 .show_options = generic_show_options,
241 .umount_begin = v9fs_umount_begin, 226 .umount_begin = v9fs_umount_begin,
242}; 227};
243 228
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0149dab365e7..681c2a7b013f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page)
134 134
135 inode = page->mapping->host; 135 inode = page->mapping->host;
136 136
137 ASSERT(file != NULL); 137 if (file) {
138 key = file->private_data; 138 key = file->private_data;
139 ASSERT(key != NULL); 139 ASSERT(key != NULL);
140 } else {
141 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
142 if (IS_ERR(key)) {
143 ret = PTR_ERR(key);
144 goto error_nokey;
145 }
146 }
140 147
141 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); 148 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
142 149
@@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page)
207 unlock_page(page); 214 unlock_page(page);
208 } 215 }
209 216
217 if (!file)
218 key_put(key);
210 _leave(" = 0"); 219 _leave(" = 0");
211 return 0; 220 return 0;
212 221
213error: 222error:
214 SetPageError(page); 223 SetPageError(page);
215 unlock_page(page); 224 unlock_page(page);
225 if (!file)
226 key_put(key);
227error_nokey:
216 _leave(" = %d", ret); 228 _leave(" = %d", ret);
217 return ret; 229 return ret;
218} 230}
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index aa39ae83f019..3da18d453488 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
77 } 77 }
78 78
79 /* Update the expiry counter if fs is busy */ 79 /* Update the expiry counter if fs is busy */
80 if (!may_umount_tree(mnt)) { 80 if (!may_umount_tree(path.mnt)) {
81 struct autofs_info *ino = autofs4_dentry_ino(top); 81 struct autofs_info *ino = autofs4_dentry_ino(top);
82 ino->last_used = jiffies; 82 ino->last_used = jiffies;
83 goto done; 83 goto done;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..7c1e65d54872 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
501 } 501 }
502 } 502 }
503 503
504 /* 504 if (last_bss > elf_bss) {
505 * Now fill out the bss section. First pad the last page up 505 /*
506 * to the page boundary, and then perform a mmap to make sure 506 * Now fill out the bss section. First pad the last page up
507 * that there are zero-mapped pages up to and including the 507 * to the page boundary, and then perform a mmap to make sure
508 * last bss page. 508 * that there are zero-mapped pages up to and including the
509 */ 509 * last bss page.
510 if (padzero(elf_bss)) { 510 */
511 error = -EFAULT; 511 if (padzero(elf_bss)) {
512 goto out_close; 512 error = -EFAULT;
513 } 513 goto out_close;
514 }
514 515
515 /* What we have mapped so far */ 516 /* What we have mapped so far */
516 elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); 517 elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
517 518
518 /* Map the last of the bss segment */ 519 /* Map the last of the bss segment */
519 if (last_bss > elf_bss) {
520 down_write(&current->mm->mmap_sem); 520 down_write(&current->mm->mmap_sem);
521 error = do_brk(elf_bss, last_bss - elf_bss); 521 error = do_brk(elf_bss, last_bss - elf_bss);
522 up_write(&current->mm->mmap_sem); 522 up_write(&current->mm->mmap_sem);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..15831d5c7367 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1352{ 1352{
1353 int err; 1353 int err;
1354 1354
1355 bdi->name = "btrfs";
1355 bdi->capabilities = BDI_CAP_MAP_COPY; 1356 bdi->capabilities = BDI_CAP_MAP_COPY;
1356 err = bdi_init(bdi); 1357 err = bdi_init(bdi);
1357 if (err) 1358 if (err)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 272b9b2bea86..59cba180fe83 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3099,8 +3099,12 @@ static void inode_tree_add(struct inode *inode)
3099{ 3099{
3100 struct btrfs_root *root = BTRFS_I(inode)->root; 3100 struct btrfs_root *root = BTRFS_I(inode)->root;
3101 struct btrfs_inode *entry; 3101 struct btrfs_inode *entry;
3102 struct rb_node **p = &root->inode_tree.rb_node; 3102 struct rb_node **p;
3103 struct rb_node *parent = NULL; 3103 struct rb_node *parent;
3104
3105again:
3106 p = &root->inode_tree.rb_node;
3107 parent = NULL;
3104 3108
3105 spin_lock(&root->inode_lock); 3109 spin_lock(&root->inode_lock);
3106 while (*p) { 3110 while (*p) {
@@ -3108,13 +3112,16 @@ static void inode_tree_add(struct inode *inode)
3108 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3112 entry = rb_entry(parent, struct btrfs_inode, rb_node);
3109 3113
3110 if (inode->i_ino < entry->vfs_inode.i_ino) 3114 if (inode->i_ino < entry->vfs_inode.i_ino)
3111 p = &(*p)->rb_left; 3115 p = &parent->rb_left;
3112 else if (inode->i_ino > entry->vfs_inode.i_ino) 3116 else if (inode->i_ino > entry->vfs_inode.i_ino)
3113 p = &(*p)->rb_right; 3117 p = &parent->rb_right;
3114 else { 3118 else {
3115 WARN_ON(!(entry->vfs_inode.i_state & 3119 WARN_ON(!(entry->vfs_inode.i_state &
3116 (I_WILL_FREE | I_FREEING | I_CLEAR))); 3120 (I_WILL_FREE | I_FREEING | I_CLEAR)));
3117 break; 3121 rb_erase(parent, &root->inode_tree);
3122 RB_CLEAR_NODE(parent);
3123 spin_unlock(&root->inode_lock);
3124 goto again;
3118 } 3125 }
3119 } 3126 }
3120 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 3127 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
@@ -3126,12 +3133,12 @@ static void inode_tree_del(struct inode *inode)
3126{ 3133{
3127 struct btrfs_root *root = BTRFS_I(inode)->root; 3134 struct btrfs_root *root = BTRFS_I(inode)->root;
3128 3135
3136 spin_lock(&root->inode_lock);
3129 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3137 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3130 spin_lock(&root->inode_lock);
3131 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3138 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3132 spin_unlock(&root->inode_lock);
3133 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3139 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3134 } 3140 }
3141 spin_unlock(&root->inode_lock);
3135} 3142}
3136 3143
3137static noinline void init_btrfs_i(struct inode *inode) 3144static noinline void init_btrfs_i(struct inode *inode)
diff --git a/fs/buffer.c b/fs/buffer.c
index a3ef091a45bd..90a98865b0cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
281 struct zone *zone; 281 struct zone *zone;
282 int nid; 282 int nid;
283 283
284 wakeup_pdflush(1024); 284 wakeup_flusher_threads(1024);
285 yield(); 285 yield();
286 286
287 for_each_online_node(nid) { 287 for_each_online_node(nid) {
@@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
1165 1165
1166 if (!test_set_buffer_dirty(bh)) { 1166 if (!test_set_buffer_dirty(bh)) {
1167 struct page *page = bh->b_page; 1167 struct page *page = bh->b_page;
1168 if (!TestSetPageDirty(page)) 1168 if (!TestSetPageDirty(page)) {
1169 __set_page_dirty(page, page_mapping(page), 0); 1169 struct address_space *mapping = page_mapping(page);
1170 if (mapping)
1171 __set_page_dirty(page, mapping, 0);
1172 }
1170 } 1173 }
1171} 1174}
1172 1175
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..3cbc57f932d2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
31 * - no readahead or I/O queue unplugging required 31 * - no readahead or I/O queue unplugging required
32 */ 32 */
33struct backing_dev_info directly_mappable_cdev_bdi = { 33struct backing_dev_info directly_mappable_cdev_bdi = {
34 .name = "char",
34 .capabilities = ( 35 .capabilities = (
35#ifdef CONFIG_MMU 36#ifdef CONFIG_MMU
36 /* permit private copies of the data to be taken */ 37 /* permit private copies of the data to be taken */
@@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
237} 238}
238 239
239/** 240/**
240 * register_chrdev() - Register a major number for character devices. 241 * __register_chrdev() - create and register a cdev occupying a range of minors
241 * @major: major device number or 0 for dynamic allocation 242 * @major: major device number or 0 for dynamic allocation
243 * @baseminor: first of the requested range of minor numbers
244 * @count: the number of minor numbers required
242 * @name: name of this range of devices 245 * @name: name of this range of devices
243 * @fops: file operations associated with this devices 246 * @fops: file operations associated with this devices
244 * 247 *
@@ -254,19 +257,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
254 * /dev. It only helps to keep track of the different owners of devices. If 257 * /dev. It only helps to keep track of the different owners of devices. If
255 * your module name has only one type of devices it's ok to use e.g. the name 258 * your module name has only one type of devices it's ok to use e.g. the name
256 * of the module here. 259 * of the module here.
257 *
258 * This function registers a range of 256 minor numbers. The first minor number
259 * is 0.
260 */ 260 */
261int register_chrdev(unsigned int major, const char *name, 261int __register_chrdev(unsigned int major, unsigned int baseminor,
262 const struct file_operations *fops) 262 unsigned int count, const char *name,
263 const struct file_operations *fops)
263{ 264{
264 struct char_device_struct *cd; 265 struct char_device_struct *cd;
265 struct cdev *cdev; 266 struct cdev *cdev;
266 char *s; 267 char *s;
267 int err = -ENOMEM; 268 int err = -ENOMEM;
268 269
269 cd = __register_chrdev_region(major, 0, 256, name); 270 cd = __register_chrdev_region(major, baseminor, count, name);
270 if (IS_ERR(cd)) 271 if (IS_ERR(cd))
271 return PTR_ERR(cd); 272 return PTR_ERR(cd);
272 273
@@ -280,7 +281,7 @@ int register_chrdev(unsigned int major, const char *name,
280 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) 281 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
281 *s = '!'; 282 *s = '!';
282 283
283 err = cdev_add(cdev, MKDEV(cd->major, 0), 256); 284 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
284 if (err) 285 if (err)
285 goto out; 286 goto out;
286 287
@@ -290,7 +291,7 @@ int register_chrdev(unsigned int major, const char *name,
290out: 291out:
291 kobject_put(&cdev->kobj); 292 kobject_put(&cdev->kobj);
292out2: 293out2:
293 kfree(__unregister_chrdev_region(cd->major, 0, 256)); 294 kfree(__unregister_chrdev_region(cd->major, baseminor, count));
294 return err; 295 return err;
295} 296}
296 297
@@ -316,10 +317,23 @@ void unregister_chrdev_region(dev_t from, unsigned count)
316 } 317 }
317} 318}
318 319
319void unregister_chrdev(unsigned int major, const char *name) 320/**
321 * __unregister_chrdev - unregister and destroy a cdev
322 * @major: major device number
323 * @baseminor: first of the range of minor numbers
324 * @count: the number of minor numbers this cdev is occupying
325 * @name: name of this range of devices
326 *
327 * Unregister and destroy the cdev occupying the region described by
328 * @major, @baseminor and @count. This function undoes what
329 * __register_chrdev() did.
330 */
331void __unregister_chrdev(unsigned int major, unsigned int baseminor,
332 unsigned int count, const char *name)
320{ 333{
321 struct char_device_struct *cd; 334 struct char_device_struct *cd;
322 cd = __unregister_chrdev_region(major, 0, 256); 335
336 cd = __unregister_chrdev_region(major, baseminor, count);
323 if (cd && cd->cdev) 337 if (cd && cd->cdev)
324 cdev_del(cd->cdev); 338 cdev_del(cd->cdev);
325 kfree(cd); 339 kfree(cd);
@@ -568,6 +582,6 @@ EXPORT_SYMBOL(cdev_alloc);
568EXPORT_SYMBOL(cdev_del); 582EXPORT_SYMBOL(cdev_del);
569EXPORT_SYMBOL(cdev_add); 583EXPORT_SYMBOL(cdev_add);
570EXPORT_SYMBOL(cdev_index); 584EXPORT_SYMBOL(cdev_index);
571EXPORT_SYMBOL(register_chrdev); 585EXPORT_SYMBOL(__register_chrdev);
572EXPORT_SYMBOL(unregister_chrdev); 586EXPORT_SYMBOL(__unregister_chrdev);
573EXPORT_SYMBOL(directly_mappable_cdev_bdi); 587EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/compat.c b/fs/compat.c
index 94502dab972a..6d6f98fe64a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename,
1485 if (!bprm) 1485 if (!bprm)
1486 goto out_files; 1486 goto out_files;
1487 1487
1488 retval = -ERESTARTNOINTR; 1488 retval = prepare_bprm_creds(bprm);
1489 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1489 if (retval)
1490 goto out_free; 1490 goto out_free;
1491 current->in_execve = 1;
1492
1493 retval = -ENOMEM;
1494 bprm->cred = prepare_exec_creds();
1495 if (!bprm->cred)
1496 goto out_unlock;
1497 1491
1498 retval = check_unsafe_exec(bprm); 1492 retval = check_unsafe_exec(bprm);
1499 if (retval < 0) 1493 if (retval < 0)
1500 goto out_unlock; 1494 goto out_free;
1501 clear_in_exec = retval; 1495 clear_in_exec = retval;
1496 current->in_execve = 1;
1502 1497
1503 file = open_exec(filename); 1498 file = open_exec(filename);
1504 retval = PTR_ERR(file); 1499 retval = PTR_ERR(file);
@@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename,
1547 /* execve succeeded */ 1542 /* execve succeeded */
1548 current->fs->in_exec = 0; 1543 current->fs->in_exec = 0;
1549 current->in_execve = 0; 1544 current->in_execve = 0;
1550 mutex_unlock(&current->cred_guard_mutex);
1551 acct_update_integrals(current); 1545 acct_update_integrals(current);
1552 free_bprm(bprm); 1546 free_bprm(bprm);
1553 if (displaced) 1547 if (displaced)
@@ -1567,10 +1561,7 @@ out_file:
1567out_unmark: 1561out_unmark:
1568 if (clear_in_exec) 1562 if (clear_in_exec)
1569 current->fs->in_exec = 0; 1563 current->fs->in_exec = 0;
1570
1571out_unlock:
1572 current->in_execve = 0; 1564 current->in_execve = 0;
1573 mutex_unlock(&current->cred_guard_mutex);
1574 1565
1575out_free: 1566out_free:
1576 free_bprm(bprm); 1567 free_bprm(bprm);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
51}; 51};
52 52
53static struct backing_dev_info configfs_backing_dev_info = { 53static struct backing_dev_info configfs_backing_dev_info = {
54 .name = "configfs",
54 .ra_pages = 0, /* No readahead */ 55 .ra_pages = 0, /* No readahead */
55 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
56}; 57};
diff --git a/fs/exec.c b/fs/exec.c
index 4a8849e45b21..172ceb6edde4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -678,8 +678,8 @@ exit:
678} 678}
679EXPORT_SYMBOL(open_exec); 679EXPORT_SYMBOL(open_exec);
680 680
681int kernel_read(struct file *file, unsigned long offset, 681int kernel_read(struct file *file, loff_t offset,
682 char *addr, unsigned long count) 682 char *addr, unsigned long count)
683{ 683{
684 mm_segment_t old_fs; 684 mm_segment_t old_fs;
685 loff_t pos = offset; 685 loff_t pos = offset;
@@ -1016,6 +1016,35 @@ out:
1016EXPORT_SYMBOL(flush_old_exec); 1016EXPORT_SYMBOL(flush_old_exec);
1017 1017
1018/* 1018/*
1019 * Prepare credentials and lock ->cred_guard_mutex.
1020 * install_exec_creds() commits the new creds and drops the lock.
1021 * Or, if exec fails before, free_bprm() should release ->cred and
1022 * and unlock.
1023 */
1024int prepare_bprm_creds(struct linux_binprm *bprm)
1025{
1026 if (mutex_lock_interruptible(&current->cred_guard_mutex))
1027 return -ERESTARTNOINTR;
1028
1029 bprm->cred = prepare_exec_creds();
1030 if (likely(bprm->cred))
1031 return 0;
1032
1033 mutex_unlock(&current->cred_guard_mutex);
1034 return -ENOMEM;
1035}
1036
1037void free_bprm(struct linux_binprm *bprm)
1038{
1039 free_arg_pages(bprm);
1040 if (bprm->cred) {
1041 mutex_unlock(&current->cred_guard_mutex);
1042 abort_creds(bprm->cred);
1043 }
1044 kfree(bprm);
1045}
1046
1047/*
1019 * install the new credentials for this executable 1048 * install the new credentials for this executable
1020 */ 1049 */
1021void install_exec_creds(struct linux_binprm *bprm) 1050void install_exec_creds(struct linux_binprm *bprm)
@@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm)
1024 1053
1025 commit_creds(bprm->cred); 1054 commit_creds(bprm->cred);
1026 bprm->cred = NULL; 1055 bprm->cred = NULL;
1027 1056 /*
1028 /* cred_guard_mutex must be held at least to this point to prevent 1057 * cred_guard_mutex must be held at least to this point to prevent
1029 * ptrace_attach() from altering our determination of the task's 1058 * ptrace_attach() from altering our determination of the task's
1030 * credentials; any time after this it may be unlocked */ 1059 * credentials; any time after this it may be unlocked.
1031 1060 */
1032 security_bprm_committed_creds(bprm); 1061 security_bprm_committed_creds(bprm);
1062 mutex_unlock(&current->cred_guard_mutex);
1033} 1063}
1034EXPORT_SYMBOL(install_exec_creds); 1064EXPORT_SYMBOL(install_exec_creds);
1035 1065
@@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1246 1276
1247EXPORT_SYMBOL(search_binary_handler); 1277EXPORT_SYMBOL(search_binary_handler);
1248 1278
1249void free_bprm(struct linux_binprm *bprm)
1250{
1251 free_arg_pages(bprm);
1252 if (bprm->cred)
1253 abort_creds(bprm->cred);
1254 kfree(bprm);
1255}
1256
1257/* 1279/*
1258 * sys_execve() executes a new program. 1280 * sys_execve() executes a new program.
1259 */ 1281 */
@@ -1277,20 +1299,15 @@ int do_execve(char * filename,
1277 if (!bprm) 1299 if (!bprm)
1278 goto out_files; 1300 goto out_files;
1279 1301
1280 retval = -ERESTARTNOINTR; 1302 retval = prepare_bprm_creds(bprm);
1281 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1303 if (retval)
1282 goto out_free; 1304 goto out_free;
1283 current->in_execve = 1;
1284
1285 retval = -ENOMEM;
1286 bprm->cred = prepare_exec_creds();
1287 if (!bprm->cred)
1288 goto out_unlock;
1289 1305
1290 retval = check_unsafe_exec(bprm); 1306 retval = check_unsafe_exec(bprm);
1291 if (retval < 0) 1307 if (retval < 0)
1292 goto out_unlock; 1308 goto out_free;
1293 clear_in_exec = retval; 1309 clear_in_exec = retval;
1310 current->in_execve = 1;
1294 1311
1295 file = open_exec(filename); 1312 file = open_exec(filename);
1296 retval = PTR_ERR(file); 1313 retval = PTR_ERR(file);
@@ -1340,7 +1357,6 @@ int do_execve(char * filename,
1340 /* execve succeeded */ 1357 /* execve succeeded */
1341 current->fs->in_exec = 0; 1358 current->fs->in_exec = 0;
1342 current->in_execve = 0; 1359 current->in_execve = 0;
1343 mutex_unlock(&current->cred_guard_mutex);
1344 acct_update_integrals(current); 1360 acct_update_integrals(current);
1345 free_bprm(bprm); 1361 free_bprm(bprm);
1346 if (displaced) 1362 if (displaced)
@@ -1360,10 +1376,7 @@ out_file:
1360out_unmark: 1376out_unmark:
1361 if (clear_in_exec) 1377 if (clear_in_exec)
1362 current->fs->in_exec = 0; 1378 current->fs->in_exec = 0;
1363
1364out_unlock:
1365 current->in_execve = 0; 1379 current->in_execve = 0;
1366 mutex_unlock(&current->cred_guard_mutex);
1367 1380
1368out_free: 1381out_free:
1369 free_bprm(bprm); 1382 free_bprm(bprm);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d636e1297cad..a63d44256a70 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
230 return error; 230 return error;
231} 231}
232 232
233static int 233int
234ext2_check_acl(struct inode *inode, int mask) 234ext2_check_acl(struct inode *inode, int mask)
235{ 235{
236 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); 236 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask)
246 return -EAGAIN; 246 return -EAGAIN;
247} 247}
248 248
249int
250ext2_permission(struct inode *inode, int mask)
251{
252 return generic_permission(inode, mask, ext2_check_acl);
253}
254
255/* 249/*
256 * Initialize the ACLs of a new inode. Called from ext2_new_inode. 250 * Initialize the ACLs of a new inode. Called from ext2_new_inode.
257 * 251 *
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index ecefe478898f..3ff6cbb9ac44 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size)
54#ifdef CONFIG_EXT2_FS_POSIX_ACL 54#ifdef CONFIG_EXT2_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext2_permission (struct inode *, int); 57extern int ext2_check_acl (struct inode *, int);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_acl_chmod (struct inode *);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
61#else 61#else
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext2_permission NULL 63#define ext2_check_acl NULL
64#define ext2_get_acl NULL 64#define ext2_get_acl NULL
65#define ext2_set_acl NULL 65#define ext2_set_acl NULL
66 66
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b9e47dc9222..a2f3afd1a1c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = {
85 .removexattr = generic_removexattr, 85 .removexattr = generic_removexattr,
86#endif 86#endif
87 .setattr = ext2_setattr, 87 .setattr = ext2_setattr,
88 .permission = ext2_permission, 88 .check_acl = ext2_check_acl,
89 .fiemap = ext2_fiemap, 89 .fiemap = ext2_fiemap,
90}; 90};
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e1dedb0f7873..23701f289e98 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
362 if (dir_de) { 362 if (dir_de) {
363 if (old_dir != new_dir) 363 if (old_dir != new_dir)
364 ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); 364 ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
365 else {
366 kunmap(dir_page);
367 page_cache_release(dir_page);
368 }
365 inode_dec_link_count(old_dir); 369 inode_dec_link_count(old_dir);
366 } 370 }
367 return 0; 371 return 0;
@@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = {
396 .removexattr = generic_removexattr, 400 .removexattr = generic_removexattr,
397#endif 401#endif
398 .setattr = ext2_setattr, 402 .setattr = ext2_setattr,
399 .permission = ext2_permission, 403 .check_acl = ext2_check_acl,
400}; 404};
401 405
402const struct inode_operations ext2_special_inode_operations = { 406const struct inode_operations ext2_special_inode_operations = {
@@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = {
407 .removexattr = generic_removexattr, 411 .removexattr = generic_removexattr,
408#endif 412#endif
409 .setattr = ext2_setattr, 413 .setattr = ext2_setattr,
410 .permission = ext2_permission, 414 .check_acl = ext2_check_acl,
411}; 415};
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index fb3c1a21b135..522b15498f45 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -29,23 +29,25 @@ config EXT3_FS
29 module will be called ext3. 29 module will be called ext3.
30 30
31config EXT3_DEFAULTS_TO_ORDERED 31config EXT3_DEFAULTS_TO_ORDERED
32 bool "Default to 'data=ordered' in ext3 (legacy option)" 32 bool "Default to 'data=ordered' in ext3"
33 depends on EXT3_FS 33 depends on EXT3_FS
34 help 34 help
35 If a filesystem does not explicitly specify a data ordering 35 The journal mode options for ext3 have different tradeoffs
36 mode, and the journal capability allowed it, ext3 used to 36 between when data is guaranteed to be on disk and
37 historically default to 'data=ordered'. 37 performance. The use of "data=writeback" can cause
38 38 unwritten data to appear in files after an system crash or
39 That was a rather unfortunate choice, because it leads to all 39 power failure, which can be a security issue. However,
40 kinds of latency problems, and the 'data=writeback' mode is more 40 "data=ordered" mode can also result in major performance
41 appropriate these days. 41 problems, including seconds-long delays before an fsync()
42 42 call returns. For details, see:
43 You should probably always answer 'n' here, and if you really 43
44 want to use 'data=ordered' mode, set it in the filesystem itself 44 http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
45 with 'tune2fs -o journal_data_ordered'. 45
46 46 If you have been historically happy with ext3's performance,
47 But if you really want to enable the legacy default, you can do 47 data=ordered mode will be a safe choice and you should
48 so by answering 'y' to this question. 48 answer 'y' here. If you understand the reliability and data
49 privacy issues of data=writeback and are willing to make
50 that trade off, answer 'n'.
49 51
50config EXT3_FS_XATTR 52config EXT3_FS_XATTR
51 bool "Ext3 extended attributes" 53 bool "Ext3 extended attributes"
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e167bae37ef0..c9b0df376b5f 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
238 return error; 238 return error;
239} 239}
240 240
241static int 241int
242ext3_check_acl(struct inode *inode, int mask) 242ext3_check_acl(struct inode *inode, int mask)
243{ 243{
244 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); 244 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask)
254 return -EAGAIN; 254 return -EAGAIN;
255} 255}
256 256
257int
258ext3_permission(struct inode *inode, int mask)
259{
260 return generic_permission(inode, mask, ext3_check_acl);
261}
262
263/* 257/*
264 * Initialize the ACLs of a new inode. Called from ext3_new_inode. 258 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
265 * 259 *
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 07d15a3a5969..597334626de9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size)
54#ifdef CONFIG_EXT3_FS_POSIX_ACL 54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext3_permission (struct inode *, int); 57extern int ext3_check_acl (struct inode *, int);
58extern int ext3_acl_chmod (struct inode *); 58extern int ext3_acl_chmod (struct inode *);
59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); 59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
60 60
61#else /* CONFIG_EXT3_FS_POSIX_ACL */ 61#else /* CONFIG_EXT3_FS_POSIX_ACL */
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext3_permission NULL 63#define ext3_check_acl NULL
64 64
65static inline int 65static inline int
66ext3_acl_chmod(struct inode *inode) 66ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704b231b..299253214789 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -137,7 +137,7 @@ const struct inode_operations ext3_file_inode_operations = {
137 .listxattr = ext3_listxattr, 137 .listxattr = ext3_listxattr,
138 .removexattr = generic_removexattr, 138 .removexattr = generic_removexattr,
139#endif 139#endif
140 .permission = ext3_permission, 140 .check_acl = ext3_check_acl,
141 .fiemap = ext3_fiemap, 141 .fiemap = ext3_fiemap,
142}; 142};
143 143
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ff7b9730234..aad6400c9b77 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = {
2445 .listxattr = ext3_listxattr, 2445 .listxattr = ext3_listxattr,
2446 .removexattr = generic_removexattr, 2446 .removexattr = generic_removexattr,
2447#endif 2447#endif
2448 .permission = ext3_permission, 2448 .check_acl = ext3_check_acl,
2449}; 2449};
2450 2450
2451const struct inode_operations ext3_special_inode_operations = { 2451const struct inode_operations ext3_special_inode_operations = {
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = {
2456 .listxattr = ext3_listxattr, 2456 .listxattr = ext3_listxattr,
2457 .removexattr = generic_removexattr, 2457 .removexattr = generic_removexattr,
2458#endif 2458#endif
2459 .permission = ext3_permission, 2459 .check_acl = ext3_check_acl,
2460}; 2460};
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 524b349c6299..a8d80a7f1105 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
543#endif 543#endif
544} 544}
545 545
546static char *data_mode_string(unsigned long mode)
547{
548 switch (mode) {
549 case EXT3_MOUNT_JOURNAL_DATA:
550 return "journal";
551 case EXT3_MOUNT_ORDERED_DATA:
552 return "ordered";
553 case EXT3_MOUNT_WRITEBACK_DATA:
554 return "writeback";
555 }
556 return "unknown";
557}
558
546/* 559/*
547 * Show an option if 560 * Show an option if
548 * - it's set to a non-default value OR 561 * - it's set to a non-default value OR
@@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
616 if (test_opt(sb, NOBH)) 629 if (test_opt(sb, NOBH))
617 seq_puts(seq, ",nobh"); 630 seq_puts(seq, ",nobh");
618 631
619 if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) 632 seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
620 seq_puts(seq, ",data=journal"); 633 EXT3_MOUNT_DATA_FLAGS));
621 else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
622 seq_puts(seq, ",data=ordered");
623 else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
624 seq_puts(seq, ",data=writeback");
625
626 if (test_opt(sb, DATA_ERR_ABORT)) 634 if (test_opt(sb, DATA_ERR_ABORT))
627 seq_puts(seq, ",data_err=abort"); 635 seq_puts(seq, ",data_err=abort");
628 636
@@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb,
1024 datacheck: 1032 datacheck:
1025 if (is_remount) { 1033 if (is_remount) {
1026 if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) 1034 if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
1027 != data_opt) { 1035 == data_opt)
1028 printk(KERN_ERR 1036 break;
1029 "EXT3-fs: cannot change data " 1037 printk(KERN_ERR
1030 "mode on remount\n"); 1038 "EXT3-fs (device %s): Cannot change "
1031 return 0; 1039 "data mode on remount. The filesystem "
1032 } 1040 "is mounted in data=%s mode and you "
1041 "try to remount it in data=%s mode.\n",
1042 sb->s_id,
1043 data_mode_string(sbi->s_mount_opt &
1044 EXT3_MOUNT_DATA_FLAGS),
1045 data_mode_string(data_opt));
1046 return 0;
1033 } else { 1047 } else {
1034 sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; 1048 sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
1035 sbi->s_mount_opt |= data_opt; 1049 sbi->s_mount_opt |= data_opt;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f6d8967149ca..0df88b2a69b0 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
236 return error; 236 return error;
237} 237}
238 238
239static int 239int
240ext4_check_acl(struct inode *inode, int mask) 240ext4_check_acl(struct inode *inode, int mask)
241{ 241{
242 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); 242 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask)
252 return -EAGAIN; 252 return -EAGAIN;
253} 253}
254 254
255int
256ext4_permission(struct inode *inode, int mask)
257{
258 return generic_permission(inode, mask, ext4_check_acl);
259}
260
261/* 255/*
262 * Initialize the ACLs of a new inode. Called from ext4_new_inode. 256 * Initialize the ACLs of a new inode. Called from ext4_new_inode.
263 * 257 *
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 949789d2bba6..9d843d5deac4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
54#ifdef CONFIG_EXT4_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext4_permission(struct inode *, int); 57extern int ext4_check_acl(struct inode *, int);
58extern int ext4_acl_chmod(struct inode *); 58extern int ext4_acl_chmod(struct inode *);
59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); 59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
60 60
61#else /* CONFIG_EXT4_FS_POSIX_ACL */ 61#else /* CONFIG_EXT4_FS_POSIX_ACL */
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext4_permission NULL 63#define ext4_check_acl NULL
64 64
65static inline int 65static inline int
66ext4_acl_chmod(struct inode *inode) 66ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873fef1c6..27f3c5354c0e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -207,7 +207,7 @@ const struct inode_operations ext4_file_inode_operations = {
207 .listxattr = ext4_listxattr, 207 .listxattr = ext4_listxattr,
208 .removexattr = generic_removexattr, 208 .removexattr = generic_removexattr,
209#endif 209#endif
210 .permission = ext4_permission, 210 .check_acl = ext4_check_acl,
211 .fallocate = ext4_fallocate, 211 .fallocate = ext4_fallocate,
212 .fiemap = ext4_fiemap, 212 .fiemap = ext4_fiemap,
213}; 213};
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013d16ff..114abe5d2c1d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2536,7 +2536,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2536 .listxattr = ext4_listxattr, 2536 .listxattr = ext4_listxattr,
2537 .removexattr = generic_removexattr, 2537 .removexattr = generic_removexattr,
2538#endif 2538#endif
2539 .permission = ext4_permission, 2539 .check_acl = ext4_check_acl,
2540 .fiemap = ext4_fiemap, 2540 .fiemap = ext4_fiemap,
2541}; 2541};
2542 2542
@@ -2548,5 +2548,5 @@ const struct inode_operations ext4_special_inode_operations = {
2548 .listxattr = ext4_listxattr, 2548 .listxattr = ext4_listxattr,
2549 .removexattr = generic_removexattr, 2549 .removexattr = generic_removexattr,
2550#endif 2550#endif
2551 .permission = ext4_permission, 2551 .check_acl = ext4_check_acl,
2552}; 2552};
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..da86ef58e427 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,223 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/kthread.h>
23#include <linux/freezer.h>
22#include <linux/writeback.h> 24#include <linux/writeback.h>
23#include <linux/blkdev.h> 25#include <linux/blkdev.h>
24#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
25#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
26#include "internal.h" 28#include "internal.h"
27 29
30#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
28 31
29/** 32/*
30 * writeback_acquire - attempt to get exclusive writeback access to a device 33 * We don't actually have pdflush, but this one is exported though /proc...
31 * @bdi: the device's backing_dev_info structure
32 *
33 * It is a waste of resources to have more than one pdflush thread blocked on
34 * a single request queue. Exclusion at the request_queue level is obtained
35 * via a flag in the request_queue's backing_dev_info.state.
36 *
37 * Non-request_queue-backed address_spaces will share default_backing_dev_info,
38 * unless they implement their own. Which is somewhat inefficient, as this
39 * may prevent concurrent writeback against multiple devices.
40 */ 34 */
41static int writeback_acquire(struct backing_dev_info *bdi) 35int nr_pdflush_threads;
36
37/*
38 * Work items for the bdi_writeback threads
39 */
40struct bdi_work {
41 struct list_head list;
42 struct list_head wait_list;
43 struct rcu_head rcu_head;
44
45 unsigned long seen;
46 atomic_t pending;
47
48 struct super_block *sb;
49 unsigned long nr_pages;
50 enum writeback_sync_modes sync_mode;
51
52 unsigned long state;
53};
54
55enum {
56 WS_USED_B = 0,
57 WS_ONSTACK_B,
58};
59
60#define WS_USED (1 << WS_USED_B)
61#define WS_ONSTACK (1 << WS_ONSTACK_B)
62
63static inline bool bdi_work_on_stack(struct bdi_work *work)
42{ 64{
43 return !test_and_set_bit(BDI_pdflush, &bdi->state); 65 return test_bit(WS_ONSTACK_B, &work->state);
66}
67
68static inline void bdi_work_init(struct bdi_work *work,
69 struct writeback_control *wbc)
70{
71 INIT_RCU_HEAD(&work->rcu_head);
72 work->sb = wbc->sb;
73 work->nr_pages = wbc->nr_to_write;
74 work->sync_mode = wbc->sync_mode;
75 work->state = WS_USED;
76}
77
78static inline void bdi_work_init_on_stack(struct bdi_work *work,
79 struct writeback_control *wbc)
80{
81 bdi_work_init(work, wbc);
82 work->state |= WS_ONSTACK;
44} 83}
45 84
46/** 85/**
47 * writeback_in_progress - determine whether there is writeback in progress 86 * writeback_in_progress - determine whether there is writeback in progress
48 * @bdi: the device's backing_dev_info structure. 87 * @bdi: the device's backing_dev_info structure.
49 * 88 *
50 * Determine whether there is writeback in progress against a backing device. 89 * Determine whether there is writeback waiting to be handled against a
90 * backing device.
51 */ 91 */
52int writeback_in_progress(struct backing_dev_info *bdi) 92int writeback_in_progress(struct backing_dev_info *bdi)
53{ 93{
54 return test_bit(BDI_pdflush, &bdi->state); 94 return !list_empty(&bdi->work_list);
55} 95}
56 96
57/** 97static void bdi_work_clear(struct bdi_work *work)
58 * writeback_release - relinquish exclusive writeback access against a device.
59 * @bdi: the device's backing_dev_info structure
60 */
61static void writeback_release(struct backing_dev_info *bdi)
62{ 98{
63 BUG_ON(!writeback_in_progress(bdi)); 99 clear_bit(WS_USED_B, &work->state);
64 clear_bit(BDI_pdflush, &bdi->state); 100 smp_mb__after_clear_bit();
101 wake_up_bit(&work->state, WS_USED_B);
65} 102}
66 103
67static noinline void block_dump___mark_inode_dirty(struct inode *inode) 104static void bdi_work_free(struct rcu_head *head)
68{ 105{
69 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 106 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
70 struct dentry *dentry;
71 const char *name = "?";
72 107
73 dentry = d_find_alias(inode); 108 if (!bdi_work_on_stack(work))
74 if (dentry) { 109 kfree(work);
75 spin_lock(&dentry->d_lock); 110 else
76 name = (const char *) dentry->d_name.name; 111 bdi_work_clear(work);
77 }
78 printk(KERN_DEBUG
79 "%s(%d): dirtied inode %lu (%s) on %s\n",
80 current->comm, task_pid_nr(current), inode->i_ino,
81 name, inode->i_sb->s_id);
82 if (dentry) {
83 spin_unlock(&dentry->d_lock);
84 dput(dentry);
85 }
86 }
87} 112}
88 113
89/** 114static void wb_work_complete(struct bdi_work *work)
90 * __mark_inode_dirty - internal function
91 * @inode: inode to mark
92 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
93 * Mark an inode as dirty. Callers should use mark_inode_dirty or
94 * mark_inode_dirty_sync.
95 *
96 * Put the inode on the super block's dirty list.
97 *
98 * CAREFUL! We mark it dirty unconditionally, but move it onto the
99 * dirty list only if it is hashed or if it refers to a blockdev.
100 * If it was not hashed, it will never be added to the dirty list
101 * even if it is later hashed, as it will have been marked dirty already.
102 *
103 * In short, make sure you hash any inodes _before_ you start marking
104 * them dirty.
105 *
106 * This function *must* be atomic for the I_DIRTY_PAGES case -
107 * set_page_dirty() is called under spinlock in several places.
108 *
109 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
110 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
111 * the kernel-internal blockdev inode represents the dirtying time of the
112 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
113 * page->mapping->host, so the page-dirtying time is recorded in the internal
114 * blockdev inode.
115 */
116void __mark_inode_dirty(struct inode *inode, int flags)
117{ 115{
118 struct super_block *sb = inode->i_sb; 116 const enum writeback_sync_modes sync_mode = work->sync_mode;
119 117
120 /* 118 /*
121 * Don't do this for I_DIRTY_PAGES - that doesn't actually 119 * For allocated work, we can clear the done/seen bit right here.
122 * dirty the inode itself 120 * For on-stack work, we need to postpone both the clear and free
121 * to after the RCU grace period, since the stack could be invalidated
122 * as soon as bdi_work_clear() has done the wakeup.
123 */ 123 */
124 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 124 if (!bdi_work_on_stack(work))
125 if (sb->s_op->dirty_inode) 125 bdi_work_clear(work);
126 sb->s_op->dirty_inode(inode); 126 if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
127 } 127 call_rcu(&work->rcu_head, bdi_work_free);
128}
128 129
130static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
131{
129 /* 132 /*
130 * make sure that changes are seen by all cpus before we test i_state 133 * The caller has retrieved the work arguments from this work,
131 * -- mikulas 134 * drop our reference. If this is the last ref, delete and free it
132 */ 135 */
133 smp_mb(); 136 if (atomic_dec_and_test(&work->pending)) {
137 struct backing_dev_info *bdi = wb->bdi;
134 138
135 /* avoid the locking if we can */ 139 spin_lock(&bdi->wb_lock);
136 if ((inode->i_state & flags) == flags) 140 list_del_rcu(&work->list);
137 return; 141 spin_unlock(&bdi->wb_lock);
138 142
139 if (unlikely(block_dump)) 143 wb_work_complete(work);
140 block_dump___mark_inode_dirty(inode); 144 }
141 145}
142 spin_lock(&inode_lock);
143 if ((inode->i_state & flags) != flags) {
144 const int was_dirty = inode->i_state & I_DIRTY;
145 146
146 inode->i_state |= flags; 147static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
148{
149 if (work) {
150 work->seen = bdi->wb_mask;
151 BUG_ON(!work->seen);
152 atomic_set(&work->pending, bdi->wb_cnt);
153 BUG_ON(!bdi->wb_cnt);
147 154
148 /* 155 /*
149 * If the inode is being synced, just update its dirty state. 156 * Make sure stores are seen before it appears on the list
150 * The unlocker will place the inode on the appropriate
151 * superblock list, based upon its state.
152 */ 157 */
153 if (inode->i_state & I_SYNC) 158 smp_mb();
154 goto out;
155 159
156 /* 160 spin_lock(&bdi->wb_lock);
157 * Only add valid (hashed) inodes to the superblock's 161 list_add_tail_rcu(&work->list, &bdi->work_list);
158 * dirty list. Add blockdev inodes as well. 162 spin_unlock(&bdi->wb_lock);
159 */ 163 }
160 if (!S_ISBLK(inode->i_mode)) { 164
161 if (hlist_unhashed(&inode->i_hash)) 165 /*
162 goto out; 166 * If the default thread isn't there, make sure we add it. When
163 } 167 * it gets created and wakes up, we'll run this work.
164 if (inode->i_state & (I_FREEING|I_CLEAR)) 168 */
165 goto out; 169 if (unlikely(list_empty_careful(&bdi->wb_list)))
170 wake_up_process(default_backing_dev_info.wb.task);
171 else {
172 struct bdi_writeback *wb = &bdi->wb;
166 173
167 /* 174 /*
168 * If the inode was already on s_dirty/s_io/s_more_io, don't 175 * If we failed allocating the bdi work item, wake up the wb
169 * reposition it (that would break s_dirty time-ordering). 176 * thread always. As a safety precaution, it'll flush out
177 * everything
170 */ 178 */
171 if (!was_dirty) { 179 if (!wb_has_dirty_io(wb)) {
172 inode->dirtied_when = jiffies; 180 if (work)
173 list_move(&inode->i_list, &sb->s_dirty); 181 wb_clear_pending(wb, work);
174 } 182 } else if (wb->task)
183 wake_up_process(wb->task);
175 } 184 }
176out:
177 spin_unlock(&inode_lock);
178} 185}
179 186
180EXPORT_SYMBOL(__mark_inode_dirty); 187/*
188 * Used for on-stack allocated work items. The caller needs to wait until
189 * the wb threads have acked the work before it's safe to continue.
190 */
191static void bdi_wait_on_work_clear(struct bdi_work *work)
192{
193 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
194 TASK_UNINTERRUPTIBLE);
195}
181 196
182static int write_inode(struct inode *inode, int sync) 197static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
183{ 198{
184 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 199 struct bdi_work *work;
185 return inode->i_sb->s_op->write_inode(inode, sync); 200
186 return 0; 201 work = kmalloc(sizeof(*work), GFP_ATOMIC);
202 if (work)
203 bdi_work_init(work, wbc);
204
205 return work;
206}
207
208void bdi_start_writeback(struct writeback_control *wbc)
209{
210 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
211 struct bdi_work work_stack, *work = NULL;
212
213 if (!must_wait)
214 work = bdi_alloc_work(wbc);
215
216 if (!work) {
217 work = &work_stack;
218 bdi_work_init_on_stack(work, wbc);
219 }
220
221 bdi_queue_work(wbc->bdi, work);
222
223 /*
224 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
225 * complete. If not, we only need to wait for the work to be started,
226 * if we allocated it on-stack. We use the same mechanism, if the
227 * wait bit is set in the bdi_work struct, then threads will not
228 * clear pending until after they are done.
229 *
230 * Note that work == &work_stack if must_wait is true, so we don't
231 * need to do call_rcu() here ever, since the completion path will
232 * have done that for us.
233 */
234 if (must_wait || work == &work_stack) {
235 bdi_wait_on_work_clear(work);
236 if (work != &work_stack)
237 call_rcu(&work->rcu_head, bdi_work_free);
238 }
187} 239}
188 240
189/* 241/*
@@ -191,31 +243,32 @@ static int write_inode(struct inode *inode, int sync)
191 * furthest end of its superblock's dirty-inode list. 243 * furthest end of its superblock's dirty-inode list.
192 * 244 *
193 * Before stamping the inode's ->dirtied_when, we check to see whether it is 245 * Before stamping the inode's ->dirtied_when, we check to see whether it is
194 * already the most-recently-dirtied inode on the s_dirty list. If that is 246 * already the most-recently-dirtied inode on the b_dirty list. If that is
195 * the case then the inode must have been redirtied while it was being written 247 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 248 * out and we don't reset its dirtied_when.
197 */ 249 */
198static void redirty_tail(struct inode *inode) 250static void redirty_tail(struct inode *inode)
199{ 251{
200 struct super_block *sb = inode->i_sb; 252 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
201 253
202 if (!list_empty(&sb->s_dirty)) { 254 if (!list_empty(&wb->b_dirty)) {
203 struct inode *tail_inode; 255 struct inode *tail;
204 256
205 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 257 tail = list_entry(wb->b_dirty.next, struct inode, i_list);
206 if (time_before(inode->dirtied_when, 258 if (time_before(inode->dirtied_when, tail->dirtied_when))
207 tail_inode->dirtied_when))
208 inode->dirtied_when = jiffies; 259 inode->dirtied_when = jiffies;
209 } 260 }
210 list_move(&inode->i_list, &sb->s_dirty); 261 list_move(&inode->i_list, &wb->b_dirty);
211} 262}
212 263
213/* 264/*
214 * requeue inode for re-scanning after sb->s_io list is exhausted. 265 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 266 */
216static void requeue_io(struct inode *inode) 267static void requeue_io(struct inode *inode)
217{ 268{
218 list_move(&inode->i_list, &inode->i_sb->s_more_io); 269 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
270
271 list_move(&inode->i_list, &wb->b_more_io);
219} 272}
220 273
221static void inode_sync_complete(struct inode *inode) 274static void inode_sync_complete(struct inode *inode)
@@ -262,20 +315,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
262/* 315/*
263 * Queue all expired dirty inodes for io, eldest first. 316 * Queue all expired dirty inodes for io, eldest first.
264 */ 317 */
265static void queue_io(struct super_block *sb, 318static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
266 unsigned long *older_than_this)
267{ 319{
268 list_splice_init(&sb->s_more_io, sb->s_io.prev); 320 list_splice_init(&wb->b_more_io, wb->b_io.prev);
269 move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); 321 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
270} 322}
271 323
272int sb_has_dirty_inodes(struct super_block *sb) 324static int write_inode(struct inode *inode, int sync)
273{ 325{
274 return !list_empty(&sb->s_dirty) || 326 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
275 !list_empty(&sb->s_io) || 327 return inode->i_sb->s_op->write_inode(inode, sync);
276 !list_empty(&sb->s_more_io); 328 return 0;
277} 329}
278EXPORT_SYMBOL(sb_has_dirty_inodes);
279 330
280/* 331/*
281 * Wait for writeback on an inode to complete. 332 * Wait for writeback on an inode to complete.
@@ -322,11 +373,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
322 if (inode->i_state & I_SYNC) { 373 if (inode->i_state & I_SYNC) {
323 /* 374 /*
324 * If this inode is locked for writeback and we are not doing 375 * If this inode is locked for writeback and we are not doing
325 * writeback-for-data-integrity, move it to s_more_io so that 376 * writeback-for-data-integrity, move it to b_more_io so that
326 * writeback can proceed with the other inodes on s_io. 377 * writeback can proceed with the other inodes on s_io.
327 * 378 *
328 * We'll have another go at writing back this inode when we 379 * We'll have another go at writing back this inode when we
329 * completed a full scan of s_io. 380 * completed a full scan of b_io.
330 */ 381 */
331 if (!wait) { 382 if (!wait) {
332 requeue_io(inode); 383 requeue_io(inode);
@@ -371,11 +422,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
371 /* 422 /*
372 * We didn't write back all the pages. nfs_writepages() 423 * We didn't write back all the pages. nfs_writepages()
373 * sometimes bales out without doing anything. Redirty 424 * sometimes bales out without doing anything. Redirty
374 * the inode; Move it from s_io onto s_more_io/s_dirty. 425 * the inode; Move it from b_io onto b_more_io/b_dirty.
375 */ 426 */
376 /* 427 /*
377 * akpm: if the caller was the kupdate function we put 428 * akpm: if the caller was the kupdate function we put
378 * this inode at the head of s_dirty so it gets first 429 * this inode at the head of b_dirty so it gets first
379 * consideration. Otherwise, move it to the tail, for 430 * consideration. Otherwise, move it to the tail, for
380 * the reasons described there. I'm not really sure 431 * the reasons described there. I'm not really sure
381 * how much sense this makes. Presumably I had a good 432 * how much sense this makes. Presumably I had a good
@@ -385,7 +436,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
385 if (wbc->for_kupdate) { 436 if (wbc->for_kupdate) {
386 /* 437 /*
387 * For the kupdate function we move the inode 438 * For the kupdate function we move the inode
388 * to s_more_io so it will get more writeout as 439 * to b_more_io so it will get more writeout as
389 * soon as the queue becomes uncongested. 440 * soon as the queue becomes uncongested.
390 */ 441 */
391 inode->i_state |= I_DIRTY_PAGES; 442 inode->i_state |= I_DIRTY_PAGES;
@@ -434,50 +485,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
434} 485}
435 486
436/* 487/*
437 * Write out a superblock's list of dirty inodes. A wait will be performed 488 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
438 * upon no inodes, all inodes or the final one, depending upon sync_mode. 489 * before calling writeback. So make sure that we do pin it, so it doesn't
439 * 490 * go away while we are writing inodes from it.
440 * If older_than_this is non-NULL, then only write out inodes which
441 * had their first dirtying at a time earlier than *older_than_this.
442 *
443 * If we're a pdflush thread, then implement pdflush collision avoidance
444 * against the entire list.
445 * 491 *
446 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 492 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
447 * This function assumes that the blockdev superblock's inodes are backed by 493 * 1 if we failed.
448 * a variety of queues, so all inodes are searched. For other superblocks,
449 * assume that all inodes are backed by the same queue.
450 *
451 * FIXME: this linear search could get expensive with many fileystems. But
452 * how to fix? We need to go from an address_space to all inodes which share
453 * a queue with that address_space. (Easy: have a global "dirty superblocks"
454 * list).
455 *
456 * The inodes to be written are parked on sb->s_io. They are moved back onto
457 * sb->s_dirty as they are selected for writing. This way, none can be missed
458 * on the writer throttling path, and we get decent balancing between many
459 * throttled threads: we don't want them all piling up on inode_sync_wait.
460 */ 494 */
461void generic_sync_sb_inodes(struct super_block *sb, 495static int pin_sb_for_writeback(struct writeback_control *wbc,
496 struct inode *inode)
497{
498 struct super_block *sb = inode->i_sb;
499
500 /*
501 * Caller must already hold the ref for this
502 */
503 if (wbc->sync_mode == WB_SYNC_ALL) {
504 WARN_ON(!rwsem_is_locked(&sb->s_umount));
505 return 0;
506 }
507
508 spin_lock(&sb_lock);
509 sb->s_count++;
510 if (down_read_trylock(&sb->s_umount)) {
511 if (sb->s_root) {
512 spin_unlock(&sb_lock);
513 return 0;
514 }
515 /*
516 * umounted, drop rwsem again and fall through to failure
517 */
518 up_read(&sb->s_umount);
519 }
520
521 sb->s_count--;
522 spin_unlock(&sb_lock);
523 return 1;
524}
525
526static void unpin_sb_for_writeback(struct writeback_control *wbc,
527 struct inode *inode)
528{
529 struct super_block *sb = inode->i_sb;
530
531 if (wbc->sync_mode == WB_SYNC_ALL)
532 return;
533
534 up_read(&sb->s_umount);
535 put_super(sb);
536}
537
538static void writeback_inodes_wb(struct bdi_writeback *wb,
462 struct writeback_control *wbc) 539 struct writeback_control *wbc)
463{ 540{
541 struct super_block *sb = wbc->sb;
542 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
464 const unsigned long start = jiffies; /* livelock avoidance */ 543 const unsigned long start = jiffies; /* livelock avoidance */
465 int sync = wbc->sync_mode == WB_SYNC_ALL;
466 544
467 spin_lock(&inode_lock); 545 spin_lock(&inode_lock);
468 if (!wbc->for_kupdate || list_empty(&sb->s_io))
469 queue_io(sb, wbc->older_than_this);
470 546
471 while (!list_empty(&sb->s_io)) { 547 if (!wbc->for_kupdate || list_empty(&wb->b_io))
472 struct inode *inode = list_entry(sb->s_io.prev, 548 queue_io(wb, wbc->older_than_this);
549
550 while (!list_empty(&wb->b_io)) {
551 struct inode *inode = list_entry(wb->b_io.prev,
473 struct inode, i_list); 552 struct inode, i_list);
474 struct address_space *mapping = inode->i_mapping;
475 struct backing_dev_info *bdi = mapping->backing_dev_info;
476 long pages_skipped; 553 long pages_skipped;
477 554
478 if (!bdi_cap_writeback_dirty(bdi)) { 555 /*
556 * super block given and doesn't match, skip this inode
557 */
558 if (sb && sb != inode->i_sb) {
559 redirty_tail(inode);
560 continue;
561 }
562
563 if (!bdi_cap_writeback_dirty(wb->bdi)) {
479 redirty_tail(inode); 564 redirty_tail(inode);
480 if (sb_is_blkdev_sb(sb)) { 565 if (is_blkdev_sb) {
481 /* 566 /*
482 * Dirty memory-backed blockdev: the ramdisk 567 * Dirty memory-backed blockdev: the ramdisk
483 * driver does this. Skip just this inode 568 * driver does this. Skip just this inode
@@ -497,21 +582,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
497 continue; 582 continue;
498 } 583 }
499 584
500 if (wbc->nonblocking && bdi_write_congested(bdi)) { 585 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
501 wbc->encountered_congestion = 1; 586 wbc->encountered_congestion = 1;
502 if (!sb_is_blkdev_sb(sb)) 587 if (!is_blkdev_sb)
503 break; /* Skip a congested fs */ 588 break; /* Skip a congested fs */
504 requeue_io(inode); 589 requeue_io(inode);
505 continue; /* Skip a congested blockdev */ 590 continue; /* Skip a congested blockdev */
506 } 591 }
507 592
508 if (wbc->bdi && bdi != wbc->bdi) {
509 if (!sb_is_blkdev_sb(sb))
510 break; /* fs has the wrong queue */
511 requeue_io(inode);
512 continue; /* blockdev has wrong queue */
513 }
514
515 /* 593 /*
516 * Was this inode dirtied after sync_sb_inodes was called? 594 * Was this inode dirtied after sync_sb_inodes was called?
517 * This keeps sync from extra jobs and livelock. 595 * This keeps sync from extra jobs and livelock.
@@ -519,16 +597,16 @@ void generic_sync_sb_inodes(struct super_block *sb,
519 if (inode_dirtied_after(inode, start)) 597 if (inode_dirtied_after(inode, start))
520 break; 598 break;
521 599
522 /* Is another pdflush already flushing this queue? */ 600 if (pin_sb_for_writeback(wbc, inode)) {
523 if (current_is_pdflush() && !writeback_acquire(bdi)) 601 requeue_io(inode);
524 break; 602 continue;
603 }
525 604
526 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 605 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
527 __iget(inode); 606 __iget(inode);
528 pages_skipped = wbc->pages_skipped; 607 pages_skipped = wbc->pages_skipped;
529 writeback_single_inode(inode, wbc); 608 writeback_single_inode(inode, wbc);
530 if (current_is_pdflush()) 609 unpin_sb_for_writeback(wbc, inode);
531 writeback_release(bdi);
532 if (wbc->pages_skipped != pages_skipped) { 610 if (wbc->pages_skipped != pages_skipped) {
533 /* 611 /*
534 * writeback is not making progress due to locked 612 * writeback is not making progress due to locked
@@ -544,144 +622,571 @@ void generic_sync_sb_inodes(struct super_block *sb,
544 wbc->more_io = 1; 622 wbc->more_io = 1;
545 break; 623 break;
546 } 624 }
547 if (!list_empty(&sb->s_more_io)) 625 if (!list_empty(&wb->b_more_io))
548 wbc->more_io = 1; 626 wbc->more_io = 1;
549 } 627 }
550 628
551 if (sync) { 629 spin_unlock(&inode_lock);
552 struct inode *inode, *old_inode = NULL; 630 /* Leave any unwritten inodes on b_io */
631}
632
633void writeback_inodes_wbc(struct writeback_control *wbc)
634{
635 struct backing_dev_info *bdi = wbc->bdi;
553 636
637 writeback_inodes_wb(&bdi->wb, wbc);
638}
639
640/*
641 * The maximum number of pages to writeout in a single bdi flush/kupdate
642 * operation. We do this so we don't hold I_SYNC against an inode for
643 * enormous amounts of time, which would block a userspace task which has
644 * been forced to throttle against that inode. Also, the code reevaluates
645 * the dirty each time it has written this many pages.
646 */
647#define MAX_WRITEBACK_PAGES 1024
648
649static inline bool over_bground_thresh(void)
650{
651 unsigned long background_thresh, dirty_thresh;
652
653 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
654
655 return (global_page_state(NR_FILE_DIRTY) +
656 global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
657}
658
659/*
660 * Explicit flushing or periodic writeback of "old" data.
661 *
662 * Define "old": the first time one of an inode's pages is dirtied, we mark the
663 * dirtying-time in the inode's address_space. So this periodic writeback code
664 * just walks the superblock inode list, writing back any inodes which are
665 * older than a specific point in time.
666 *
667 * Try to run once per dirty_writeback_interval. But if a writeback event
668 * takes longer than a dirty_writeback_interval interval, then leave a
669 * one-second gap.
670 *
671 * older_than_this takes precedence over nr_to_write. So we'll only write back
672 * all dirty pages if they are all attached to "old" mappings.
673 */
674static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
675 struct super_block *sb,
676 enum writeback_sync_modes sync_mode, int for_kupdate)
677{
678 struct writeback_control wbc = {
679 .bdi = wb->bdi,
680 .sb = sb,
681 .sync_mode = sync_mode,
682 .older_than_this = NULL,
683 .for_kupdate = for_kupdate,
684 .range_cyclic = 1,
685 };
686 unsigned long oldest_jif;
687 long wrote = 0;
688
689 if (wbc.for_kupdate) {
690 wbc.older_than_this = &oldest_jif;
691 oldest_jif = jiffies -
692 msecs_to_jiffies(dirty_expire_interval * 10);
693 }
694
695 for (;;) {
554 /* 696 /*
555 * Data integrity sync. Must wait for all pages under writeback, 697 * Don't flush anything for non-integrity writeback where
556 * because there may have been pages dirtied before our sync 698 * no nr_pages was given
557 * call, but which had writeout started before we write it out.
558 * In which case, the inode may not be on the dirty list, but
559 * we still have to wait for that writeout.
560 */ 699 */
561 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 700 if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
562 struct address_space *mapping; 701 break;
563 702
564 if (inode->i_state & 703 /*
565 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 704 * If no specific pages were given and this is just a
566 continue; 705 * periodic background writeout and we are below the
567 mapping = inode->i_mapping; 706 * background dirty threshold, don't do anything
568 if (mapping->nrpages == 0) 707 */
708 if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
709 break;
710
711 wbc.more_io = 0;
712 wbc.encountered_congestion = 0;
713 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
714 wbc.pages_skipped = 0;
715 writeback_inodes_wb(wb, &wbc);
716 nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
717 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
718
719 /*
720 * If we ran out of stuff to write, bail unless more_io got set
721 */
722 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
723 if (wbc.more_io && !wbc.for_kupdate)
569 continue; 724 continue;
570 __iget(inode); 725 break;
571 spin_unlock(&inode_lock); 726 }
727 }
728
729 return wrote;
730}
731
732/*
733 * Return the next bdi_work struct that hasn't been processed by this
734 * wb thread yet
735 */
736static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
737 struct bdi_writeback *wb)
738{
739 struct bdi_work *work, *ret = NULL;
740
741 rcu_read_lock();
742
743 list_for_each_entry_rcu(work, &bdi->work_list, list) {
744 if (!test_and_clear_bit(wb->nr, &work->seen))
745 continue;
746
747 ret = work;
748 break;
749 }
750
751 rcu_read_unlock();
752 return ret;
753}
754
755static long wb_check_old_data_flush(struct bdi_writeback *wb)
756{
757 unsigned long expired;
758 long nr_pages;
759
760 expired = wb->last_old_flush +
761 msecs_to_jiffies(dirty_writeback_interval * 10);
762 if (time_before(jiffies, expired))
763 return 0;
764
765 wb->last_old_flush = jiffies;
766 nr_pages = global_page_state(NR_FILE_DIRTY) +
767 global_page_state(NR_UNSTABLE_NFS) +
768 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
769
770 if (nr_pages)
771 return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
772
773 return 0;
774}
775
776/*
777 * Retrieve work items and do the writeback they describe
778 */
779long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
780{
781 struct backing_dev_info *bdi = wb->bdi;
782 struct bdi_work *work;
783 long nr_pages, wrote = 0;
784
785 while ((work = get_next_work_item(bdi, wb)) != NULL) {
786 enum writeback_sync_modes sync_mode;
787
788 nr_pages = work->nr_pages;
789
790 /*
791 * Override sync mode, in case we must wait for completion
792 */
793 if (force_wait)
794 work->sync_mode = sync_mode = WB_SYNC_ALL;
795 else
796 sync_mode = work->sync_mode;
797
798 /*
799 * If this isn't a data integrity operation, just notify
800 * that we have seen this work and we are now starting it.
801 */
802 if (sync_mode == WB_SYNC_NONE)
803 wb_clear_pending(wb, work);
804
805 wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
806
807 /*
808 * This is a data integrity writeback, so only do the
809 * notification when we have completed the work.
810 */
811 if (sync_mode == WB_SYNC_ALL)
812 wb_clear_pending(wb, work);
813 }
814
815 /*
816 * Check for periodic writeback, kupdated() style
817 */
818 wrote += wb_check_old_data_flush(wb);
819
820 return wrote;
821}
822
823/*
824 * Handle writeback of dirty data for the device backed by this bdi. Also
825 * wakes up periodically and does kupdated style flushing.
826 */
827int bdi_writeback_task(struct bdi_writeback *wb)
828{
829 unsigned long last_active = jiffies;
830 unsigned long wait_jiffies = -1UL;
831 long pages_written;
832
833 while (!kthread_should_stop()) {
834 pages_written = wb_do_writeback(wb, 0);
835
836 if (pages_written)
837 last_active = jiffies;
838 else if (wait_jiffies != -1UL) {
839 unsigned long max_idle;
840
572 /* 841 /*
573 * We hold a reference to 'inode' so it couldn't have 842 * Longest period of inactivity that we tolerate. If we
574 * been removed from s_inodes list while we dropped the 843 * see dirty data again later, the task will get
575 * inode_lock. We cannot iput the inode now as we can 844 * recreated automatically.
576 * be holding the last reference and we cannot iput it
577 * under inode_lock. So we keep the reference and iput
578 * it later.
579 */ 845 */
580 iput(old_inode); 846 max_idle = max(5UL * 60 * HZ, wait_jiffies);
581 old_inode = inode; 847 if (time_after(jiffies, max_idle + last_active))
848 break;
849 }
582 850
583 filemap_fdatawait(mapping); 851 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
852 set_current_state(TASK_INTERRUPTIBLE);
853 schedule_timeout(wait_jiffies);
854 try_to_freeze();
855 }
584 856
585 cond_resched(); 857 return 0;
858}
859
860/*
861 * Schedule writeback for all backing devices. Expensive! If this is a data
862 * integrity operation, writeback will be complete when this returns. If
863 * we are simply called for WB_SYNC_NONE, then writeback will merely be
864 * scheduled to run.
865 */
866static void bdi_writeback_all(struct writeback_control *wbc)
867{
868 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
869 struct backing_dev_info *bdi;
870 struct bdi_work *work;
871 LIST_HEAD(list);
872
873restart:
874 spin_lock(&bdi_lock);
875
876 list_for_each_entry(bdi, &bdi_list, bdi_list) {
877 struct bdi_work *work;
586 878
587 spin_lock(&inode_lock); 879 if (!bdi_has_dirty_io(bdi))
880 continue;
881
882 /*
883 * If work allocation fails, do the writes inline. We drop
884 * the lock and restart the list writeout. This should be OK,
885 * since this happens rarely and because the writeout should
886 * eventually make more free memory available.
887 */
888 work = bdi_alloc_work(wbc);
889 if (!work) {
890 struct writeback_control __wbc;
891
892 /*
893 * Not a data integrity writeout, just continue
894 */
895 if (!must_wait)
896 continue;
897
898 spin_unlock(&bdi_lock);
899 __wbc = *wbc;
900 __wbc.bdi = bdi;
901 writeback_inodes_wbc(&__wbc);
902 goto restart;
588 } 903 }
589 spin_unlock(&inode_lock); 904 if (must_wait)
590 iput(old_inode); 905 list_add_tail(&work->wait_list, &list);
591 } else 906
592 spin_unlock(&inode_lock); 907 bdi_queue_work(bdi, work);
908 }
909
910 spin_unlock(&bdi_lock);
593 911
594 return; /* Leave any unwritten inodes on s_io */ 912 /*
913 * If this is for WB_SYNC_ALL, wait for pending work to complete
914 * before returning.
915 */
916 while (!list_empty(&list)) {
917 work = list_entry(list.next, struct bdi_work, wait_list);
918 list_del(&work->wait_list);
919 bdi_wait_on_work_clear(work);
920 call_rcu(&work->rcu_head, bdi_work_free);
921 }
595} 922}
596EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
597 923
598static void sync_sb_inodes(struct super_block *sb, 924/*
599 struct writeback_control *wbc) 925 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
926 * the whole world.
927 */
928void wakeup_flusher_threads(long nr_pages)
600{ 929{
601 generic_sync_sb_inodes(sb, wbc); 930 struct writeback_control wbc = {
931 .sync_mode = WB_SYNC_NONE,
932 .older_than_this = NULL,
933 .range_cyclic = 1,
934 };
935
936 if (nr_pages == 0)
937 nr_pages = global_page_state(NR_FILE_DIRTY) +
938 global_page_state(NR_UNSTABLE_NFS);
939 wbc.nr_to_write = nr_pages;
940 bdi_writeback_all(&wbc);
602} 941}
603 942
604/* 943static noinline void block_dump___mark_inode_dirty(struct inode *inode)
605 * Start writeback of dirty pagecache data against all unlocked inodes. 944{
945 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
946 struct dentry *dentry;
947 const char *name = "?";
948
949 dentry = d_find_alias(inode);
950 if (dentry) {
951 spin_lock(&dentry->d_lock);
952 name = (const char *) dentry->d_name.name;
953 }
954 printk(KERN_DEBUG
955 "%s(%d): dirtied inode %lu (%s) on %s\n",
956 current->comm, task_pid_nr(current), inode->i_ino,
957 name, inode->i_sb->s_id);
958 if (dentry) {
959 spin_unlock(&dentry->d_lock);
960 dput(dentry);
961 }
962 }
963}
964
965/**
966 * __mark_inode_dirty - internal function
967 * @inode: inode to mark
968 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
969 * Mark an inode as dirty. Callers should use mark_inode_dirty or
970 * mark_inode_dirty_sync.
606 * 971 *
607 * Note: 972 * Put the inode on the super block's dirty list.
608 * We don't need to grab a reference to superblock here. If it has non-empty 973 *
609 * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed 974 * CAREFUL! We mark it dirty unconditionally, but move it onto the
610 * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all 975 * dirty list only if it is hashed or if it refers to a blockdev.
611 * empty. Since __sync_single_inode() regains inode_lock before it finally moves 976 * If it was not hashed, it will never be added to the dirty list
612 * inode from superblock lists we are OK. 977 * even if it is later hashed, as it will have been marked dirty already.
613 * 978 *
614 * If `older_than_this' is non-zero then only flush inodes which have a 979 * In short, make sure you hash any inodes _before_ you start marking
615 * flushtime older than *older_than_this. 980 * them dirty.
616 * 981 *
617 * If `bdi' is non-zero then we will scan the first inode against each 982 * This function *must* be atomic for the I_DIRTY_PAGES case -
618 * superblock until we find the matching ones. One group will be the dirty 983 * set_page_dirty() is called under spinlock in several places.
619 * inodes against a filesystem. Then when we hit the dummy blockdev superblock, 984 *
620 * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not 985 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
621 * super-efficient but we're about to do a ton of I/O... 986 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
987 * the kernel-internal blockdev inode represents the dirtying time of the
988 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
989 * page->mapping->host, so the page-dirtying time is recorded in the internal
990 * blockdev inode.
622 */ 991 */
623void 992void __mark_inode_dirty(struct inode *inode, int flags)
624writeback_inodes(struct writeback_control *wbc)
625{ 993{
626 struct super_block *sb; 994 struct super_block *sb = inode->i_sb;
627 995
628 might_sleep(); 996 /*
629 spin_lock(&sb_lock); 997 * Don't do this for I_DIRTY_PAGES - that doesn't actually
630restart: 998 * dirty the inode itself
631 list_for_each_entry_reverse(sb, &super_blocks, s_list) { 999 */
632 if (sb_has_dirty_inodes(sb)) { 1000 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
633 /* we're making our own get_super here */ 1001 if (sb->s_op->dirty_inode)
634 sb->s_count++; 1002 sb->s_op->dirty_inode(inode);
635 spin_unlock(&sb_lock); 1003 }
636 /* 1004
637 * If we can't get the readlock, there's no sense in 1005 /*
638 * waiting around, most of the time the FS is going to 1006 * make sure that changes are seen by all cpus before we test i_state
639 * be unmounted by the time it is released. 1007 * -- mikulas
640 */ 1008 */
641 if (down_read_trylock(&sb->s_umount)) { 1009 smp_mb();
642 if (sb->s_root) 1010
643 sync_sb_inodes(sb, wbc); 1011 /* avoid the locking if we can */
644 up_read(&sb->s_umount); 1012 if ((inode->i_state & flags) == flags)
1013 return;
1014
1015 if (unlikely(block_dump))
1016 block_dump___mark_inode_dirty(inode);
1017
1018 spin_lock(&inode_lock);
1019 if ((inode->i_state & flags) != flags) {
1020 const int was_dirty = inode->i_state & I_DIRTY;
1021
1022 inode->i_state |= flags;
1023
1024 /*
1025 * If the inode is being synced, just update its dirty state.
1026 * The unlocker will place the inode on the appropriate
1027 * superblock list, based upon its state.
1028 */
1029 if (inode->i_state & I_SYNC)
1030 goto out;
1031
1032 /*
1033 * Only add valid (hashed) inodes to the superblock's
1034 * dirty list. Add blockdev inodes as well.
1035 */
1036 if (!S_ISBLK(inode->i_mode)) {
1037 if (hlist_unhashed(&inode->i_hash))
1038 goto out;
1039 }
1040 if (inode->i_state & (I_FREEING|I_CLEAR))
1041 goto out;
1042
1043 /*
1044 * If the inode was already on b_dirty/b_io/b_more_io, don't
1045 * reposition it (that would break b_dirty time-ordering).
1046 */
1047 if (!was_dirty) {
1048 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1049 struct backing_dev_info *bdi = wb->bdi;
1050
1051 if (bdi_cap_writeback_dirty(bdi) &&
1052 !test_bit(BDI_registered, &bdi->state)) {
1053 WARN_ON(1);
1054 printk(KERN_ERR "bdi-%s not registered\n",
1055 bdi->name);
645 } 1056 }
646 spin_lock(&sb_lock); 1057
647 if (__put_super_and_need_restart(sb)) 1058 inode->dirtied_when = jiffies;
648 goto restart; 1059 list_move(&inode->i_list, &wb->b_dirty);
649 } 1060 }
650 if (wbc->nr_to_write <= 0)
651 break;
652 } 1061 }
653 spin_unlock(&sb_lock); 1062out:
1063 spin_unlock(&inode_lock);
654} 1064}
1065EXPORT_SYMBOL(__mark_inode_dirty);
655 1066
656/* 1067/*
657 * writeback and wait upon the filesystem's dirty inodes. The caller will 1068 * Write out a superblock's list of dirty inodes. A wait will be performed
658 * do this in two passes - one to write, and one to wait. 1069 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1070 *
1071 * If older_than_this is non-NULL, then only write out inodes which
1072 * had their first dirtying at a time earlier than *older_than_this.
659 * 1073 *
660 * A finite limit is set on the number of pages which will be written. 1074 * If we're a pdlfush thread, then implement pdflush collision avoidance
661 * To prevent infinite livelock of sys_sync(). 1075 * against the entire list.
662 * 1076 *
663 * We add in the number of potentially dirty inodes, because each inode write 1077 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
664 * can dirty pagecache in the underlying blockdev. 1078 * This function assumes that the blockdev superblock's inodes are backed by
1079 * a variety of queues, so all inodes are searched. For other superblocks,
1080 * assume that all inodes are backed by the same queue.
1081 *
1082 * The inodes to be written are parked on bdi->b_io. They are moved back onto
1083 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1084 * on the writer throttling path, and we get decent balancing between many
1085 * throttled threads: we don't want them all piling up on inode_sync_wait.
665 */ 1086 */
666void sync_inodes_sb(struct super_block *sb, int wait) 1087static void wait_sb_inodes(struct writeback_control *wbc)
1088{
1089 struct inode *inode, *old_inode = NULL;
1090
1091 /*
1092 * We need to be protected against the filesystem going from
1093 * r/o to r/w or vice versa.
1094 */
1095 WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
1096
1097 spin_lock(&inode_lock);
1098
1099 /*
1100 * Data integrity sync. Must wait for all pages under writeback,
1101 * because there may have been pages dirtied before our sync
1102 * call, but which had writeout started before we write it out.
1103 * In which case, the inode may not be on the dirty list, but
1104 * we still have to wait for that writeout.
1105 */
1106 list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
1107 struct address_space *mapping;
1108
1109 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
1110 continue;
1111 mapping = inode->i_mapping;
1112 if (mapping->nrpages == 0)
1113 continue;
1114 __iget(inode);
1115 spin_unlock(&inode_lock);
1116 /*
1117 * We hold a reference to 'inode' so it couldn't have
1118 * been removed from s_inodes list while we dropped the
1119 * inode_lock. We cannot iput the inode now as we can
1120 * be holding the last reference and we cannot iput it
1121 * under inode_lock. So we keep the reference and iput
1122 * it later.
1123 */
1124 iput(old_inode);
1125 old_inode = inode;
1126
1127 filemap_fdatawait(mapping);
1128
1129 cond_resched();
1130
1131 spin_lock(&inode_lock);
1132 }
1133 spin_unlock(&inode_lock);
1134 iput(old_inode);
1135}
1136
1137/**
1138 * writeback_inodes_sb - writeback dirty inodes from given super_block
1139 * @sb: the superblock
1140 *
1141 * Start writeback on some inodes on this super_block. No guarantees are made
1142 * on how many (if any) will be written, and this function does not wait
1143 * for IO completion of submitted IO. The number of pages submitted is
1144 * returned.
1145 */
1146long writeback_inodes_sb(struct super_block *sb)
667{ 1147{
668 struct writeback_control wbc = { 1148 struct writeback_control wbc = {
669 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 1149 .sb = sb,
1150 .sync_mode = WB_SYNC_NONE,
670 .range_start = 0, 1151 .range_start = 0,
671 .range_end = LLONG_MAX, 1152 .range_end = LLONG_MAX,
672 }; 1153 };
1154 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1155 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1156 long nr_to_write;
673 1157
674 if (!wait) { 1158 nr_to_write = nr_dirty + nr_unstable +
675 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
676 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
677
678 wbc.nr_to_write = nr_dirty + nr_unstable +
679 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1159 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
680 } else
681 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
682 1160
683 sync_sb_inodes(sb, &wbc); 1161 wbc.nr_to_write = nr_to_write;
1162 bdi_writeback_all(&wbc);
1163 return nr_to_write - wbc.nr_to_write;
1164}
1165EXPORT_SYMBOL(writeback_inodes_sb);
1166
1167/**
1168 * sync_inodes_sb - sync sb inode pages
1169 * @sb: the superblock
1170 *
1171 * This function writes and waits on any dirty inode belonging to this
1172 * super_block. The number of pages synced is returned.
1173 */
1174long sync_inodes_sb(struct super_block *sb)
1175{
1176 struct writeback_control wbc = {
1177 .sb = sb,
1178 .sync_mode = WB_SYNC_ALL,
1179 .range_start = 0,
1180 .range_end = LLONG_MAX,
1181 };
1182 long nr_to_write = LONG_MAX; /* doesn't actually matter */
1183
1184 wbc.nr_to_write = nr_to_write;
1185 bdi_writeback_all(&wbc);
1186 wait_sb_inodes(&wbc);
1187 return nr_to_write - wbc.nr_to_write;
684} 1188}
1189EXPORT_SYMBOL(sync_inodes_sb);
685 1190
686/** 1191/**
687 * write_inode_now - write an inode to disk 1192 * write_inode_now - write an inode to disk
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..4567db6f9430 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -801,6 +801,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
801{ 801{
802 int err; 802 int err;
803 803
804 fc->bdi.name = "fuse";
804 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 805 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
805 fc->bdi.unplug_io_fn = default_unplug_io_fn; 806 fc->bdi.unplug_io_fn = default_unplug_io_fn;
806 /* fuse does it's own writeback accounting */ 807 /* fuse does it's own writeback accounting */
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 23419dc3027b..a7cbfbd340c7 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -386,16 +386,16 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
386#define GDLM_ATTR(_name,_mode,_show,_store) \ 386#define GDLM_ATTR(_name,_mode,_show,_store) \
387static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) 387static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
388 388
389GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); 389GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
390GDLM_ATTR(block, 0644, block_show, block_store); 390GDLM_ATTR(block, 0644, block_show, block_store);
391GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 391GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
392GDLM_ATTR(id, 0444, lkid_show, NULL); 392GDLM_ATTR(id, 0444, lkid_show, NULL);
393GDLM_ATTR(jid, 0444, jid_show, NULL); 393GDLM_ATTR(jid, 0444, jid_show, NULL);
394GDLM_ATTR(first, 0444, lkfirst_show, NULL); 394GDLM_ATTR(first, 0444, lkfirst_show, NULL);
395GDLM_ATTR(first_done, 0444, first_done_show, NULL); 395GDLM_ATTR(first_done, 0444, first_done_show, NULL);
396GDLM_ATTR(recover, 0200, NULL, recover_store); 396GDLM_ATTR(recover, 0600, NULL, recover_store);
397GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); 397GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
398GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); 398GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
399 399
400static struct attribute *lock_module_attrs[] = { 400static struct attribute *lock_module_attrs[] = {
401 &gdlm_attr_proto_name.attr, 401 &gdlm_attr_proto_name.attr,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 941c8425c10b..a93b885311d8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
44static const struct inode_operations hugetlbfs_inode_operations; 44static const struct inode_operations hugetlbfs_inode_operations;
45 45
46static struct backing_dev_info hugetlbfs_backing_dev_info = { 46static struct backing_dev_info hugetlbfs_backing_dev_info = {
47 .name = "hugetlbfs",
47 .ra_pages = 0, /* No readahead */ 48 .ra_pages = 0, /* No readahead */
48 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 49 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
49}; 50};
@@ -935,26 +936,28 @@ static int can_do_hugetlb_shm(void)
935 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); 936 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
936} 937}
937 938
938struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) 939struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
940 struct user_struct **user)
939{ 941{
940 int error = -ENOMEM; 942 int error = -ENOMEM;
941 int unlock_shm = 0;
942 struct file *file; 943 struct file *file;
943 struct inode *inode; 944 struct inode *inode;
944 struct dentry *dentry, *root; 945 struct dentry *dentry, *root;
945 struct qstr quick_string; 946 struct qstr quick_string;
946 struct user_struct *user = current_user();
947 947
948 *user = NULL;
948 if (!hugetlbfs_vfsmount) 949 if (!hugetlbfs_vfsmount)
949 return ERR_PTR(-ENOENT); 950 return ERR_PTR(-ENOENT);
950 951
951 if (!can_do_hugetlb_shm()) { 952 if (!can_do_hugetlb_shm()) {
952 if (user_shm_lock(size, user)) { 953 *user = current_user();
953 unlock_shm = 1; 954 if (user_shm_lock(size, *user)) {
954 WARN_ONCE(1, 955 WARN_ONCE(1,
955 "Using mlock ulimits for SHM_HUGETLB deprecated\n"); 956 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
956 } else 957 } else {
958 *user = NULL;
957 return ERR_PTR(-EPERM); 959 return ERR_PTR(-EPERM);
960 }
958 } 961 }
959 962
960 root = hugetlbfs_vfsmount->mnt_root; 963 root = hugetlbfs_vfsmount->mnt_root;
@@ -996,8 +999,10 @@ out_inode:
996out_dentry: 999out_dentry:
997 dput(dentry); 1000 dput(dentry);
998out_shm_unlock: 1001out_shm_unlock:
999 if (unlock_shm) 1002 if (*user) {
1000 user_shm_unlock(size, user); 1003 user_shm_unlock(size, *user);
1004 *user = NULL;
1005 }
1001 return ERR_PTR(error); 1006 return ERR_PTR(error);
1002} 1007}
1003 1008
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 8fcb6239218e..7edb62e97419 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
258 return rc; 258 return rc;
259} 259}
260 260
261static int jffs2_check_acl(struct inode *inode, int mask) 261int jffs2_check_acl(struct inode *inode, int mask)
262{ 262{
263 struct posix_acl *acl; 263 struct posix_acl *acl;
264 int rc; 264 int rc;
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask)
274 return -EAGAIN; 274 return -EAGAIN;
275} 275}
276 276
277int jffs2_permission(struct inode *inode, int mask)
278{
279 return generic_permission(inode, mask, jffs2_check_acl);
280}
281
282int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) 277int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
283{ 278{
284 struct posix_acl *acl, *clone; 279 struct posix_acl *acl, *clone;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fc929f2a14f6..f0ba63e3c36b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
26 26
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29extern int jffs2_permission(struct inode *, int); 29extern int jffs2_check_acl(struct inode *, int);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
39#define jffs2_permission (NULL) 39#define jffs2_check_acl (NULL)
40#define jffs2_acl_chmod(inode) (0) 40#define jffs2_acl_chmod(inode) (0)
41#define jffs2_init_acl_pre(dir_i,inode,mode) (0) 41#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
42#define jffs2_init_acl_post(inode) (0) 42#define jffs2_init_acl_post(inode) (0)
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 6f60cc910f4c..7aa4417e085f 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations =
55 .rmdir = jffs2_rmdir, 55 .rmdir = jffs2_rmdir,
56 .mknod = jffs2_mknod, 56 .mknod = jffs2_mknod,
57 .rename = jffs2_rename, 57 .rename = jffs2_rename,
58 .permission = jffs2_permission, 58 .check_acl = jffs2_check_acl,
59 .setattr = jffs2_setattr, 59 .setattr = jffs2_setattr,
60 .setxattr = jffs2_setxattr, 60 .setxattr = jffs2_setxattr,
61 .getxattr = jffs2_getxattr, 61 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 23c947539864..b7b74e299142 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations =
56 56
57const struct inode_operations jffs2_file_inode_operations = 57const struct inode_operations jffs2_file_inode_operations =
58{ 58{
59 .permission = jffs2_permission, 59 .check_acl = jffs2_check_acl,
60 .setattr = jffs2_setattr, 60 .setattr = jffs2_setattr,
61 .setxattr = jffs2_setxattr, 61 .setxattr = jffs2_setxattr,
62 .getxattr = jffs2_getxattr, 62 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index b7339c3b6ad9..4ec11e8bda8c 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations =
21{ 21{
22 .readlink = generic_readlink, 22 .readlink = generic_readlink,
23 .follow_link = jffs2_follow_link, 23 .follow_link = jffs2_follow_link,
24 .permission = jffs2_permission, 24 .check_acl = jffs2_check_acl,
25 .setattr = jffs2_setattr, 25 .setattr = jffs2_setattr,
26 .setxattr = jffs2_setxattr, 26 .setxattr = jffs2_setxattr,
27 .getxattr = jffs2_getxattr, 27 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index d9a721e6db70..5ef7bac265e5 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
1268 if (!c->wbuf) 1268 if (!c->wbuf)
1269 return -ENOMEM; 1269 return -ENOMEM;
1270 1270
1271#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
1272 c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
1273 if (!c->wbuf_verify) {
1274 kfree(c->wbuf);
1275 return -ENOMEM;
1276 }
1277#endif
1271 return 0; 1278 return 0;
1272} 1279}
1273 1280
1274void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { 1281void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
1282#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
1283 kfree(c->wbuf_verify);
1284#endif
1275 kfree(c->wbuf); 1285 kfree(c->wbuf);
1276} 1286}
1277 1287
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a29c7c3e3fb8..d66477c34306 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,7 +114,7 @@ out:
114 return rc; 114 return rc;
115} 115}
116 116
117static int jfs_check_acl(struct inode *inode, int mask) 117int jfs_check_acl(struct inode *inode, int mask)
118{ 118{
119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); 119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
120 120
@@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask)
129 return -EAGAIN; 129 return -EAGAIN;
130} 130}
131 131
132int jfs_permission(struct inode *inode, int mask)
133{
134 return generic_permission(inode, mask, jfs_check_acl);
135}
136
137int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) 132int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
138{ 133{
139 struct posix_acl *acl = NULL; 134 struct posix_acl *acl = NULL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f6063acaa3b..2b70fa78e4a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = {
96 .removexattr = jfs_removexattr, 96 .removexattr = jfs_removexattr,
97#ifdef CONFIG_JFS_POSIX_ACL 97#ifdef CONFIG_JFS_POSIX_ACL
98 .setattr = jfs_setattr, 98 .setattr = jfs_setattr,
99 .permission = jfs_permission, 99 .check_acl = jfs_check_acl,
100#endif 100#endif
101}; 101};
102 102
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 88475f10a389..b07bd417ef85 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
20 20
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23int jfs_permission(struct inode *, int); 23int jfs_check_acl(struct inode *, int);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_setattr(struct dentry *, struct iattr *); 25int jfs_setattr(struct dentry *, struct iattr *);
26 26
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 514ee2edb92a..c79a4270f083 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = {
1543 .removexattr = jfs_removexattr, 1543 .removexattr = jfs_removexattr,
1544#ifdef CONFIG_JFS_POSIX_ACL 1544#ifdef CONFIG_JFS_POSIX_ACL
1545 .setattr = jfs_setattr, 1545 .setattr = jfs_setattr,
1546 .permission = jfs_permission, 1546 .check_acl = jfs_check_acl,
1547#endif 1547#endif
1548}; 1548};
1549 1549
diff --git a/fs/libfs.c b/fs/libfs.c
index ddfa89948c3f..dcec3d3ea64f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
217 return PTR_ERR(s); 217 return PTR_ERR(s);
218 218
219 s->s_flags = MS_NOUSER; 219 s->s_flags = MS_NOUSER;
220 s->s_maxbytes = ~0ULL; 220 s->s_maxbytes = MAX_LFS_FILESIZE;
221 s->s_blocksize = PAGE_SIZE; 221 s->s_blocksize = PAGE_SIZE;
222 s->s_blocksize_bits = PAGE_SHIFT; 222 s->s_blocksize_bits = PAGE_SHIFT;
223 s->s_magic = magic; 223 s->s_magic = magic;
diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..52366e877d76 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
1591 if (can_sleep) 1591 if (can_sleep)
1592 lock->fl_flags |= FL_SLEEP; 1592 lock->fl_flags |= FL_SLEEP;
1593 1593
1594 error = security_file_lock(filp, cmd); 1594 error = security_file_lock(filp, lock->fl_type);
1595 if (error) 1595 if (error)
1596 goto out_free; 1596 goto out_free;
1597 1597
diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895a..d11f404667e9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,19 +169,10 @@ void putname(const char *name)
169EXPORT_SYMBOL(putname); 169EXPORT_SYMBOL(putname);
170#endif 170#endif
171 171
172 172/*
173/** 173 * This does basic POSIX ACL permission checking
174 * generic_permission - check for access rights on a Posix-like filesystem
175 * @inode: inode to check access rights for
176 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
177 * @check_acl: optional callback to check for Posix ACLs
178 *
179 * Used to check for read/write/execute permissions on a file.
180 * We use "fsuid" for this, letting us set arbitrary permissions
181 * for filesystem access without changing the "normal" uids which
182 * are used for other things..
183 */ 174 */
184int generic_permission(struct inode *inode, int mask, 175static int acl_permission_check(struct inode *inode, int mask,
185 int (*check_acl)(struct inode *inode, int mask)) 176 int (*check_acl)(struct inode *inode, int mask))
186{ 177{
187 umode_t mode = inode->i_mode; 178 umode_t mode = inode->i_mode;
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask,
193 else { 184 else {
194 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 185 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
195 int error = check_acl(inode, mask); 186 int error = check_acl(inode, mask);
196 if (error == -EACCES) 187 if (error != -EAGAIN)
197 goto check_capabilities;
198 else if (error != -EAGAIN)
199 return error; 188 return error;
200 } 189 }
201 190
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask,
208 */ 197 */
209 if ((mask & ~mode) == 0) 198 if ((mask & ~mode) == 0)
210 return 0; 199 return 0;
200 return -EACCES;
201}
202
203/**
204 * generic_permission - check for access rights on a Posix-like filesystem
205 * @inode: inode to check access rights for
206 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
207 * @check_acl: optional callback to check for Posix ACLs
208 *
209 * Used to check for read/write/execute permissions on a file.
210 * We use "fsuid" for this, letting us set arbitrary permissions
211 * for filesystem access without changing the "normal" uids which
212 * are used for other things..
213 */
214int generic_permission(struct inode *inode, int mask,
215 int (*check_acl)(struct inode *inode, int mask))
216{
217 int ret;
218
219 /*
220 * Do the basic POSIX ACL permission checks.
221 */
222 ret = acl_permission_check(inode, mask, check_acl);
223 if (ret != -EACCES)
224 return ret;
211 225
212 check_capabilities:
213 /* 226 /*
214 * Read/write DACs are always overridable. 227 * Read/write DACs are always overridable.
215 * Executable DACs are overridable if at least one exec bit is set. 228 * Executable DACs are overridable if at least one exec bit is set.
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask)
262 if (inode->i_op->permission) 275 if (inode->i_op->permission)
263 retval = inode->i_op->permission(inode, mask); 276 retval = inode->i_op->permission(inode, mask);
264 else 277 else
265 retval = generic_permission(inode, mask, NULL); 278 retval = generic_permission(inode, mask, inode->i_op->check_acl);
266 279
267 if (retval) 280 if (retval)
268 return retval; 281 return retval;
@@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
432 */ 445 */
433static int exec_permission_lite(struct inode *inode) 446static int exec_permission_lite(struct inode *inode)
434{ 447{
435 umode_t mode = inode->i_mode; 448 int ret;
436 449
437 if (inode->i_op->permission) 450 if (inode->i_op->permission) {
438 return -EAGAIN; 451 ret = inode->i_op->permission(inode, MAY_EXEC);
439 452 if (!ret)
440 if (current_fsuid() == inode->i_uid) 453 goto ok;
441 mode >>= 6; 454 return ret;
442 else if (in_group_p(inode->i_gid)) 455 }
443 mode >>= 3; 456 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
444 457 if (!ret)
445 if (mode & MAY_EXEC)
446 goto ok;
447
448 if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
449 goto ok;
450
451 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
452 goto ok; 458 goto ok;
453 459
454 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) 460 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
455 goto ok; 461 goto ok;
456 462
457 return -EACCES; 463 return ret;
458ok: 464ok:
459 return security_inode_permission(inode, MAY_EXEC); 465 return security_inode_permission(inode, MAY_EXEC);
460} 466}
@@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
853 859
854 nd->flags |= LOOKUP_CONTINUE; 860 nd->flags |= LOOKUP_CONTINUE;
855 err = exec_permission_lite(inode); 861 err = exec_permission_lite(inode);
856 if (err == -EAGAIN)
857 err = inode_permission(nd->path.dentry->d_inode,
858 MAY_EXEC);
859 if (!err)
860 err = ima_path_check(&nd->path, MAY_EXEC,
861 IMA_COUNT_UPDATE);
862 if (err) 862 if (err)
863 break; 863 break;
864 864
@@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag)
1533 if (error) 1533 if (error)
1534 return error; 1534 return error;
1535 1535
1536 error = ima_path_check(path, 1536 error = ima_path_check(path, acc_mode ?
1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), 1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1538 ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
1538 IMA_COUNT_UPDATE); 1539 IMA_COUNT_UPDATE);
1540
1539 if (error) 1541 if (error)
1540 return error; 1542 return error;
1541 /* 1543 /*
1542 * An append-only file must be opened in append mode for writing. 1544 * An append-only file must be opened in append mode for writing.
1543 */ 1545 */
1544 if (IS_APPEND(inode)) { 1546 if (IS_APPEND(inode)) {
1547 error = -EPERM;
1545 if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) 1548 if ((flag & FMODE_WRITE) && !(flag & O_APPEND))
1546 return -EPERM; 1549 goto err_out;
1547 if (flag & O_TRUNC) 1550 if (flag & O_TRUNC)
1548 return -EPERM; 1551 goto err_out;
1549 } 1552 }
1550 1553
1551 /* O_NOATIME can only be set by the owner or superuser */ 1554 /* O_NOATIME can only be set by the owner or superuser */
1552 if (flag & O_NOATIME) 1555 if (flag & O_NOATIME)
1553 if (!is_owner_or_cap(inode)) 1556 if (!is_owner_or_cap(inode)) {
1554 return -EPERM; 1557 error = -EPERM;
1558 goto err_out;
1559 }
1555 1560
1556 /* 1561 /*
1557 * Ensure there are no outstanding leases on the file. 1562 * Ensure there are no outstanding leases on the file.
1558 */ 1563 */
1559 error = break_lease(inode, flag); 1564 error = break_lease(inode, flag);
1560 if (error) 1565 if (error)
1561 return error; 1566 goto err_out;
1562 1567
1563 if (flag & O_TRUNC) { 1568 if (flag & O_TRUNC) {
1564 error = get_write_access(inode); 1569 error = get_write_access(inode);
1565 if (error) 1570 if (error)
1566 return error; 1571 goto err_out;
1567 1572
1568 /* 1573 /*
1569 * Refuse to truncate files with mandatory locks held on them. 1574 * Refuse to truncate files with mandatory locks held on them.
@@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag)
1581 } 1586 }
1582 put_write_access(inode); 1587 put_write_access(inode);
1583 if (error) 1588 if (error)
1584 return error; 1589 goto err_out;
1585 } else 1590 } else
1586 if (flag & FMODE_WRITE) 1591 if (flag & FMODE_WRITE)
1587 vfs_dq_init(inode); 1592 vfs_dq_init(inode);
1588 1593
1589 return 0; 1594 return 0;
1595err_out:
1596 ima_counts_put(path, acc_mode ?
1597 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1598 ACC_MODE(flag) & (MAY_READ | MAY_WRITE));
1599 return error;
1590} 1600}
1591 1601
1592/* 1602/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d36925f9b952..e350bd6a2334 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -882,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
882 server->rsize = NFS_MAX_FILE_IO_SIZE; 882 server->rsize = NFS_MAX_FILE_IO_SIZE;
883 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 883 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
884 884
885 server->backing_dev_info.name = "nfs";
885 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; 886 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
886 887
887 if (server->wsize > max_rpc_payload) 888 if (server->wsize > max_rpc_payload)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 65ca8c18476f..1434080aefeb 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1250,8 +1250,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
1250 continue; 1250 continue;
1251 } 1251 }
1252 /* Initialize or reset the session */ 1252 /* Initialize or reset the session */
1253 if (nfs4_has_session(clp) && 1253 if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)
1254 test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { 1254 && nfs4_has_session(clp)) {
1255 if (clp->cl_cons_state == NFS_CS_SESSION_INITING) 1255 if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
1256 status = nfs4_initialize_session(clp); 1256 status = nfs4_initialize_session(clp);
1257 else 1257 else
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 5573508f707f..36fcabbf5186 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
34 int flags = nfsexp_flags(rqstp, exp); 34 int flags = nfsexp_flags(rqstp, exp);
35 int ret; 35 int ret;
36 36
37 validate_process_creds();
38
37 /* discard any old override before preparing the new set */ 39 /* discard any old override before preparing the new set */
38 revert_creds(get_cred(current->real_cred)); 40 revert_creds(get_cred(current->real_cred));
39 new = prepare_creds(); 41 new = prepare_creds();
@@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
86 else 88 else
87 new->cap_effective = cap_raise_nfsd_set(new->cap_effective, 89 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
88 new->cap_permitted); 90 new->cap_permitted);
91 validate_process_creds();
89 put_cred(override_creds(new)); 92 put_cred(override_creds(new));
90 put_cred(new); 93 put_cred(new);
94 validate_process_creds();
91 return 0; 95 return 0;
92 96
93oom: 97oom:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 492c79b7800b..24d58adfe5fd 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -496,7 +496,9 @@ nfsd(void *vrqstp)
496 /* Lock the export hash tables for reading. */ 496 /* Lock the export hash tables for reading. */
497 exp_readlock(); 497 exp_readlock();
498 498
499 validate_process_creds();
499 svc_process(rqstp); 500 svc_process(rqstp);
501 validate_process_creds();
500 502
501 /* Unlock export hash tables */ 503 /* Unlock export hash tables */
502 exp_readunlock(); 504 exp_readunlock();
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23341c1063bc..8fa09bfbcba7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
684 __be32 err; 684 __be32 err;
685 int host_err; 685 int host_err;
686 686
687 validate_process_creds();
688
687 /* 689 /*
688 * If we get here, then the client has already done an "open", 690 * If we get here, then the client has already done an "open",
689 * and (hopefully) checked permission - so allow OWNER_OVERRIDE 691 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
740out_nfserr: 742out_nfserr:
741 err = nfserrno(host_err); 743 err = nfserrno(host_err);
742out: 744out:
745 validate_process_creds();
743 return err; 746 return err;
744} 747}
745 748
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 7e0b61be212e..c668bca579c1 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
209 * We cannot call radix_tree_preload for the kernels older 209 * We cannot call radix_tree_preload for the kernels older
210 * than 2.6.23, because it is not exported for modules. 210 * than 2.6.23, because it is not exported for modules.
211 */ 211 */
212retry:
212 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 213 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
213 if (err) 214 if (err)
214 goto failed_unlock; 215 goto failed_unlock;
@@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
219 (unsigned long long)oldkey, 220 (unsigned long long)oldkey,
220 (unsigned long long)newkey); 221 (unsigned long long)newkey);
221 222
222retry:
223 spin_lock_irq(&btnc->tree_lock); 223 spin_lock_irq(&btnc->tree_lock);
224 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); 224 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
225 spin_unlock_irq(&btnc->tree_lock); 225 spin_unlock_irq(&btnc->tree_lock);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8e2ec43b18f4..151964f0de4c 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -416,8 +416,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
416 if (unlikely(err)) 416 if (unlikely(err))
417 goto failed; 417 goto failed;
418 418
419 down_read(&nilfs->ns_segctor_sem);
419 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 420 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
420 &bh_cp); 421 &bh_cp);
422 up_read(&nilfs->ns_segctor_sem);
421 if (unlikely(err)) { 423 if (unlikely(err)) {
422 if (err == -ENOENT || err == -EINVAL) { 424 if (err == -ENOENT || err == -EINVAL) {
423 printk(KERN_ERR 425 printk(KERN_ERR
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e8adbffc626f..1b9caafb8662 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -253,7 +253,7 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
253 253
254static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) 254static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
255{ 255{
256 if (!atomic_dec_and_test(&sbi->s_count)) 256 if (atomic_dec_and_test(&sbi->s_count))
257 kfree(sbi); 257 kfree(sbi);
258} 258}
259 259
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 47cd258fd24d..c9ee67b442e1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
62 event_priv->wd = wd; 62 event_priv->wd = wd;
63 63
64 ret = fsnotify_add_notify_event(group, event, fsn_event_priv); 64 ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
65 /* EEXIST is not an error */ 65 if (ret) {
66 if (ret == -EEXIST)
67 ret = 0;
68
69 /* did event_priv get attached? */
70 if (list_empty(&fsn_event_priv->event_list))
71 inotify_free_event_priv(fsn_event_priv); 66 inotify_free_event_priv(fsn_event_priv);
67 /* EEXIST says we tail matched, EOVERFLOW isn't something
68 * to report up the stack. */
69 if ((ret == -EEXIST) ||
70 (ret == -EOVERFLOW))
71 ret = 0;
72 }
72 73
73 /* 74 /*
74 * If we hold the entry until after the event is on the queue 75 * If we hold the entry until after the event is on the queue
@@ -104,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
104 return send; 105 return send;
105} 106}
106 107
108/*
109 * This is NEVER supposed to be called. Inotify marks should either have been
110 * removed from the idr when the watch was removed or in the
111 * fsnotify_destroy_mark_by_group() call when the inotify instance was being
112 * torn down. This is only called if the idr is about to be freed but there
113 * are still marks in it.
114 */
107static int idr_callback(int id, void *p, void *data) 115static int idr_callback(int id, void *p, void *data)
108{ 116{
109 BUG(); 117 struct fsnotify_mark_entry *entry;
118 struct inotify_inode_mark_entry *ientry;
119 static bool warned = false;
120
121 if (warned)
122 return 0;
123
124 warned = false;
125 entry = p;
126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
127
128 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in "
129 "idr. Probably leaking memory\n", id, p, data);
130
131 /*
132 * I'm taking the liberty of assuming that the mark in question is a
133 * valid address and I'm dereferencing it. This might help to figure
134 * out why we got here and the panic is no worse than the original
135 * BUG() that was here.
136 */
137 if (entry)
138 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
139 entry->group, entry->inode, ientry->wd);
110 return 0; 140 return 0;
111} 141}
112 142
113static void inotify_free_group_priv(struct fsnotify_group *group) 143static void inotify_free_group_priv(struct fsnotify_group *group)
114{ 144{
115 /* ideally the idr is empty and we won't hit the BUG in teh callback */ 145 /* ideally the idr is empty and we won't hit the BUG in teh callback */
116 idr_for_each(&group->inotify_data.idr, idr_callback, NULL); 146 idr_for_each(&group->inotify_data.idr, idr_callback, group);
117 idr_remove_all(&group->inotify_data.idr); 147 idr_remove_all(&group->inotify_data.idr);
118 idr_destroy(&group->inotify_data.idr); 148 idr_destroy(&group->inotify_data.idr);
119} 149}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index f30d9bbc2e1b..dcd2040d330c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -47,9 +47,6 @@
47 47
48static struct vfsmount *inotify_mnt __read_mostly; 48static struct vfsmount *inotify_mnt __read_mostly;
49 49
50/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */
51static struct inotify_event nul_inotify_event;
52
53/* these are configurable via /proc/sys/fs/inotify/ */ 50/* these are configurable via /proc/sys/fs/inotify/ */
54static int inotify_max_user_instances __read_mostly; 51static int inotify_max_user_instances __read_mostly;
55static int inotify_max_queued_events __read_mostly; 52static int inotify_max_queued_events __read_mostly;
@@ -157,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
157 154
158 event = fsnotify_peek_notify_event(group); 155 event = fsnotify_peek_notify_event(group);
159 156
160 event_size += roundup(event->name_len, event_size); 157 if (event->name_len)
158 event_size += roundup(event->name_len + 1, event_size);
161 159
162 if (event_size > count) 160 if (event_size > count)
163 return ERR_PTR(-EINVAL); 161 return ERR_PTR(-EINVAL);
@@ -183,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
183 struct fsnotify_event_private_data *fsn_priv; 181 struct fsnotify_event_private_data *fsn_priv;
184 struct inotify_event_private_data *priv; 182 struct inotify_event_private_data *priv;
185 size_t event_size = sizeof(struct inotify_event); 183 size_t event_size = sizeof(struct inotify_event);
186 size_t name_len; 184 size_t name_len = 0;
187 185
188 /* we get the inotify watch descriptor from the event private data */ 186 /* we get the inotify watch descriptor from the event private data */
189 spin_lock(&event->lock); 187 spin_lock(&event->lock);
@@ -199,8 +197,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
199 inotify_free_event_priv(fsn_priv); 197 inotify_free_event_priv(fsn_priv);
200 } 198 }
201 199
202 /* round up event->name_len so it is a multiple of event_size */ 200 /*
203 name_len = roundup(event->name_len, event_size); 201 * round up event->name_len so it is a multiple of event_size
202 * plus an extra byte for the terminating '\0'.
203 */
204 if (event->name_len)
205 name_len = roundup(event->name_len + 1, event_size);
204 inotify_event.len = name_len; 206 inotify_event.len = name_len;
205 207
206 inotify_event.mask = inotify_mask_to_arg(event->mask); 208 inotify_event.mask = inotify_mask_to_arg(event->mask);
@@ -224,8 +226,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
224 return -EFAULT; 226 return -EFAULT;
225 buf += event->name_len; 227 buf += event->name_len;
226 228
227 /* fill userspace with 0's from nul_inotify_event */ 229 /* fill userspace with 0's */
228 if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) 230 if (clear_user(buf, len_to_zero))
229 return -EFAULT; 231 return -EFAULT;
230 buf += len_to_zero; 232 buf += len_to_zero;
231 event_size += name_len; 233 event_size += name_len;
@@ -326,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
326 list_for_each_entry(holder, &group->notification_list, event_list) { 328 list_for_each_entry(holder, &group->notification_list, event_list) {
327 event = holder->event; 329 event = holder->event;
328 send_len += sizeof(struct inotify_event); 330 send_len += sizeof(struct inotify_event);
329 send_len += roundup(event->name_len, 331 if (event->name_len)
330 sizeof(struct inotify_event)); 332 send_len += roundup(event->name_len + 1,
333 sizeof(struct inotify_event));
331 } 334 }
332 mutex_unlock(&group->notification_mutex); 335 mutex_unlock(&group->notification_mutex);
333 ret = put_user(send_len, (int __user *) p); 336 ret = put_user(send_len, (int __user *) p);
@@ -364,20 +367,53 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
364 return error; 367 return error;
365} 368}
366 369
370/*
371 * Remove the mark from the idr (if present) and drop the reference
372 * on the mark because it was in the idr.
373 */
367static void inotify_remove_from_idr(struct fsnotify_group *group, 374static void inotify_remove_from_idr(struct fsnotify_group *group,
368 struct inotify_inode_mark_entry *ientry) 375 struct inotify_inode_mark_entry *ientry)
369{ 376{
370 struct idr *idr; 377 struct idr *idr;
378 struct fsnotify_mark_entry *entry;
379 struct inotify_inode_mark_entry *found_ientry;
380 int wd;
371 381
372 spin_lock(&group->inotify_data.idr_lock); 382 spin_lock(&group->inotify_data.idr_lock);
373 idr = &group->inotify_data.idr; 383 idr = &group->inotify_data.idr;
374 idr_remove(idr, ientry->wd); 384 wd = ientry->wd;
375 spin_unlock(&group->inotify_data.idr_lock); 385
386 if (wd == -1)
387 goto out;
388
389 entry = idr_find(&group->inotify_data.idr, wd);
390 if (unlikely(!entry))
391 goto out;
392
393 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
394 if (unlikely(found_ientry != ientry)) {
395 /* We found an entry in the idr with the right wd, but it's
396 * not the entry we were told to remove. eparis seriously
397 * fucked up somewhere. */
398 WARN_ON(1);
399 ientry->wd = -1;
400 goto out;
401 }
402
403 /* One ref for being in the idr, one ref held by the caller */
404 BUG_ON(atomic_read(&entry->refcnt) < 2);
405
406 idr_remove(idr, wd);
376 ientry->wd = -1; 407 ientry->wd = -1;
408
409 /* removed from the idr, drop that ref */
410 fsnotify_put_mark(entry);
411out:
412 spin_unlock(&group->inotify_data.idr_lock);
377} 413}
414
378/* 415/*
379 * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the 416 * Send IN_IGNORED for this wd, remove this wd from the idr.
380 * internal reference help on the mark because it is in the idr.
381 */ 417 */
382void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 418void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
383 struct fsnotify_group *group) 419 struct fsnotify_group *group)
@@ -386,6 +422,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
386 struct fsnotify_event *ignored_event; 422 struct fsnotify_event *ignored_event;
387 struct inotify_event_private_data *event_priv; 423 struct inotify_event_private_data *event_priv;
388 struct fsnotify_event_private_data *fsn_event_priv; 424 struct fsnotify_event_private_data *fsn_event_priv;
425 int ret;
389 426
390 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, 427 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
391 FSNOTIFY_EVENT_NONE, NULL, 0, 428 FSNOTIFY_EVENT_NONE, NULL, 0,
@@ -404,10 +441,8 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
404 fsn_event_priv->group = group; 441 fsn_event_priv->group = group;
405 event_priv->wd = ientry->wd; 442 event_priv->wd = ientry->wd;
406 443
407 fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); 444 ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
408 445 if (ret)
409 /* did the private data get added? */
410 if (list_empty(&fsn_event_priv->event_list))
411 inotify_free_event_priv(fsn_event_priv); 446 inotify_free_event_priv(fsn_event_priv);
412 447
413skip_send_ignore: 448skip_send_ignore:
@@ -418,9 +453,6 @@ skip_send_ignore:
418 /* remove this entry from the idr */ 453 /* remove this entry from the idr */
419 inotify_remove_from_idr(group, ientry); 454 inotify_remove_from_idr(group, ientry);
420 455
421 /* removed from idr, drop that reference */
422 fsnotify_put_mark(entry);
423
424 atomic_dec(&group->inotify_data.user->inotify_watches); 456 atomic_dec(&group->inotify_data.user->inotify_watches);
425} 457}
426 458
@@ -432,80 +464,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry)
432 kmem_cache_free(inotify_inode_mark_cachep, ientry); 464 kmem_cache_free(inotify_inode_mark_cachep, ientry);
433} 465}
434 466
435static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) 467static int inotify_update_existing_watch(struct fsnotify_group *group,
468 struct inode *inode,
469 u32 arg)
436{ 470{
437 struct fsnotify_mark_entry *entry = NULL; 471 struct fsnotify_mark_entry *entry;
438 struct inotify_inode_mark_entry *ientry; 472 struct inotify_inode_mark_entry *ientry;
439 struct inotify_inode_mark_entry *tmp_ientry;
440 int ret = 0;
441 int add = (arg & IN_MASK_ADD);
442 __u32 mask;
443 __u32 old_mask, new_mask; 473 __u32 old_mask, new_mask;
474 __u32 mask;
475 int add = (arg & IN_MASK_ADD);
476 int ret;
444 477
445 /* don't allow invalid bits: we don't want flags set */ 478 /* don't allow invalid bits: we don't want flags set */
446 mask = inotify_arg_to_mask(arg); 479 mask = inotify_arg_to_mask(arg);
447 if (unlikely(!mask)) 480 if (unlikely(!mask))
448 return -EINVAL; 481 return -EINVAL;
449 482
450 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
451 if (unlikely(!tmp_ientry))
452 return -ENOMEM;
453 /* we set the mask at the end after attaching it */
454 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
455 tmp_ientry->wd = -1;
456
457find_entry:
458 spin_lock(&inode->i_lock); 483 spin_lock(&inode->i_lock);
459 entry = fsnotify_find_mark_entry(group, inode); 484 entry = fsnotify_find_mark_entry(group, inode);
460 spin_unlock(&inode->i_lock); 485 spin_unlock(&inode->i_lock);
461 if (entry) { 486 if (!entry)
462 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 487 return -ENOENT;
463 } else {
464 ret = -ENOSPC;
465 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
466 goto out_err;
467retry:
468 ret = -ENOMEM;
469 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
470 goto out_err;
471
472 spin_lock(&group->inotify_data.idr_lock);
473 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
474 group->inotify_data.last_wd,
475 &tmp_ientry->wd);
476 spin_unlock(&group->inotify_data.idr_lock);
477 if (ret) {
478 if (ret == -EAGAIN)
479 goto retry;
480 goto out_err;
481 }
482 488
483 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 489 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
484 if (ret) {
485 inotify_remove_from_idr(group, tmp_ientry);
486 if (ret == -EEXIST)
487 goto find_entry;
488 goto out_err;
489 }
490
491 /* tmp_ientry has been added to the inode, so we are all set up.
492 * now we just need to make sure tmp_ientry doesn't get freed and
493 * we need to set up entry and ientry so the generic code can
494 * do its thing. */
495 ientry = tmp_ientry;
496 entry = &ientry->fsn_entry;
497 tmp_ientry = NULL;
498
499 atomic_inc(&group->inotify_data.user->inotify_watches);
500
501 /* update the idr hint */
502 group->inotify_data.last_wd = ientry->wd;
503
504 /* we put the mark on the idr, take a reference */
505 fsnotify_get_mark(entry);
506 }
507
508 ret = ientry->wd;
509 490
510 spin_lock(&entry->lock); 491 spin_lock(&entry->lock);
511 492
@@ -537,18 +518,107 @@ retry:
537 fsnotify_recalc_group_mask(group); 518 fsnotify_recalc_group_mask(group);
538 } 519 }
539 520
540 /* this either matches fsnotify_find_mark_entry, or init_mark_entry 521 /* return the wd */
541 * depending on which path we took... */ 522 ret = ientry->wd;
523
524 /* match the get from fsnotify_find_mark_entry() */
542 fsnotify_put_mark(entry); 525 fsnotify_put_mark(entry);
543 526
527 return ret;
528}
529
530static int inotify_new_watch(struct fsnotify_group *group,
531 struct inode *inode,
532 u32 arg)
533{
534 struct inotify_inode_mark_entry *tmp_ientry;
535 __u32 mask;
536 int ret;
537
538 /* don't allow invalid bits: we don't want flags set */
539 mask = inotify_arg_to_mask(arg);
540 if (unlikely(!mask))
541 return -EINVAL;
542
543 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
544 if (unlikely(!tmp_ientry))
545 return -ENOMEM;
546
547 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
548 tmp_ientry->fsn_entry.mask = mask;
549 tmp_ientry->wd = -1;
550
551 ret = -ENOSPC;
552 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
553 goto out_err;
554retry:
555 ret = -ENOMEM;
556 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
557 goto out_err;
558
559 spin_lock(&group->inotify_data.idr_lock);
560 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
561 group->inotify_data.last_wd,
562 &tmp_ientry->wd);
563 spin_unlock(&group->inotify_data.idr_lock);
564 if (ret) {
565 /* idr was out of memory allocate and try again */
566 if (ret == -EAGAIN)
567 goto retry;
568 goto out_err;
569 }
570
571 /* we put the mark on the idr, take a reference */
572 fsnotify_get_mark(&tmp_ientry->fsn_entry);
573
574 /* we are on the idr, now get on the inode */
575 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
576 if (ret) {
577 /* we failed to get on the inode, get off the idr */
578 inotify_remove_from_idr(group, tmp_ientry);
579 goto out_err;
580 }
581
582 /* update the idr hint, who cares about races, it's just a hint */
583 group->inotify_data.last_wd = tmp_ientry->wd;
584
585 /* increment the number of watches the user has */
586 atomic_inc(&group->inotify_data.user->inotify_watches);
587
588 /* return the watch descriptor for this new entry */
589 ret = tmp_ientry->wd;
590
591 /* match the ref from fsnotify_init_markentry() */
592 fsnotify_put_mark(&tmp_ientry->fsn_entry);
593
594 /* if this mark added a new event update the group mask */
595 if (mask & ~group->mask)
596 fsnotify_recalc_group_mask(group);
597
544out_err: 598out_err:
545 /* could be an error, could be that we found an existing mark */ 599 if (ret < 0)
546 if (tmp_ientry) {
547 /* on the idr but didn't make it on the inode */
548 if (tmp_ientry->wd != -1)
549 inotify_remove_from_idr(group, tmp_ientry);
550 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); 600 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
551 } 601
602 return ret;
603}
604
605static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
606{
607 int ret = 0;
608
609retry:
610 /* try to update and existing watch with the new arg */
611 ret = inotify_update_existing_watch(group, inode, arg);
612 /* no mark present, try to add a new one */
613 if (ret == -ENOENT)
614 ret = inotify_new_watch(group, inode, arg);
615 /*
616 * inotify_new_watch could race with another thread which did an
617 * inotify_new_watch between the update_existing and the add watch
618 * here, go back and try to update an existing mark again.
619 */
620 if (ret == -EEXIST)
621 goto retry;
552 622
553 return ret; 623 return ret;
554} 624}
@@ -568,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
568 638
569 spin_lock_init(&group->inotify_data.idr_lock); 639 spin_lock_init(&group->inotify_data.idr_lock);
570 idr_init(&group->inotify_data.idr); 640 idr_init(&group->inotify_data.idr);
571 group->inotify_data.last_wd = 0; 641 group->inotify_data.last_wd = 1;
572 group->inotify_data.user = user; 642 group->inotify_data.user = user;
573 group->inotify_data.fa = NULL; 643 group->inotify_data.fa = NULL;
574 644
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 521368574e97..3816d5750dd5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -153,6 +153,10 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
153 return true; 153 return true;
154 break; 154 break;
155 case (FSNOTIFY_EVENT_NONE): 155 case (FSNOTIFY_EVENT_NONE):
156 if (old->mask & FS_Q_OVERFLOW)
157 return true;
158 else if (old->mask & FS_IN_IGNORED)
159 return false;
156 return false; 160 return false;
157 }; 161 };
158 } 162 }
@@ -171,9 +175,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
171 struct list_head *list = &group->notification_list; 175 struct list_head *list = &group->notification_list;
172 struct fsnotify_event_holder *last_holder; 176 struct fsnotify_event_holder *last_holder;
173 struct fsnotify_event *last_event; 177 struct fsnotify_event *last_event;
174 178 int ret = 0;
175 /* easy to tell if priv was attached to the event */
176 INIT_LIST_HEAD(&priv->event_list);
177 179
178 /* 180 /*
179 * There is one fsnotify_event_holder embedded inside each fsnotify_event. 181 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -194,6 +196,7 @@ alloc_holder:
194 196
195 if (group->q_len >= group->max_events) { 197 if (group->q_len >= group->max_events) {
196 event = &q_overflow_event; 198 event = &q_overflow_event;
199 ret = -EOVERFLOW;
197 /* sorry, no private data on the overflow event */ 200 /* sorry, no private data on the overflow event */
198 priv = NULL; 201 priv = NULL;
199 } 202 }
@@ -235,7 +238,7 @@ alloc_holder:
235 mutex_unlock(&group->notification_mutex); 238 mutex_unlock(&group->notification_mutex);
236 239
237 wake_up(&group->notification_waitq); 240 wake_up(&group->notification_waitq);
238 return 0; 241 return ret;
239} 242}
240 243
241/* 244/*
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f9a3e8942669..ab513ddaeff2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6851,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6851 } 6851 }
6852 status = 0; 6852 status = 0;
6853bail: 6853bail:
6854 6854 brelse(last_eb_bh);
6855 mlog_exit(status); 6855 mlog_exit(status);
6856 return status; 6856 return status;
6857} 6857}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b401654011a2..8a1e61545f41 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1747,8 +1747,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1747 * we know zeros will only be needed in the first and/or last cluster. 1747 * we know zeros will only be needed in the first and/or last cluster.
1748 */ 1748 */
1749 if (clusters_to_alloc || extents_to_split || 1749 if (clusters_to_alloc || extents_to_split ||
1750 wc->w_desc[0].c_needs_zero || 1750 (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
1751 wc->w_desc[wc->w_clen - 1].c_needs_zero) 1751 wc->w_desc[wc->w_clen - 1].c_needs_zero)))
1752 cluster_of_pages = 1; 1752 cluster_of_pages = 1;
1753 else 1753 else
1754 cluster_of_pages = 0; 1754 cluster_of_pages = 0;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 2f28b7de2c8d..b4957c7d9fe2 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
85 goto bail; 85 goto bail;
86 } 86 }
87 87
88 /*
89 * If the last lookup failed to create dentry lock, let us
90 * redo it.
91 */
92 if (!dentry->d_fsdata) {
93 mlog(0, "Inode %llu doesn't have dentry lock, "
94 "returning false\n",
95 (unsigned long long)OCFS2_I(inode)->ip_blkno);
96 goto bail;
97 }
98
88 ret = 1; 99 ret = 1;
89 100
90bail: 101bail:
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
325} 325}
326 326
327static struct backing_dev_info dlmfs_backing_dev_info = { 327static struct backing_dev_info dlmfs_backing_dev_info = {
328 .name = "ocfs2-dlmfs",
328 .ra_pages = 0, /* No readahead */ 329 .ra_pages = 0, /* No readahead */
329 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 330 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
330}; 331};
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index fcf879ed6930..756f5b0998e0 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
122 * that still has AST's pending... */ 122 * that still has AST's pending... */
123 in_use = !list_empty(&lock->ast_list); 123 in_use = !list_empty(&lock->ast_list);
124 spin_unlock(&dlm->ast_lock); 124 spin_unlock(&dlm->ast_lock);
125 if (in_use) { 125 if (in_use && !(flags & LKM_CANCEL)) {
126 mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " 126 mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
127 "while waiting for an ast!", res->lockname.len, 127 "while waiting for an ast!", res->lockname.len,
128 res->lockname.name); 128 res->lockname.name);
@@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
131 131
132 spin_lock(&res->spinlock); 132 spin_lock(&res->spinlock);
133 if (res->state & DLM_LOCK_RES_IN_PROGRESS) { 133 if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
134 if (master_node) { 134 if (master_node && !(flags & LKM_CANCEL)) {
135 mlog(ML_ERROR, "lockres in progress!\n"); 135 mlog(ML_ERROR, "lockres in progress!\n");
136 spin_unlock(&res->spinlock); 136 spin_unlock(&res->spinlock);
137 return DLM_FORWARD; 137 return DLM_FORWARD;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index fcdba091af3d..c212cf5a2bdf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -108,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
108 [OCFS2_LOCK_TYPE_OPEN] = "Open", 108 [OCFS2_LOCK_TYPE_OPEN] = "Open",
109 [OCFS2_LOCK_TYPE_FLOCK] = "Flock", 109 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
110 [OCFS2_LOCK_TYPE_QINFO] = "Quota", 110 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
111 [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
111 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", 112 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
112}; 113};
113 114
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index bf7742d0ee3b..44f2a5e1d042 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -23,6 +23,7 @@
23#include "sysfile.h" 23#include "sysfile.h"
24#include "dlmglue.h" 24#include "dlmglue.h"
25#include "uptodate.h" 25#include "uptodate.h"
26#include "super.h"
26#include "quota.h" 27#include "quota.h"
27 28
28static struct workqueue_struct *ocfs2_quota_wq = NULL; 29static struct workqueue_struct *ocfs2_quota_wq = NULL;
@@ -114,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
114 int rc = 0; 115 int rc = 0;
115 struct buffer_head *tmp = *bh; 116 struct buffer_head *tmp = *bh;
116 117
118 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
119 ocfs2_error(inode->i_sb,
120 "Quota file %llu is probably corrupted! Requested "
121 "to read block %Lu but file has size only %Lu\n",
122 (unsigned long long)OCFS2_I(inode)->ip_blkno,
123 (unsigned long long)v_block,
124 (unsigned long long)i_size_read(inode));
125 return -EIO;
126 }
117 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, 127 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
118 ocfs2_validate_quota_block); 128 ocfs2_validate_quota_block);
119 if (rc) 129 if (rc)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b0ee0fdf799a..a3f8871d21fd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1218,13 +1218,17 @@ static void ocfs2_kill_sb(struct super_block *sb)
1218{ 1218{
1219 struct ocfs2_super *osb = OCFS2_SB(sb); 1219 struct ocfs2_super *osb = OCFS2_SB(sb);
1220 1220
1221 /* Failed mount? */
1222 if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
1223 goto out;
1224
1221 /* Prevent further queueing of inode drop events */ 1225 /* Prevent further queueing of inode drop events */
1222 spin_lock(&dentry_list_lock); 1226 spin_lock(&dentry_list_lock);
1223 ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); 1227 ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
1224 spin_unlock(&dentry_list_lock); 1228 spin_unlock(&dentry_list_lock);
1225 /* Wait for work to finish and/or remove it */ 1229 /* Wait for work to finish and/or remove it */
1226 cancel_work_sync(&osb->dentry_lock_work); 1230 cancel_work_sync(&osb->dentry_lock_work);
1227 1231out:
1228 kill_block_super(sb); 1232 kill_block_super(sb);
1229} 1233}
1230 1234
diff --git a/fs/open.c b/fs/open.c
index dd98e8076024..31191bf513e4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -199,7 +199,7 @@ out:
199int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, 199int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
200 struct file *filp) 200 struct file *filp)
201{ 201{
202 int err; 202 int ret;
203 struct iattr newattrs; 203 struct iattr newattrs;
204 204
205 /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ 205 /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
@@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
214 } 214 }
215 215
216 /* Remove suid/sgid on truncate too */ 216 /* Remove suid/sgid on truncate too */
217 newattrs.ia_valid |= should_remove_suid(dentry); 217 ret = should_remove_suid(dentry);
218 if (ret)
219 newattrs.ia_valid |= ret | ATTR_FORCE;
218 220
219 mutex_lock(&dentry->d_inode->i_mutex); 221 mutex_lock(&dentry->d_inode->i_mutex);
220 err = notify_change(dentry, &newattrs); 222 ret = notify_change(dentry, &newattrs);
221 mutex_unlock(&dentry->d_inode->i_mutex); 223 mutex_unlock(&dentry->d_inode->i_mutex);
222 return err; 224 return ret;
223} 225}
224 226
225static long do_sys_truncate(const char __user *pathname, loff_t length) 227static long do_sys_truncate(const char __user *pathname, loff_t length)
@@ -957,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
957 int error; 959 int error;
958 struct file *f; 960 struct file *f;
959 961
962 validate_creds(cred);
963
960 /* 964 /*
961 * We must always pass in a valid mount pointer. Historically 965 * We must always pass in a valid mount pointer. Historically
962 * callers got away with not passing it, but we must enforce this at 966 * callers got away with not passing it, but we must enforce this at
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 175db258942f..6f742f6658a9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1003,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
1003 1003
1004 if (!task) 1004 if (!task)
1005 return -ESRCH; 1005 return -ESRCH;
1006 task_lock(task); 1006 oom_adjust = task->oomkilladj;
1007 if (task->mm)
1008 oom_adjust = task->mm->oom_adj;
1009 else
1010 oom_adjust = OOM_DISABLE;
1011 task_unlock(task);
1012 put_task_struct(task); 1007 put_task_struct(task);
1013 1008
1014 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); 1009 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1037,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1037 task = get_proc_task(file->f_path.dentry->d_inode); 1032 task = get_proc_task(file->f_path.dentry->d_inode);
1038 if (!task) 1033 if (!task)
1039 return -ESRCH; 1034 return -ESRCH;
1040 task_lock(task); 1035 if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
1041 if (!task->mm) {
1042 task_unlock(task);
1043 put_task_struct(task);
1044 return -EINVAL;
1045 }
1046 if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1047 task_unlock(task);
1048 put_task_struct(task); 1036 put_task_struct(task);
1049 return -EACCES; 1037 return -EACCES;
1050 } 1038 }
1051 task->mm->oom_adj = oom_adjust; 1039 task->oomkilladj = oom_adjust;
1052 task_unlock(task);
1053 put_task_struct(task); 1040 put_task_struct(task);
1054 if (end - buffer == 0) 1041 if (end - buffer == 0)
1055 return -EIO; 1042 return -EIO;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a7f0110fca4c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops;
46static const struct inode_operations ramfs_dir_inode_operations; 46static const struct inode_operations ramfs_dir_inode_operations;
47 47
48static struct backing_dev_info ramfs_backing_dev_info = { 48static struct backing_dev_info ramfs_backing_dev_info = {
49 .name = "ramfs",
49 .ra_pages = 0, /* No readahead */ 50 .ra_pages = 0, /* No readahead */
50 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | 51 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
51 BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | 52 BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/select.c b/fs/select.c
index d870237e42c7..8084834e123e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq)
110{ 110{
111 init_poll_funcptr(&pwq->pt, __pollwait); 111 init_poll_funcptr(&pwq->pt, __pollwait);
112 pwq->polling_task = current; 112 pwq->polling_task = current;
113 pwq->triggered = 0;
113 pwq->error = 0; 114 pwq->error = 0;
114 pwq->table = NULL; 115 pwq->table = NULL;
115 pwq->inline_index = 0; 116 pwq->inline_index = 0;
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..9cda337ddae2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
62 s = NULL; 62 s = NULL;
63 goto out; 63 goto out;
64 } 64 }
65 INIT_LIST_HEAD(&s->s_dirty);
66 INIT_LIST_HEAD(&s->s_io);
67 INIT_LIST_HEAD(&s->s_more_io);
68 INIT_LIST_HEAD(&s->s_files); 65 INIT_LIST_HEAD(&s->s_files);
69 INIT_LIST_HEAD(&s->s_instances); 66 INIT_LIST_HEAD(&s->s_instances);
70 INIT_HLIST_HEAD(&s->s_anon); 67 INIT_HLIST_HEAD(&s->s_anon);
@@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
171 * Drops a temporary reference, frees superblock if there's no 168 * Drops a temporary reference, frees superblock if there's no
172 * references left. 169 * references left.
173 */ 170 */
174static void put_super(struct super_block *sb) 171void put_super(struct super_block *sb)
175{ 172{
176 spin_lock(&sb_lock); 173 spin_lock(&sb_lock);
177 __put_super(sb); 174 __put_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..103cc7fdd3df 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -19,20 +19,22 @@
19 SYNC_FILE_RANGE_WAIT_AFTER) 19 SYNC_FILE_RANGE_WAIT_AFTER)
20 20
21/* 21/*
22 * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) 22 * Do the filesystem syncing work. For simple filesystems
23 * just dirties buffers with inodes so we have to submit IO for these buffers 23 * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
24 * via __sync_blockdev(). This also speeds up the wait == 1 case since in that 24 * submit IO for these buffers via __sync_blockdev(). This also speeds up the
25 * case write_inode() functions do sync_dirty_buffer() and thus effectively 25 * wait == 1 case since in that case write_inode() functions do
26 * write one block at a time. 26 * sync_dirty_buffer() and thus effectively write one block at a time.
27 */ 27 */
28static int __sync_filesystem(struct super_block *sb, int wait) 28static int __sync_filesystem(struct super_block *sb, int wait)
29{ 29{
30 /* Avoid doing twice syncing and cache pruning for quota sync */ 30 /* Avoid doing twice syncing and cache pruning for quota sync */
31 if (!wait) 31 if (!wait) {
32 writeout_quota_sb(sb, -1); 32 writeout_quota_sb(sb, -1);
33 else 33 writeback_inodes_sb(sb);
34 } else {
34 sync_quota_sb(sb, -1); 35 sync_quota_sb(sb, -1);
35 sync_inodes_sb(sb, wait); 36 sync_inodes_sb(sb);
37 }
36 if (sb->s_op->sync_fs) 38 if (sb->s_op->sync_fs)
37 sb->s_op->sync_fs(sb, wait); 39 sb->s_op->sync_fs(sb, wait);
38 return __sync_blockdev(sb->s_bdev, wait); 40 return __sync_blockdev(sb->s_bdev, wait);
@@ -118,7 +120,7 @@ restart:
118 */ 120 */
119SYSCALL_DEFINE0(sync) 121SYSCALL_DEFINE0(sync)
120{ 122{
121 wakeup_pdflush(0); 123 wakeup_flusher_threads(0);
122 sync_filesystems(0); 124 sync_filesystems(0);
123 sync_filesystems(1); 125 sync_filesystems(1);
124 if (unlikely(laptop_mode)) 126 if (unlikely(laptop_mode))
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 14f2d71ea3ce..0050fc40e8c9 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
760const struct inode_operations sysfs_dir_inode_operations = { 760const struct inode_operations sysfs_dir_inode_operations = {
761 .lookup = sysfs_lookup, 761 .lookup = sysfs_lookup,
762 .setattr = sysfs_setattr, 762 .setattr = sysfs_setattr,
763 .setxattr = sysfs_setxattr,
763}; 764};
764 765
765static void remove_dir(struct sysfs_dirent *sd) 766static void remove_dir(struct sysfs_dirent *sd)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..e28cecf179f5 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,8 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/xattr.h>
22#include <linux/security.h>
21#include "sysfs.h" 23#include "sysfs.h"
22 24
23extern struct super_block * sysfs_sb; 25extern struct super_block * sysfs_sb;
@@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = {
29}; 31};
30 32
31static struct backing_dev_info sysfs_backing_dev_info = { 33static struct backing_dev_info sysfs_backing_dev_info = {
34 .name = "sysfs",
32 .ra_pages = 0, /* No readahead */ 35 .ra_pages = 0, /* No readahead */
33 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
34}; 37};
35 38
36static const struct inode_operations sysfs_inode_operations ={ 39static const struct inode_operations sysfs_inode_operations ={
37 .setattr = sysfs_setattr, 40 .setattr = sysfs_setattr,
41 .setxattr = sysfs_setxattr,
38}; 42};
39 43
40int __init sysfs_inode_init(void) 44int __init sysfs_inode_init(void)
@@ -42,18 +46,37 @@ int __init sysfs_inode_init(void)
42 return bdi_init(&sysfs_backing_dev_info); 46 return bdi_init(&sysfs_backing_dev_info);
43} 47}
44 48
49struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
50{
51 struct sysfs_inode_attrs *attrs;
52 struct iattr *iattrs;
53
54 attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
55 if (!attrs)
56 return NULL;
57 iattrs = &attrs->ia_iattr;
58
59 /* assign default attributes */
60 iattrs->ia_mode = sd->s_mode;
61 iattrs->ia_uid = 0;
62 iattrs->ia_gid = 0;
63 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
64
65 return attrs;
66}
45int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) 67int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
46{ 68{
47 struct inode * inode = dentry->d_inode; 69 struct inode * inode = dentry->d_inode;
48 struct sysfs_dirent * sd = dentry->d_fsdata; 70 struct sysfs_dirent * sd = dentry->d_fsdata;
49 struct iattr * sd_iattr; 71 struct sysfs_inode_attrs *sd_attrs;
72 struct iattr *iattrs;
50 unsigned int ia_valid = iattr->ia_valid; 73 unsigned int ia_valid = iattr->ia_valid;
51 int error; 74 int error;
52 75
53 if (!sd) 76 if (!sd)
54 return -EINVAL; 77 return -EINVAL;
55 78
56 sd_iattr = sd->s_iattr; 79 sd_attrs = sd->s_iattr;
57 80
58 error = inode_change_ok(inode, iattr); 81 error = inode_change_ok(inode, iattr);
59 if (error) 82 if (error)
@@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
65 if (error) 88 if (error)
66 return error; 89 return error;
67 90
68 if (!sd_iattr) { 91 if (!sd_attrs) {
69 /* setting attributes for the first time, allocate now */ 92 /* setting attributes for the first time, allocate now */
70 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 93 sd_attrs = sysfs_init_inode_attrs(sd);
71 if (!sd_iattr) 94 if (!sd_attrs)
72 return -ENOMEM; 95 return -ENOMEM;
73 /* assign default attributes */ 96 sd->s_iattr = sd_attrs;
74 sd_iattr->ia_mode = sd->s_mode; 97 } else {
75 sd_iattr->ia_uid = 0; 98 /* attributes were changed at least once in past */
76 sd_iattr->ia_gid = 0; 99 iattrs = &sd_attrs->ia_iattr;
77 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 100
78 sd->s_iattr = sd_iattr; 101 if (ia_valid & ATTR_UID)
102 iattrs->ia_uid = iattr->ia_uid;
103 if (ia_valid & ATTR_GID)
104 iattrs->ia_gid = iattr->ia_gid;
105 if (ia_valid & ATTR_ATIME)
106 iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
107 inode->i_sb->s_time_gran);
108 if (ia_valid & ATTR_MTIME)
109 iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
110 inode->i_sb->s_time_gran);
111 if (ia_valid & ATTR_CTIME)
112 iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
113 inode->i_sb->s_time_gran);
114 if (ia_valid & ATTR_MODE) {
115 umode_t mode = iattr->ia_mode;
116
117 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
118 mode &= ~S_ISGID;
119 iattrs->ia_mode = sd->s_mode = mode;
120 }
79 } 121 }
122 return error;
123}
80 124
81 /* attributes were changed atleast once in past */ 125int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
82 126 size_t size, int flags)
83 if (ia_valid & ATTR_UID) 127{
84 sd_iattr->ia_uid = iattr->ia_uid; 128 struct sysfs_dirent *sd = dentry->d_fsdata;
85 if (ia_valid & ATTR_GID) 129 struct sysfs_inode_attrs *iattrs;
86 sd_iattr->ia_gid = iattr->ia_gid; 130 void *secdata;
87 if (ia_valid & ATTR_ATIME) 131 int error;
88 sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, 132 u32 secdata_len = 0;
89 inode->i_sb->s_time_gran); 133
90 if (ia_valid & ATTR_MTIME) 134 if (!sd)
91 sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, 135 return -EINVAL;
92 inode->i_sb->s_time_gran); 136 if (!sd->s_iattr)
93 if (ia_valid & ATTR_CTIME) 137 sd->s_iattr = sysfs_init_inode_attrs(sd);
94 sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, 138 if (!sd->s_iattr)
95 inode->i_sb->s_time_gran); 139 return -ENOMEM;
96 if (ia_valid & ATTR_MODE) { 140
97 umode_t mode = iattr->ia_mode; 141 iattrs = sd->s_iattr;
98 142
99 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) 143 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
100 mode &= ~S_ISGID; 144 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
101 sd_iattr->ia_mode = sd->s_mode = mode; 145 error = security_inode_setsecurity(dentry->d_inode, suffix,
102 } 146 value, size, flags);
147 if (error)
148 goto out;
149 error = security_inode_getsecctx(dentry->d_inode,
150 &secdata, &secdata_len);
151 if (error)
152 goto out;
153 if (iattrs->ia_secdata)
154 security_release_secctx(iattrs->ia_secdata,
155 iattrs->ia_secdata_len);
156 iattrs->ia_secdata = secdata;
157 iattrs->ia_secdata_len = secdata_len;
103 158
159 } else
160 return -EINVAL;
161out:
104 return error; 162 return error;
105} 163}
106 164
@@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
146static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) 204static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
147{ 205{
148 struct bin_attribute *bin_attr; 206 struct bin_attribute *bin_attr;
207 struct sysfs_inode_attrs *iattrs;
149 208
150 inode->i_private = sysfs_get(sd); 209 inode->i_private = sysfs_get(sd);
151 inode->i_mapping->a_ops = &sysfs_aops; 210 inode->i_mapping->a_ops = &sysfs_aops;
@@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
154 inode->i_ino = sd->s_ino; 213 inode->i_ino = sd->s_ino;
155 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); 214 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
156 215
157 if (sd->s_iattr) { 216 iattrs = sd->s_iattr;
217 if (iattrs) {
158 /* sysfs_dirent has non-default attributes 218 /* sysfs_dirent has non-default attributes
159 * get them for the new inode from persistent copy 219 * get them for the new inode from persistent copy
160 * in sysfs_dirent 220 * in sysfs_dirent
161 */ 221 */
162 set_inode_attr(inode, sd->s_iattr); 222 set_inode_attr(inode, &iattrs->ia_iattr);
223 if (iattrs->ia_secdata)
224 security_inode_notifysecctx(inode,
225 iattrs->ia_secdata,
226 iattrs->ia_secdata_len);
163 } else 227 } else
164 set_default_inode_attr(inode, sd->s_mode); 228 set_default_inode_attr(inode, sd->s_mode);
165 229
166
167 /* initialize inode according to type */ 230 /* initialize inode according to type */
168 switch (sysfs_type(sd)) { 231 switch (sysfs_type(sd)) {
169 case SYSFS_DIR: 232 case SYSFS_DIR:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1d897ad808e0..c5081ad77026 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -16,6 +16,7 @@
16#include <linux/kobject.h> 16#include <linux/kobject.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/security.h>
19 20
20#include "sysfs.h" 21#include "sysfs.h"
21 22
@@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
209} 210}
210 211
211const struct inode_operations sysfs_symlink_inode_operations = { 212const struct inode_operations sysfs_symlink_inode_operations = {
213 .setxattr = sysfs_setxattr,
212 .readlink = generic_readlink, 214 .readlink = generic_readlink,
213 .follow_link = sysfs_follow_link, 215 .follow_link = sysfs_follow_link,
214 .put_link = sysfs_put_link, 216 .put_link = sysfs_put_link,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3fa0d98481e2..af4c4e7482ac 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,8 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/fs.h>
12
11struct sysfs_open_dirent; 13struct sysfs_open_dirent;
12 14
13/* type-specific structures for sysfs_dirent->s_* union members */ 15/* type-specific structures for sysfs_dirent->s_* union members */
@@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr {
31 struct hlist_head buffers; 33 struct hlist_head buffers;
32}; 34};
33 35
36struct sysfs_inode_attrs {
37 struct iattr ia_iattr;
38 void *ia_secdata;
39 u32 ia_secdata_len;
40};
41
34/* 42/*
35 * sysfs_dirent - the building block of sysfs hierarchy. Each and 43 * sysfs_dirent - the building block of sysfs hierarchy. Each and
36 * every sysfs node is represented by single sysfs_dirent. 44 * every sysfs node is represented by single sysfs_dirent.
@@ -56,7 +64,7 @@ struct sysfs_dirent {
56 unsigned int s_flags; 64 unsigned int s_flags;
57 ino_t s_ino; 65 ino_t s_ino;
58 umode_t s_mode; 66 umode_t s_mode;
59 struct iattr *s_iattr; 67 struct sysfs_inode_attrs *s_iattr;
60}; 68};
61 69
62#define SD_DEACTIVATED_BIAS INT_MIN 70#define SD_DEACTIVATED_BIAS INT_MIN
@@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
148struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 156struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
149void sysfs_delete_inode(struct inode *inode); 157void sysfs_delete_inode(struct inode *inode);
150int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 158int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
159int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
160 size_t size, int flags);
151int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 161int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
152int sysfs_inode_init(void); 162int sysfs_inode_init(void);
153 163
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index eaf6d891d46f..1c8991b0db13 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -65,26 +65,14 @@
65static int shrink_liability(struct ubifs_info *c, int nr_to_write) 65static int shrink_liability(struct ubifs_info *c, int nr_to_write)
66{ 66{
67 int nr_written; 67 int nr_written;
68 struct writeback_control wbc = {
69 .sync_mode = WB_SYNC_NONE,
70 .range_end = LLONG_MAX,
71 .nr_to_write = nr_to_write,
72 };
73
74 generic_sync_sb_inodes(c->vfs_sb, &wbc);
75 nr_written = nr_to_write - wbc.nr_to_write;
76 68
69 nr_written = writeback_inodes_sb(c->vfs_sb);
77 if (!nr_written) { 70 if (!nr_written) {
78 /* 71 /*
79 * Re-try again but wait on pages/inodes which are being 72 * Re-try again but wait on pages/inodes which are being
80 * written-back concurrently (e.g., by pdflush). 73 * written-back concurrently (e.g., by pdflush).
81 */ 74 */
82 memset(&wbc, 0, sizeof(struct writeback_control)); 75 nr_written = sync_inodes_sb(c->vfs_sb);
83 wbc.sync_mode = WB_SYNC_ALL;
84 wbc.range_end = LLONG_MAX;
85 wbc.nr_to_write = nr_to_write;
86 generic_sync_sb_inodes(c->vfs_sb, &wbc);
87 nr_written = nr_to_write - wbc.nr_to_write;
88 } 76 }
89 77
90 dbg_budg("%d pages were written back", nr_written); 78 dbg_budg("%d pages were written back", nr_written);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 26d2e0d80465..51763aa8f4de 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -438,12 +438,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
438{ 438{
439 int i, err; 439 int i, err;
440 struct ubifs_info *c = sb->s_fs_info; 440 struct ubifs_info *c = sb->s_fs_info;
441 struct writeback_control wbc = {
442 .sync_mode = WB_SYNC_ALL,
443 .range_start = 0,
444 .range_end = LLONG_MAX,
445 .nr_to_write = LONG_MAX,
446 };
447 441
448 /* 442 /*
449 * Zero @wait is just an advisory thing to help the file system shove 443 * Zero @wait is just an advisory thing to help the file system shove
@@ -462,7 +456,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
462 * the user be able to get more accurate results of 'statfs()' after 456 * the user be able to get more accurate results of 'statfs()' after
463 * they synchronize the file system. 457 * they synchronize the file system.
464 */ 458 */
465 generic_sync_sb_inodes(sb, &wbc); 459 sync_inodes_sb(sb);
466 460
467 /* 461 /*
468 * Synchronize write buffers, because 'ubifs_run_commit()' does not 462 * Synchronize write buffers, because 'ubifs_run_commit()' does not
@@ -1971,6 +1965,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1971 * 1965 *
1972 * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 1966 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
1973 */ 1967 */
1968 c->bdi.name = "ubifs",
1974 c->bdi.capabilities = BDI_CAP_MAP_COPY; 1969 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1975 c->bdi.unplug_io_fn = default_unplug_io_fn; 1970 c->bdi.unplug_io_fn = default_unplug_io_fn;
1976 err = bdi_init(&c->bdi); 1971 err = bdi_init(&c->bdi);
diff --git a/fs/xattr.c b/fs/xattr.c
index 1c3d0af59ddf..6d4f6d3449fb 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask)
66 return inode_permission(inode, mask); 66 return inode_permission(inode, mask);
67} 67}
68 68
69int 69/**
70vfs_setxattr(struct dentry *dentry, const char *name, const void *value, 70 * __vfs_setxattr_noperm - perform setxattr operation without performing
71 size_t size, int flags) 71 * permission checks.
72 *
73 * @dentry - object to perform setxattr on
74 * @name - xattr name to set
75 * @value - value to set @name to
76 * @size - size of @value
77 * @flags - flags to pass into filesystem operations
78 *
79 * returns the result of the internal setxattr or setsecurity operations.
80 *
81 * This function requires the caller to lock the inode's i_mutex before it
82 * is executed. It also assumes that the caller will make the appropriate
83 * permission checks.
84 */
85int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
86 const void *value, size_t size, int flags)
72{ 87{
73 struct inode *inode = dentry->d_inode; 88 struct inode *inode = dentry->d_inode;
74 int error; 89 int error = -EOPNOTSUPP;
75
76 error = xattr_permission(inode, name, MAY_WRITE);
77 if (error)
78 return error;
79 90
80 mutex_lock(&inode->i_mutex);
81 error = security_inode_setxattr(dentry, name, value, size, flags);
82 if (error)
83 goto out;
84 error = -EOPNOTSUPP;
85 if (inode->i_op->setxattr) { 91 if (inode->i_op->setxattr) {
86 error = inode->i_op->setxattr(dentry, name, value, size, flags); 92 error = inode->i_op->setxattr(dentry, name, value, size, flags);
87 if (!error) { 93 if (!error) {
@@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
97 if (!error) 103 if (!error)
98 fsnotify_xattr(dentry); 104 fsnotify_xattr(dentry);
99 } 105 }
106
107 return error;
108}
109
110
111int
112vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
113 size_t size, int flags)
114{
115 struct inode *inode = dentry->d_inode;
116 int error;
117
118 error = xattr_permission(inode, name, MAY_WRITE);
119 if (error)
120 return error;
121
122 mutex_lock(&inode->i_mutex);
123 error = security_inode_setxattr(dentry, name, value, size, flags);
124 if (error)
125 goto out;
126
127 error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
128
100out: 129out:
101 mutex_unlock(&inode->i_mutex); 130 mutex_unlock(&inode->i_mutex);
102 return error; 131 return error;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0882d166239a..eafcc7c18706 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl(
619 case XFS_IOC_GETVERSION_32: 619 case XFS_IOC_GETVERSION_32:
620 cmd = _NATIVE_IOC(cmd, long); 620 cmd = _NATIVE_IOC(cmd, long);
621 return xfs_file_ioctl(filp, cmd, p); 621 return xfs_file_ioctl(filp, cmd, p);
622 case XFS_IOC_SWAPEXT: { 622 case XFS_IOC_SWAPEXT_32: {
623 struct xfs_swapext sxp; 623 struct xfs_swapext sxp;
624 struct compat_xfs_swapext __user *sxu = arg; 624 struct compat_xfs_swapext __user *sxu = arg;
625 625
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 8070b34cc287..6c32f1d63d8c 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -485,14 +485,6 @@ xfs_vn_put_link(
485} 485}
486 486
487STATIC int 487STATIC int
488xfs_vn_permission(
489 struct inode *inode,
490 int mask)
491{
492 return generic_permission(inode, mask, xfs_check_acl);
493}
494
495STATIC int
496xfs_vn_getattr( 488xfs_vn_getattr(
497 struct vfsmount *mnt, 489 struct vfsmount *mnt,
498 struct dentry *dentry, 490 struct dentry *dentry,
@@ -696,7 +688,7 @@ xfs_vn_fiemap(
696} 688}
697 689
698static const struct inode_operations xfs_inode_operations = { 690static const struct inode_operations xfs_inode_operations = {
699 .permission = xfs_vn_permission, 691 .check_acl = xfs_check_acl,
700 .truncate = xfs_vn_truncate, 692 .truncate = xfs_vn_truncate,
701 .getattr = xfs_vn_getattr, 693 .getattr = xfs_vn_getattr,
702 .setattr = xfs_vn_setattr, 694 .setattr = xfs_vn_setattr,
@@ -724,7 +716,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
724 .rmdir = xfs_vn_unlink, 716 .rmdir = xfs_vn_unlink,
725 .mknod = xfs_vn_mknod, 717 .mknod = xfs_vn_mknod,
726 .rename = xfs_vn_rename, 718 .rename = xfs_vn_rename,
727 .permission = xfs_vn_permission, 719 .check_acl = xfs_check_acl,
728 .getattr = xfs_vn_getattr, 720 .getattr = xfs_vn_getattr,
729 .setattr = xfs_vn_setattr, 721 .setattr = xfs_vn_setattr,
730 .setxattr = generic_setxattr, 722 .setxattr = generic_setxattr,
@@ -749,7 +741,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
749 .rmdir = xfs_vn_unlink, 741 .rmdir = xfs_vn_unlink,
750 .mknod = xfs_vn_mknod, 742 .mknod = xfs_vn_mknod,
751 .rename = xfs_vn_rename, 743 .rename = xfs_vn_rename,
752 .permission = xfs_vn_permission, 744 .check_acl = xfs_check_acl,
753 .getattr = xfs_vn_getattr, 745 .getattr = xfs_vn_getattr,
754 .setattr = xfs_vn_setattr, 746 .setattr = xfs_vn_setattr,
755 .setxattr = generic_setxattr, 747 .setxattr = generic_setxattr,
@@ -762,7 +754,7 @@ static const struct inode_operations xfs_symlink_inode_operations = {
762 .readlink = generic_readlink, 754 .readlink = generic_readlink,
763 .follow_link = xfs_vn_follow_link, 755 .follow_link = xfs_vn_follow_link,
764 .put_link = xfs_vn_put_link, 756 .put_link = xfs_vn_put_link,
765 .permission = xfs_vn_permission, 757 .check_acl = xfs_check_acl,
766 .getattr = xfs_vn_getattr, 758 .getattr = xfs_vn_getattr,
767 .setattr = xfs_vn_setattr, 759 .setattr = xfs_vn_setattr,
768 .setxattr = generic_setxattr, 760 .setxattr = generic_setxattr,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b619d6b8ca43..98ef624d9baf 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
708 return 0; 708 return 0;
709} 709}
710 710
711void
712__xfs_inode_set_reclaim_tag(
713 struct xfs_perag *pag,
714 struct xfs_inode *ip)
715{
716 radix_tree_tag_set(&pag->pag_ici_root,
717 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
718 XFS_ICI_RECLAIM_TAG);
719}
720
711/* 721/*
712 * We set the inode flag atomically with the radix tree tag. 722 * We set the inode flag atomically with the radix tree tag.
713 * Once we get tag lookups on the radix tree, this inode flag 723 * Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
722 732
723 read_lock(&pag->pag_ici_lock); 733 read_lock(&pag->pag_ici_lock);
724 spin_lock(&ip->i_flags_lock); 734 spin_lock(&ip->i_flags_lock);
725 radix_tree_tag_set(&pag->pag_ici_root, 735 __xfs_inode_set_reclaim_tag(pag, ip);
726 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
727 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 736 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
728 spin_unlock(&ip->i_flags_lock); 737 spin_unlock(&ip->i_flags_lock);
729 read_unlock(&pag->pag_ici_lock); 738 read_unlock(&pag->pag_ici_lock);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 2a10301c99c7..59120602588a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
48int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 48int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
49 49
50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
51void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); 52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
52void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
53 struct xfs_inode *ip); 54 struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 34ec86923f7e..ecbf8b4d2e2e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -191,80 +191,82 @@ xfs_iget_cache_hit(
191 int flags, 191 int flags,
192 int lock_flags) __releases(pag->pag_ici_lock) 192 int lock_flags) __releases(pag->pag_ici_lock)
193{ 193{
194 struct inode *inode = VFS_I(ip);
194 struct xfs_mount *mp = ip->i_mount; 195 struct xfs_mount *mp = ip->i_mount;
195 int error = EAGAIN; 196 int error;
197
198 spin_lock(&ip->i_flags_lock);
196 199
197 /* 200 /*
198 * If INEW is set this inode is being set up 201 * If we are racing with another cache hit that is currently
199 * If IRECLAIM is set this inode is being torn down 202 * instantiating this inode or currently recycling it out of
200 * Pause and try again. 203 * reclaimabe state, wait for the initialisation to complete
204 * before continuing.
205 *
206 * XXX(hch): eventually we should do something equivalent to
207 * wait_on_inode to wait for these flags to be cleared
208 * instead of polling for it.
201 */ 209 */
202 if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { 210 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
203 XFS_STATS_INC(xs_ig_frecycle); 211 XFS_STATS_INC(xs_ig_frecycle);
212 error = EAGAIN;
204 goto out_error; 213 goto out_error;
205 } 214 }
206 215
207 /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ 216 /*
208 if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { 217 * If lookup is racing with unlink return an error immediately.
209 218 */
210 /* 219 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
211 * If lookup is racing with unlink, then we should return an 220 error = ENOENT;
212 * error immediately so we don't remove it from the reclaim 221 goto out_error;
213 * list and potentially leak the inode. 222 }
214 */
215 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
216 error = ENOENT;
217 goto out_error;
218 }
219 223
224 /*
225 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
226 * Need to carefully get it back into useable state.
227 */
228 if (ip->i_flags & XFS_IRECLAIMABLE) {
220 xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); 229 xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
221 230
222 /* 231 /*
223 * We need to re-initialise the VFS inode as it has been 232 * We need to set XFS_INEW atomically with clearing the
224 * 'freed' by the VFS. Do this here so we can deal with 233 * reclaimable tag so that we do have an indicator of the
225 * errors cleanly, then tag it so it can be set up correctly 234 * inode still being initialized.
226 * later.
227 */ 235 */
228 if (inode_init_always(mp->m_super, VFS_I(ip))) { 236 ip->i_flags |= XFS_INEW;
229 error = ENOMEM; 237 ip->i_flags &= ~XFS_IRECLAIMABLE;
230 goto out_error; 238 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
231 }
232 239
233 /* 240 spin_unlock(&ip->i_flags_lock);
234 * We must set the XFS_INEW flag before clearing the 241 read_unlock(&pag->pag_ici_lock);
235 * XFS_IRECLAIMABLE flag so that if a racing lookup does
236 * not find the XFS_IRECLAIMABLE above but has the igrab()
237 * below succeed we can safely check XFS_INEW to detect
238 * that this inode is still being initialised.
239 */
240 xfs_iflags_set(ip, XFS_INEW);
241 xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
242 242
243 /* clear the radix tree reclaim flag as well. */ 243 error = -inode_init_always(mp->m_super, inode);
244 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 244 if (error) {
245 } else if (!igrab(VFS_I(ip))) { 245 /*
246 * Re-initializing the inode failed, and we are in deep
247 * trouble. Try to re-add it to the reclaim list.
248 */
249 read_lock(&pag->pag_ici_lock);
250 spin_lock(&ip->i_flags_lock);
251
252 ip->i_flags &= ~XFS_INEW;
253 ip->i_flags |= XFS_IRECLAIMABLE;
254 __xfs_inode_set_reclaim_tag(pag, ip);
255 goto out_error;
256 }
257 inode->i_state = I_LOCK|I_NEW;
258 } else {
246 /* If the VFS inode is being torn down, pause and try again. */ 259 /* If the VFS inode is being torn down, pause and try again. */
247 XFS_STATS_INC(xs_ig_frecycle); 260 if (!igrab(inode)) {
248 goto out_error; 261 error = EAGAIN;
249 } else if (xfs_iflags_test(ip, XFS_INEW)) { 262 goto out_error;
250 /* 263 }
251 * We are racing with another cache hit that is
252 * currently recycling this inode out of the XFS_IRECLAIMABLE
253 * state. Wait for the initialisation to complete before
254 * continuing.
255 */
256 wait_on_inode(VFS_I(ip));
257 }
258 264
259 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { 265 /* We've got a live one. */
260 error = ENOENT; 266 spin_unlock(&ip->i_flags_lock);
261 iput(VFS_I(ip)); 267 read_unlock(&pag->pag_ici_lock);
262 goto out_error;
263 } 268 }
264 269
265 /* We've got a live one. */
266 read_unlock(&pag->pag_ici_lock);
267
268 if (lock_flags != 0) 270 if (lock_flags != 0)
269 xfs_ilock(ip, lock_flags); 271 xfs_ilock(ip, lock_flags);
270 272
@@ -274,6 +276,7 @@ xfs_iget_cache_hit(
274 return 0; 276 return 0;
275 277
276out_error: 278out_error:
279 spin_unlock(&ip->i_flags_lock);
277 read_unlock(&pag->pag_ici_lock); 280 read_unlock(&pag->pag_ici_lock);
278 return error; 281 return error;
279} 282}