path: root/fs
author	Tejun Heo <tj@kernel.org>	2011-05-24 03:59:36 -0400
committer	Tejun Heo <tj@kernel.org>	2011-05-24 03:59:36 -0400
commit	6988f20fe04e9ef3aea488cb8ab57fbeb78e12f0 (patch)
tree	c9d7fc50a2e2147a5ca07e3096e7eeb916ad2da9 /fs
parent	0415b00d175e0d8945e6785aad21b5f157976ce0 (diff)
parent	6ea0c34dac89611126455537552cffe6c7e832ad (diff)
Merge branch 'fixes-2.6.39' into for-2.6.40
Diffstat (limited to 'fs')
-rw-r--r--  fs/autofs4/autofs_i.h       |    2
-rw-r--r--  fs/autofs4/dev-ioctl.c      |    4
-rw-r--r--  fs/autofs4/expire.c         |   84
-rw-r--r--  fs/autofs4/root.c           |   62
-rw-r--r--  fs/autofs4/waitq.c          |    6
-rw-r--r--  fs/block_dev.c              |    6
-rw-r--r--  fs/buffer.c                 |    2
-rw-r--r--  fs/coda/sysctl.c            |    9
-rw-r--r--  fs/drop_caches.c            |   18
-rw-r--r--  fs/ext4/balloc.c            |    3
-rw-r--r--  fs/ext4/ext4_jbd2.h         |    7
-rw-r--r--  fs/ext4/extents.c           |  213
-rw-r--r--  fs/ext4/fsync.c             |   14
-rw-r--r--  fs/ext4/ialloc.c            |    8
-rw-r--r--  fs/ext4/inode.c             |  410
-rw-r--r--  fs/ext4/ioctl.c             |    7
-rw-r--r--  fs/ext4/mballoc.c           |   34
-rw-r--r--  fs/ext4/mballoc.h           |    2
-rw-r--r--  fs/ext4/migrate.c           |   10
-rw-r--r--  fs/ext4/namei.c             |   13
-rw-r--r--  fs/ext4/page-io.c           |   13
-rw-r--r--  fs/ext4/resize.c            |   12
-rw-r--r--  fs/ext4/super.c             |   48
-rw-r--r--  fs/ext4/xattr.c             |    4
-rw-r--r--  fs/fs-writeback.c           |  141
-rw-r--r--  fs/inode.c                  |  656
-rw-r--r--  fs/internal.h               |    7
-rw-r--r--  fs/jffs2/xattr.c            |    2
-rw-r--r--  fs/logfs/inode.c            |    2
-rw-r--r--  fs/namei.c                  |   23
-rw-r--r--  fs/nfs/dir.c                |   89
-rw-r--r--  fs/nfs/file.c               |    3
-rw-r--r--  fs/nfs/getroot.c            |    4
-rw-r--r--  fs/nfs/inode.c              |   10
-rw-r--r--  fs/nfs/internal.h           |   27
-rw-r--r--  fs/nfs/namespace.c          |  115
-rw-r--r--  fs/nfs/nfs3proc.c           |    2
-rw-r--r--  fs/nfs/nfs4_fs.h            |    5
-rw-r--r--  fs/nfs/nfs4filelayout.c     |  352
-rw-r--r--  fs/nfs/nfs4filelayout.h     |    2
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c  |  178
-rw-r--r--  fs/nfs/nfs4proc.c           |  302
-rw-r--r--  fs/nfs/nfs4xdr.c            |  313
-rw-r--r--  fs/nfs/pagelist.c           |    8
-rw-r--r--  fs/nfs/pnfs.c               |  142
-rw-r--r--  fs/nfs/pnfs.h               |   83
-rw-r--r--  fs/nfs/proc.c               |    2
-rw-r--r--  fs/nfs/write.c              |  214
-rw-r--r--  fs/nfs_common/nfsacl.c      |    1
-rw-r--r--  fs/notify/inode_mark.c      |   42
-rw-r--r--  fs/notify/mark.c            |    1
-rw-r--r--  fs/notify/vfsmount_mark.c   |    1
-rw-r--r--  fs/ntfs/inode.c             |    4
-rw-r--r--  fs/proc/task_mmu.c          |    3
-rw-r--r--  fs/quota/dquot.c            |   41
55 files changed, 2648 insertions(+), 1118 deletions(-)
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 54f923792728..475f9c597cb7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -61,8 +61,6 @@ do { \
 		current->pid, __func__, ##args);	\
 } while (0)
 
-extern spinlock_t autofs4_lock;
-
 /* Unified info structure. This is pointed to by both the dentry and
    inode structures. Each file in the filesystem has an instance of this
    structure. It holds a reference to the dentry, so dentries are never
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1442da4860e5..509fe1eb66ae 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 		return -EBUSY;
 	} else {
 		struct file *pipe = fget(pipefd);
+		if (!pipe) {
+			err = -EBADF;
+			goto out;
+		}
 		if (!pipe->f_op || !pipe->f_op->write) {
 			err = -EPIPE;
 			fput(pipe);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index f43100b9662b..450f529a4eae 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -87,18 +87,70 @@ done:
 }
 
 /*
+ * Calculate and dget next entry in the subdirs list under root.
+ */
+static struct dentry *get_next_positive_subdir(struct dentry *prev,
+						struct dentry *root)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+	struct list_head *next;
+	struct dentry *p, *q;
+
+	spin_lock(&sbi->lookup_lock);
+
+	if (prev == NULL) {
+		spin_lock(&root->d_lock);
+		prev = dget_dlock(root);
+		next = prev->d_subdirs.next;
+		p = prev;
+		goto start;
+	}
+
+	p = prev;
+	spin_lock(&p->d_lock);
+again:
+	next = p->d_u.d_child.next;
+start:
+	if (next == &root->d_subdirs) {
+		spin_unlock(&p->d_lock);
+		spin_unlock(&sbi->lookup_lock);
+		dput(prev);
+		return NULL;
+	}
+
+	q = list_entry(next, struct dentry, d_u.d_child);
+
+	spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Negative dentry - try next */
+	if (!simple_positive(q)) {
+		spin_unlock(&p->d_lock);
+		p = q;
+		goto again;
+	}
+	dget_dlock(q);
+	spin_unlock(&q->d_lock);
+	spin_unlock(&p->d_lock);
+	spin_unlock(&sbi->lookup_lock);
+
+	dput(prev);
+
+	return q;
+}
+
+/*
  * Calculate and dget next entry in top down tree traversal.
  */
 static struct dentry *get_next_positive_dentry(struct dentry *prev,
 					       struct dentry *root)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
 	struct dentry *p, *ret;
 
 	if (prev == NULL)
 		return dget(root);
 
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 relock:
 	p = prev;
 	spin_lock(&p->d_lock);
@@ -110,7 +162,7 @@ again:
 
 	if (p == root) {
 		spin_unlock(&p->d_lock);
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->lookup_lock);
 		dput(prev);
 		return NULL;
 	}
@@ -140,7 +192,7 @@ again:
 	dget_dlock(ret);
 	spin_unlock(&ret->d_lock);
 	spin_unlock(&p->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	dput(prev);
 
@@ -290,11 +342,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(root);
 	/* No point expiring a pending mount */
-	if (ino->flags & AUTOFS_INF_PENDING) {
-		spin_unlock(&sbi->fs_lock);
-		return NULL;
-	}
-	managed_dentry_set_transit(root);
+	if (ino->flags & AUTOFS_INF_PENDING)
+		goto out;
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
 		ino->flags |= AUTOFS_INF_EXPIRING;
@@ -302,7 +351,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
-	managed_dentry_clear_transit(root);
+out:
 	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
@@ -336,13 +385,12 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	timeout = sbi->exp_timeout;
 
 	dentry = NULL;
-	while ((dentry = get_next_positive_dentry(dentry, root))) {
+	while ((dentry = get_next_positive_subdir(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
 		/* No point expiring a pending mount */
 		if (ino->flags & AUTOFS_INF_PENDING)
-			goto cont;
-		managed_dentry_set_transit(dentry);
+			goto next;
 
 		/*
 		 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -402,8 +450,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			}
 		}
 next:
-		managed_dentry_clear_transit(dentry);
-cont:
 		spin_unlock(&sbi->fs_lock);
 	}
 	return NULL;
@@ -415,13 +461,13 @@ found:
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 	spin_lock(&expired->d_parent->d_lock);
 	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
 	spin_unlock(&expired->d_lock);
 	spin_unlock(&expired->d_parent->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 	return expired;
 }
 
@@ -484,8 +530,6 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	if (!d_unhashed(dentry))
-		managed_dentry_clear_transit(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -513,9 +557,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		spin_lock(&sbi->fs_lock);
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
 		spin_lock(&dentry->d_lock);
-		if (ret)
-			__managed_dentry_clear_transit(dentry);
-		else {
+		if (!ret) {
 			if ((IS_ROOT(dentry) ||
 			    (autofs_type_indirect(sbi->type) &&
 			     IS_ROOT(dentry->d_parent))) &&
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e6f84d26f4cf..96804a17bbd0 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,8 +23,6 @@
 
 #include "autofs_i.h"
 
-DEFINE_SPINLOCK(autofs4_lock);
-
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * autofs file system so just let the libfs routines handle
 	 * it.
 	 */
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->lookup_lock);
 		return -ENOENT;
 	}
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 out:
 	return dcache_dir_open(inode, file);
@@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
 	list_for_each(p, head) {
@@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 			dget_dlock(active);
 			spin_unlock(&active->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&autofs4_lock);
 			return active;
 		}
 next:
 		spin_unlock(&active->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->expiring_list;
 	list_for_each(p, head) {
@@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 			dget_dlock(expiring);
 			spin_unlock(&expiring->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&autofs4_lock);
 			return expiring;
 		}
 next:
 		spin_unlock(&expiring->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -275,17 +267,16 @@ static int autofs4_mount_wait(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	int status;
+	int status = 0;
 
 	if (ino->flags & AUTOFS_INF_PENDING) {
 		DPRINTK("waiting for mount name=%.*s",
 			dentry->d_name.len, dentry->d_name.name);
 		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 		DPRINTK("mount wait done status=%d", status);
-		ino->last_used = jiffies;
-		return status;
 	}
-	return 0;
+	ino->last_used = jiffies;
+	return status;
 }
 
 static int do_expire_wait(struct dentry *dentry)
@@ -319,9 +310,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
 	 */
 	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
 		struct dentry *parent = dentry->d_parent;
+		struct autofs_info *ino;
 		struct dentry *new = d_lookup(parent, &dentry->d_name);
 		if (!new)
 			return NULL;
+		ino = autofs4_dentry_ino(new);
+		ino->last_used = jiffies;
 		dput(path->dentry);
 		path->dentry = new;
 	}
@@ -338,18 +332,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	DPRINTK("dentry=%p %.*s",
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
-	/*
-	 * Someone may have manually umounted this or it was a submount
-	 * that has gone away.
-	 */
-	spin_lock(&dentry->d_lock);
-	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-		if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
-		    (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-			__managed_dentry_set_transit(path->dentry);
-	}
-	spin_unlock(&dentry->d_lock);
-
 	/* The daemon never triggers a mount. */
 	if (autofs4_oz_mode(sbi))
 		return NULL;
@@ -418,18 +400,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 done:
 	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
 		/*
-		 * Any needed mounting has been completed and the path updated
-		 * so turn this into a normal dentry so we don't continually
-		 * call ->d_automount() and ->d_manage().
-		 */
-		spin_lock(&dentry->d_lock);
-		__managed_dentry_clear_transit(dentry);
-		/*
+		 * Any needed mounting has been completed and the path
+		 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
+		 * call ->d_automount() on rootless multi-mounts since
+		 * it can lead to an incorrect ELOOP error return.
+		 *
 		 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
 		 * symlinks as in all other cases the dentry will be covered by
 		 * an actual mount so ->d_automount() won't be called during
 		 * the follow.
 		 */
+		spin_lock(&dentry->d_lock);
 		if ((!d_mountpoint(dentry) &&
 		    !list_empty(&dentry->d_subdirs)) ||
 		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
@@ -455,6 +436,8 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 
 	/* The daemon never waits. */
 	if (autofs4_oz_mode(sbi)) {
+		if (rcu_walk)
+			return 0;
 		if (!d_mountpoint(dentry))
 			return -EISDIR;
 		return 0;
@@ -612,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	dir->i_mtime = CURRENT_TIME;
 
-	spin_lock(&autofs4_lock);
-	autofs4_add_expiring(dentry);
+	spin_lock(&sbi->lookup_lock);
+	__autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	return 0;
 }
@@ -686,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi))
 		return -EACCES;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	if (!list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&autofs4_lock);
 		return -ENOTEMPTY;
 	}
 	__autofs4_add_expiring(dentry);
-	spin_unlock(&sbi->lookup_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	if (sbi->version < 5)
 		autofs_clear_leaf_automount_flags(dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 56010056b2e6..25435987d6ae 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -197,12 +197,12 @@ rename_retry:
 
 	seq = read_seqbegin(&rename_lock);
 	rcu_read_lock();
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->fs_lock);
 	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
 		len += tmp->d_name.len + 1;
 
 	if (!len || --len > NAME_MAX) {
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->fs_lock);
 		rcu_read_unlock();
 		if (read_seqretry(&rename_lock, seq))
 			goto rename_retry;
@@ -218,7 +218,7 @@ rename_retry:
 		p -= tmp->d_name.len;
 		strncpy(p, tmp->d_name.name, tmp->d_name.len);
 	}
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->fs_lock);
 	rcu_read_unlock();
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7d02afb2b7f4..c1511c674f53 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,11 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_wb_list_lock);
 }
 
 static sector_t max_block(struct block_device *bdev)
diff --git a/fs/buffer.c b/fs/buffer.c
index 2e6b1a387b7e..a08bb8e61c6f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1138,7 +1138,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * inode list.
  *
  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and the global inode_lock.
+ * mapping->tree_lock and mapping->host->i_lock.
  */
 void mark_buffer_dirty(struct buffer_head *bh)
 {
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 06d27a41807f..af56ad56a89a 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -61,4 +61,13 @@ void coda_sysctl_clean(void)
 		fs_table_header = NULL;
 	}
 }
+
+#else
+void coda_sysctl_init(void)
+{
+}
+
+void coda_sysctl_clean(void)
+{
+}
 #endif
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 816f88e6b9ce..98b77c89494c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,6 +8,7 @@
 #include <linux/writeback.h>
 #include <linux/sysctl.h>
 #include <linux/gfp.h>
+#include "internal.h"
 
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
@@ -16,20 +17,23 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
 	struct inode *inode, *toput_inode = NULL;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
-			continue;
-		if (inode->i_mapping->nrpages == 0)
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (inode->i_mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		__iget(inode);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(toput_inode);
 }
35 39
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index adf96b822781..97b970e7dd13 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -21,6 +21,8 @@
21#include "ext4_jbd2.h" 21#include "ext4_jbd2.h"
22#include "mballoc.h" 22#include "mballoc.h"
23 23
24#include <trace/events/ext4.h>
25
24/* 26/*
25 * balloc.c contains the blocks allocation and deallocation routines 27 * balloc.c contains the blocks allocation and deallocation routines
26 */ 28 */
@@ -342,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		 * We do it here so the bitmap uptodate bit
 		 * get set with buffer lock held.
 		 */
+		trace_ext4_read_block_bitmap_load(sb, block_group);
 		set_bitmap_uptodate(bh);
 		if (bh_submit_read(bh) < 0) {
 			put_bh(bh);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d8b992e658c1..e25e99bf7ee1 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -202,13 +202,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 	return 1;
 }
 
-static inline void ext4_journal_release_buffer(handle_t *handle,
-						struct buffer_head *bh)
-{
-	if (ext4_handle_valid(handle))
-		jbd2_journal_release_buffer(handle, bh);
-}
-
 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 {
 	return ext4_journal_start_sb(inode->i_sb, nblocks);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7516fb9c0bd5..dd2cb5076ff9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,8 @@
44#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
45#include "ext4_extents.h" 45#include "ext4_extents.h"
46 46
47#include <trace/events/ext4.h>
48
47static int ext4_ext_truncate_extend_restart(handle_t *handle, 49static int ext4_ext_truncate_extend_restart(handle_t *handle,
48 struct inode *inode, 50 struct inode *inode,
49 int needed) 51 int needed)
@@ -664,6 +666,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		if (unlikely(!bh))
 			goto err;
 		if (!bh_uptodate_or_lock(bh)) {
+			trace_ext4_ext_load_extent(inode, block,
+						path[ppos].p_block);
 			if (bh_submit_read(bh) < 0) {
 				put_bh(bh);
 				goto err;
@@ -1034,7 +1038,7 @@ cleanup:
 		for (i = 0; i < depth; i++) {
 			if (!ablocks[i])
 				continue;
-			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
 					 EXT4_FREE_BLOCKS_METADATA);
 		}
 	}
@@ -2059,7 +2063,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	if (err)
 		return err;
 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
-	ext4_free_blocks(handle, inode, 0, leaf, 1,
+	ext4_free_blocks(handle, inode, NULL, leaf, 1,
 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 	return err;
 }
@@ -2156,7 +2160,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		start = ext4_ext_pblock(ex) + ee_len - num;
 		ext_debug("free last %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, 0, start, num, flags);
+		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -3108,14 +3112,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 {
 	int i, depth;
 	struct ext4_extent_header *eh;
-	struct ext4_extent *ex, *last_ex;
+	struct ext4_extent *last_ex;
 
 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
 		return 0;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
-	ex = path[depth].p_ext;
 
 	if (unlikely(!eh->eh_entries)) {
 		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
@@ -3295,9 +3298,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map, int flags)
 {
 	struct ext4_ext_path *path = NULL;
-	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex;
-	ext4_fsblk_t newblock;
+	ext4_fsblk_t newblock = 0;
 	int err = 0, depth, ret;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
@@ -3305,6 +3307,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
 		  map->m_lblk, map->m_len, inode->i_ino);
+	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* check in cache */
 	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
@@ -3352,7 +3355,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		err = -EIO;
 		goto out2;
 	}
-	eh = path[depth].p_hdr;
 
 	ex = path[depth].p_ext;
 	if (ex) {
@@ -3485,7 +3487,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
-		ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
+		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
 				 ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
 	}
@@ -3525,6 +3527,8 @@ out2:
 		ext4_ext_drop_refs(path);
 		kfree(path);
 	}
+	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+		newblock, map->m_len, err ? err : allocated);
 	return err ? err : allocated;
 }
 
@@ -3658,6 +3662,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
+	trace_ext4_fallocate_enter(inode, offset, len, mode);
 	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
@@ -3673,6 +3678,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	ret = inode_newsize_ok(inode, (len + offset));
 	if (ret) {
 		mutex_unlock(&inode->i_mutex);
+		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
 		return ret;
 	}
 retry:
@@ -3717,6 +3723,8 @@ retry:
 		goto retry;
 	}
 	mutex_unlock(&inode->i_mutex);
+	trace_ext4_fallocate_exit(inode, offset, max_blocks,
+				ret > 0 ? ret2 : ret);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -3775,6 +3783,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	}
 	return ret > 0 ? ret2 : ret;
 }
+
 /*
  * Callback function called for each extent to gather FIEMAP information.
  */
@@ -3782,38 +3791,162 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
 		       void *data)
 {
-	struct fiemap_extent_info *fieinfo = data;
-	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
 	__u64	logical;
 	__u64	physical;
 	__u64	length;
+	loff_t	size;
 	__u32	flags = 0;
-	int	error;
+	int	ret = 0;
+	struct fiemap_extent_info *fieinfo = data;
+	unsigned char blksize_bits;
 
-	logical =  (__u64)newex->ec_block << blksize_bits;
+	blksize_bits = inode->i_sb->s_blocksize_bits;
+	logical = (__u64)newex->ec_block << blksize_bits;
 
 	if (newex->ec_start == 0) {
-		pgoff_t offset;
-		struct page *page;
+		/*
+		 * No extent in extent-tree contains block @newex->ec_start,
+		 * then the block may stay in 1)a hole or 2)delayed-extent.
+		 *
+		 * Holes or delayed-extents are processed as follows.
+		 * 1. lookup dirty pages with specified range in pagecache.
+		 *    If no page is got, then there is no delayed-extent and
+		 *    return with EXT_CONTINUE.
+		 * 2. find the 1st mapped buffer,
+		 * 3. check if the mapped buffer is both in the request range
+		 *    and a delayed buffer. If not, there is no delayed-extent,
+		 *    then return.
+		 * 4. a delayed-extent is found, the extent will be collected.
+		 */
+		ext4_lblk_t	end = 0;
+		pgoff_t		last_offset;
+		pgoff_t		offset;
+		pgoff_t		index;
+		struct page	**pages = NULL;
 		struct buffer_head *bh = NULL;
+		struct buffer_head *head = NULL;
+		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
+
+		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (pages == NULL)
+			return -ENOMEM;
 
 		offset = logical >> PAGE_SHIFT;
-		page = find_get_page(inode->i_mapping, offset);
-		if (!page || !page_has_buffers(page))
-			return EXT_CONTINUE;
+repeat:
+		last_offset = offset;
+		head = NULL;
+		ret = find_get_pages_tag(inode->i_mapping, &offset,
+					PAGECACHE_TAG_DIRTY, nr_pages, pages);
+
+		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+			/* First time, try to find a mapped buffer. */
+			if (ret == 0) {
+out:
+				for (index = 0; index < ret; index++)
+					page_cache_release(pages[index]);
+				/* just a hole. */
+				kfree(pages);
+				return EXT_CONTINUE;
+			}
 
-		bh = page_buffers(page);
+			/* Try to find the 1st mapped buffer. */
+			end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
+				  blksize_bits;
+			if (!page_has_buffers(pages[0]))
+				goto out;
+			head = page_buffers(pages[0]);
+			if (!head)
+				goto out;
 
-		if (!bh)
-			return EXT_CONTINUE;
+			bh = head;
+			do {
+				if (buffer_mapped(bh)) {
+					/* get the 1st mapped buffer. */
+					if (end > newex->ec_block +
+						newex->ec_len)
+						/* The buffer is out of
+						 * the request range.
+						 */
+						goto out;
+					goto found_mapped_buffer;
+				}
+				bh = bh->b_this_page;
+				end++;
+			} while (bh != head);
 
-		if (buffer_delay(bh)) {
-			flags |= FIEMAP_EXTENT_DELALLOC;
-			page_cache_release(page);
+			/* No mapped buffer found. */
+			goto out;
 		} else {
-			page_cache_release(page);
-			return EXT_CONTINUE;
+			/*Find contiguous delayed buffers. */
+			if (ret > 0 && pages[0]->index == last_offset)
+				head = page_buffers(pages[0]);
+			bh = head;
 		}
+
+found_mapped_buffer:
+		if (bh != NULL && buffer_delay(bh)) {
+			/* 1st or contiguous delayed buffer found. */
+			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+				/*
+				 * 1st delayed buffer found, record
+				 * the start of extent.
+				 */
+				flags |= FIEMAP_EXTENT_DELALLOC;
+				newex->ec_block = end;
+				logical = (__u64)end << blksize_bits;
+			}
+			/* Find contiguous delayed buffers. */
+			do {
+				if (!buffer_delay(bh))
+					goto found_delayed_extent;
+				bh = bh->b_this_page;
+				end++;
+			} while (bh != head);
+
+			for (index = 1; index < ret; index++) {
+				if (!page_has_buffers(pages[index])) {
+					bh = NULL;
+					break;
+				}
+				head = page_buffers(pages[index]);
+				if (!head) {
+					bh = NULL;
+					break;
+				}
+				if (pages[index]->index !=
+					pages[0]->index + index) {
+					/* Blocks are not contiguous. */
+					bh = NULL;
+					break;
+				}
+				bh = head;
+				do {
+					if (!buffer_delay(bh))
+						/* Delayed-extent ends. */
+						goto found_delayed_extent;
+					bh = bh->b_this_page;
+					end++;
+				} while (bh != head);
+			}
+		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
+			/* a hole found. */
+			goto out;
+
+found_delayed_extent:
+		newex->ec_len = min(end - newex->ec_block,
+						(ext4_lblk_t)EXT_INIT_MAX_LEN);
+		if (ret == nr_pages && bh != NULL &&
+			newex->ec_len < EXT_INIT_MAX_LEN &&
+			buffer_delay(bh)) {
+			/* Have not collected an extent and continue. */
+			for (index = 0; index < ret; index++)
+				page_cache_release(pages[index]);
+			goto repeat;
+		}
+
+		for (index = 0; index < ret; index++)
+			page_cache_release(pages[index]);
+		kfree(pages);
 	}
 
 	physical = (__u64)newex->ec_start << blksize_bits;
@@ -3822,32 +3955,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 	if (ex && ext4_ext_is_uninitialized(ex))
 		flags |= FIEMAP_EXTENT_UNWRITTEN;
 
-	/*
-	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
-	 *
-	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
-	 * this also indicates no more allocated blocks.
-	 *
-	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
-	 */
-	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
-	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
-		loff_t size = i_size_read(inode);
-		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
-
+	size = i_size_read(inode);
+	if (logical + length >= size)
 		flags |= FIEMAP_EXTENT_LAST;
-		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
-		    logical+length > size)
-			length = (size - logical + bs - 1) & ~(bs-1);
-	}
 
-	error = fiemap_fill_next_extent(fieinfo, logical, physical,
+	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
 					length, flags);
-	if (error < 0)
-		return error;
-	if (error == 1)
+	if (ret < 0)
+		return ret;
+	if (ret == 1)
 		return EXT_BREAK;
-
 	return EXT_CONTINUE;
 }
 
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 7829b287822a..7f74019d6d77 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -164,20 +164,20 @@ int ext4_sync_file(struct file *file, int datasync)
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
-	trace_ext4_sync_file(file, datasync);
+	trace_ext4_sync_file_enter(file, datasync);
 
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return 0;
 
 	ret = ext4_flush_completed_IO(inode);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	if (!journal) {
 		ret = generic_file_fsync(file, datasync);
 		if (!ret && !list_empty(&inode->i_dentry))
 			ext4_sync_parent(inode);
-		return ret;
+		goto out;
 	}
 
 	/*
@@ -194,8 +194,10 @@ int ext4_sync_file(struct file *file, int datasync)
 	 * (they were dirtied by commit).  But that's OK - the blocks are
 	 * safe in-journal, which is all fsync() needs to ensure.
 	 */
-	if (ext4_should_journal_data(inode))
-		return ext4_force_commit(inode->i_sb);
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		goto out;
+	}
 
 	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
 	if (jbd2_log_start_commit(journal, commit_tid)) {
@@ -215,5 +217,7 @@ int ext4_sync_file(struct file *file, int datasync)
 		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ out:
+	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 78b79e1bd7ed..21bb2f61e502 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -152,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		 * We do it here so the bitmap uptodate bit
 		 * get set with buffer lock held.
 		 */
+		trace_ext4_load_inode_bitmap(sb, block_group);
 		set_bitmap_uptodate(bh);
 		if (bh_submit_read(bh) < 0) {
 			put_bh(bh);
@@ -649,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 		*group = parent_group + flex_size;
 		if (*group > ngroups)
 			*group = 0;
-		return find_group_orlov(sb, parent, group, mode, 0);
+		return find_group_orlov(sb, parent, group, mode, NULL);
 	}
 
 	/*
@@ -1054,6 +1055,11 @@ got:
 		}
 	}
 
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9297ad46c465..1a86282b9024 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -173,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
 	up_write(&EXT4_I(inode)->i_data_sem);
-	ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+	ret = ext4_journal_restart(handle, nblocks);
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_discard_preallocations(inode);
 
@@ -720,7 +720,7 @@ allocated:
 	return ret;
 failed_out:
 	for (i = 0; i < index; i++)
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 	return ret;
 }
 
@@ -823,20 +823,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 	return err;
 failed:
 	/* Allocation failed, free what we already allocated */
-	ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
+	ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
 	for (i = 1; i <= n ; i++) {
 		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
 		 */
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
 				 EXT4_FREE_BLOCKS_FORGET);
 	}
 	for (i = n+1; i < indirect_blks; i++)
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 
-	ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
+	ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
 
 	return err;
 }
@@ -924,7 +924,7 @@ err_out:
 		ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
 				 EXT4_FREE_BLOCKS_FORGET);
 	}
-	ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
 			 blks, 0);
 
 	return err;
@@ -973,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
 
+	trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1058,6 +1059,8 @@ cleanup:
 		partial--;
 	}
 out:
+	trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
+				map->m_pblk, map->m_len, err);
 	return err;
 }
 
@@ -2060,7 +2063,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
-			int commit_write = 0, redirty_page = 0;
+			int commit_write = 0, skip_page = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
@@ -2086,14 +2089,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 			 * If the page does not have buffers (for
 			 * whatever reason), try to create them using
 			 * __block_write_begin.  If this fails,
-			 * redirty the page and move on.
+			 * skip the page and move on.
 			 */
 			if (!page_has_buffers(page)) {
 				if (__block_write_begin(page, 0, len,
 						noalloc_get_block_write)) {
-redirty_page:
-					redirty_page_for_writepage(mpd->wbc,
-								   page);
+skip_page:
 					unlock_page(page);
 					continue;
 				}
@@ -2104,7 +2105,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 				block_start = 0;
 				do {
 					if (!bh)
-						goto redirty_page;
+						goto skip_page;
 					if (map && (cur_logical >= map->m_lblk) &&
 					    (cur_logical <= (map->m_lblk +
 						(map->m_len - 1)))) {
@@ -2120,22 +2121,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 						clear_buffer_unwritten(bh);
 					}
 
-					/* redirty page if block allocation undone */
+					/* skip page if block allocation undone */
 					if (buffer_delay(bh) || buffer_unwritten(bh))
-						redirty_page = 1;
+						skip_page = 1;
 					bh = bh->b_this_page;
 					block_start += bh->b_size;
 					cur_logical++;
 					pblock++;
 				} while (bh != page_bufs);
 
-			if (redirty_page)
-				goto redirty_page;
+			if (skip_page)
+				goto skip_page;
 
 			if (commit_write)
 				/* mark the buffer_heads as dirty & uptodate */
 				block_commit_write(page, 0, len);
 
+			clear_page_dirty_for_io(page);
 			/*
 			 * Delalloc doesn't support data journalling,
 			 * but eventually maybe we'll lift this
@@ -2165,8 +2167,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 	return ret;
 }
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
-					sector_t logical, long blk_cnt)
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
@@ -2174,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
 
-	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	end   = (logical + blk_cnt - 1) >>
-				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+	index = mpd->first_page;
+	end   = mpd->next_page - 1;
 	while (index <= end) {
 		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 		if (nr_pages == 0)
@@ -2279,9 +2279,8 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 		err = blks;
 		/*
 		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will call
-		 * ext4_writepage() for all of the pages which will
-		 * just redirty the pages.
+		 * appears to be free blocks we will just let
+		 * mpage_da_submit_io() unlock all of the pages.
 		 */
 		if (err == -EAGAIN)
 			goto submit_io;
@@ -2312,8 +2311,10 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 			ext4_print_free_blocks(mpd->inode);
 		}
 		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd, next,
-				mpd->b_size >> mpd->inode->i_blkbits);
+		ext4_da_block_invalidatepages(mpd);
+
+		/* Mark this page range as having been completed */
+		mpd->io_done = 1;
 		return;
 	}
 	BUG_ON(blks == 0);
@@ -2438,102 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * __mpage_da_writepage - finds extent of pages and blocks
- *
- * @page: page to consider
- * @wbc: not used, we just follow rules
- * @data: context
- *
- * The function finds extents of pages and scan them for all blocks.
- */
-static int __mpage_da_writepage(struct page *page,
-				struct writeback_control *wbc,
-				struct mpage_da_data *mpd)
-{
-	struct inode *inode = mpd->inode;
-	struct buffer_head *bh, *head;
-	sector_t logical;
-
-	/*
-	 * Can we merge this page to current extent?
-	 */
-	if (mpd->next_page != page->index) {
-		/*
-		 * Nope, we can't. So, we map non-allocated blocks
-		 * and start IO on them
-		 */
-		if (mpd->next_page != mpd->first_page) {
-			mpage_da_map_and_submit(mpd);
-			/*
-			 * skip rest of the page in the page_vec
-			 */
-			redirty_page_for_writepage(wbc, page);
-			unlock_page(page);
-			return MPAGE_DA_EXTENT_TAIL;
-		}
-
-		/*
-		 * Start next extent of pages ...
-		 */
-		mpd->first_page = page->index;
-
-		/*
-		 * ... and blocks
-		 */
-		mpd->b_size = 0;
-		mpd->b_state = 0;
-		mpd->b_blocknr = 0;
-	}
-
-	mpd->next_page = page->index + 1;
-	logical = (sector_t) page->index <<
-		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-	if (!page_has_buffers(page)) {
-		mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
-				       (1 << BH_Dirty) | (1 << BH_Uptodate));
-		if (mpd->io_done)
-			return MPAGE_DA_EXTENT_TAIL;
-	} else {
-		/*
-		 * Page with regular buffer heads, just add all dirty ones
-		 */
-		head = page_buffers(page);
-		bh = head;
-		do {
-			BUG_ON(buffer_locked(bh));
-			/*
-			 * We need to try to allocate
-			 * unmapped blocks in the same page.
-			 * Otherwise we won't make progress
-			 * with the page in ext4_writepage
-			 */
-			if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-				mpage_add_bh_to_extent(mpd, logical,
-						       bh->b_size,
-						       bh->b_state);
-				if (mpd->io_done)
-					return MPAGE_DA_EXTENT_TAIL;
-			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
-				/*
-				 * mapped dirty buffer. We need to update
-				 * the b_state because we look at
-				 * b_state in mpage_da_map_blocks.  We don't
-				 * update b_size because if we find an
-				 * unmapped buffer_head later we need to
-				 * use the b_state flag of that buffer_head.
-				 */
-				if (mpd->b_size == 0)
-					mpd->b_state = bh->b_state & BH_FLAGS;
-			}
-			logical++;
-		} while ((bh = bh->b_this_page) != head);
-	}
-
-	return 0;
-}
-
-/*
  * This is a special get_blocks_t callback which is used by
  * ext4_da_write_begin().  It will either return mapped block or
  * reserve space for a single block.
@@ -2597,7 +2502,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2597 * for partial write. 2502 * for partial write.
2598 */ 2503 */
2599 set_buffer_new(bh); 2504 set_buffer_new(bh);
2600 set_buffer_mapped(bh);
2601 } 2505 }
2602 return 0; 2506 return 0;
2603} 2507}
@@ -2811,27 +2715,27 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2811 2715
2812/* 2716/*
2813 * write_cache_pages_da - walk the list of dirty pages of the given 2717 * write_cache_pages_da - walk the list of dirty pages of the given
2814 * address space and call the callback function (which usually writes 2718 * address space and accumulate pages that need writing, and call
2815 * the pages). 2719 * mpage_da_map_and_submit to map a single contiguous memory region
2816 * 2720 * and then write them.
2817 * This is a forked version of write_cache_pages(). Differences:
2818 * Range cyclic is ignored.
2819 * no_nrwrite_index_update is always presumed true
2820 */ 2721 */
2821static int write_cache_pages_da(struct address_space *mapping, 2722static int write_cache_pages_da(struct address_space *mapping,
2822 struct writeback_control *wbc, 2723 struct writeback_control *wbc,
2823 struct mpage_da_data *mpd, 2724 struct mpage_da_data *mpd,
2824 pgoff_t *done_index) 2725 pgoff_t *done_index)
2825{ 2726{
2826 int ret = 0; 2727 struct buffer_head *bh, *head;
2827 int done = 0; 2728 struct inode *inode = mapping->host;
2828 struct pagevec pvec; 2729 struct pagevec pvec;
2829 unsigned nr_pages; 2730 unsigned int nr_pages;
2830 pgoff_t index; 2731 sector_t logical;
2831 pgoff_t end; /* Inclusive */ 2732 pgoff_t index, end;
2832 long nr_to_write = wbc->nr_to_write; 2733 long nr_to_write = wbc->nr_to_write;
2833 int tag; 2734 int i, tag, ret = 0;
2834 2735
2736 memset(mpd, 0, sizeof(struct mpage_da_data));
2737 mpd->wbc = wbc;
2738 mpd->inode = inode;
2835 pagevec_init(&pvec, 0); 2739 pagevec_init(&pvec, 0);
2836 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2740 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2837 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2741 end = wbc->range_end >> PAGE_CACHE_SHIFT;
@@ -2842,13 +2746,11 @@ static int write_cache_pages_da(struct address_space *mapping,
2842 tag = PAGECACHE_TAG_DIRTY; 2746 tag = PAGECACHE_TAG_DIRTY;
2843 2747
2844 *done_index = index; 2748 *done_index = index;
2845 while (!done && (index <= end)) { 2749 while (index <= end) {
2846 int i;
2847
2848 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2750 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2849 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2751 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2850 if (nr_pages == 0) 2752 if (nr_pages == 0)
2851 break; 2753 return 0;
2852 2754
2853 for (i = 0; i < nr_pages; i++) { 2755 for (i = 0; i < nr_pages; i++) {
2854 struct page *page = pvec.pages[i]; 2756 struct page *page = pvec.pages[i];
@@ -2860,60 +2762,100 @@ static int write_cache_pages_da(struct address_space *mapping,
2860 * mapping. However, page->index will not change 2762 * mapping. However, page->index will not change
2861 * because we have a reference on the page. 2763 * because we have a reference on the page.
2862 */ 2764 */
2863 if (page->index > end) { 2765 if (page->index > end)
2864 done = 1; 2766 goto out;
2865 break;
2866 }
2867 2767
2868 *done_index = page->index + 1; 2768 *done_index = page->index + 1;
2869 2769
2770 /*
2771 * If we can't merge this page, and we have
2772 * accumulated a contiguous region, write it
2773 */
2774 if ((mpd->next_page != page->index) &&
2775 (mpd->next_page != mpd->first_page)) {
2776 mpage_da_map_and_submit(mpd);
2777 goto ret_extent_tail;
2778 }
2779
2870 lock_page(page); 2780 lock_page(page);
2871 2781
2872 /* 2782 /*
2873 * Page truncated or invalidated. We can freely skip it 2783 * If the page is no longer dirty, or its
2874 * then, even for data integrity operations: the page 2784 * mapping no longer corresponds to the inode we
2875 * has disappeared concurrently, so there could be no 2785 * are writing (which means it has been
2876 * real expectation of this data integrity operation 2786 * truncated or invalidated), or the page is
2877 * even if there is now a new, dirty page at the same 2787 * already under writeback and we are not
2878 * pagecache address. 2788 * doing a data integrity writeback, skip the page
2879 */ 2789 */
2880 if (unlikely(page->mapping != mapping)) { 2790 if (!PageDirty(page) ||
2881continue_unlock: 2791 (PageWriteback(page) &&
2792 (wbc->sync_mode == WB_SYNC_NONE)) ||
2793 unlikely(page->mapping != mapping)) {
2882 unlock_page(page); 2794 unlock_page(page);
2883 continue; 2795 continue;
2884 } 2796 }
2885 2797
2886 if (!PageDirty(page)) { 2798 if (PageWriteback(page))
2887 /* someone wrote it for us */ 2799 wait_on_page_writeback(page);
2888 goto continue_unlock;
2889 }
2890
2891 if (PageWriteback(page)) {
2892 if (wbc->sync_mode != WB_SYNC_NONE)
2893 wait_on_page_writeback(page);
2894 else
2895 goto continue_unlock;
2896 }
2897 2800
2898 BUG_ON(PageWriteback(page)); 2801 BUG_ON(PageWriteback(page));
2899 if (!clear_page_dirty_for_io(page))
2900 goto continue_unlock;
2901 2802
2902 ret = __mpage_da_writepage(page, wbc, mpd); 2803 if (mpd->next_page != page->index)
2903 if (unlikely(ret)) { 2804 mpd->first_page = page->index;
2904 if (ret == AOP_WRITEPAGE_ACTIVATE) { 2805 mpd->next_page = page->index + 1;
2905 unlock_page(page); 2806 logical = (sector_t) page->index <<
2906 ret = 0; 2807 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2907 } else { 2808
2908 done = 1; 2809 if (!page_has_buffers(page)) {
2909 break; 2810 mpage_add_bh_to_extent(mpd, logical,
2910 } 2811 PAGE_CACHE_SIZE,
2812 (1 << BH_Dirty) | (1 << BH_Uptodate));
2813 if (mpd->io_done)
2814 goto ret_extent_tail;
2815 } else {
2816 /*
2817 * Page with regular buffer heads,
2818 * just add all dirty ones
2819 */
2820 head = page_buffers(page);
2821 bh = head;
2822 do {
2823 BUG_ON(buffer_locked(bh));
2824 /*
2825 * We need to try to allocate
2826 * unmapped blocks in the same page.
2827 * Otherwise we won't make progress
2828 * with the page in ext4_writepage
2829 */
2830 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2831 mpage_add_bh_to_extent(mpd, logical,
2832 bh->b_size,
2833 bh->b_state);
2834 if (mpd->io_done)
2835 goto ret_extent_tail;
2836 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2837 /*
2838 * mapped dirty buffer. We need
2839 * to update the b_state
2840 * because we look at b_state
2841 * in mpage_da_map_blocks. We
2842 * don't update b_size because
2843 * if we find an unmapped
2844 * buffer_head later we need to
2845 * use the b_state flag of that
2846 * buffer_head.
2847 */
2848 if (mpd->b_size == 0)
2849 mpd->b_state = bh->b_state & BH_FLAGS;
2850 }
2851 logical++;
2852 } while ((bh = bh->b_this_page) != head);
2911 } 2853 }
2912 2854
2913 if (nr_to_write > 0) { 2855 if (nr_to_write > 0) {
2914 nr_to_write--; 2856 nr_to_write--;
2915 if (nr_to_write == 0 && 2857 if (nr_to_write == 0 &&
2916 wbc->sync_mode == WB_SYNC_NONE) { 2858 wbc->sync_mode == WB_SYNC_NONE)
2917 /* 2859 /*
2918 * We stop writing back only if we are 2860 * We stop writing back only if we are
2919 * not doing integrity sync. In case of 2861 * not doing integrity sync. In case of
@@ -2924,14 +2866,18 @@ continue_unlock:
2924 * pages, but have not synced all of the 2866 * pages, but have not synced all of the
2925 * old dirty pages. 2867 * old dirty pages.
2926 */ 2868 */
2927 done = 1; 2869 goto out;
2928 break;
2929 }
2930 } 2870 }
2931 } 2871 }
2932 pagevec_release(&pvec); 2872 pagevec_release(&pvec);
2933 cond_resched(); 2873 cond_resched();
2934 } 2874 }
2875 return 0;
2876ret_extent_tail:
2877 ret = MPAGE_DA_EXTENT_TAIL;
2878out:
2879 pagevec_release(&pvec);
2880 cond_resched();
2935 return ret; 2881 return ret;
2936} 2882}
2937 2883
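The rewritten write_cache_pages_da() above folds the old __mpage_da_writepage() callback into the page-scanning loop itself: dirty pages are merged into one contiguous run, and the run is mapped and submitted as soon as a page fails to extend it (or io_done is set). The run-merging arithmetic, reduced to a self-contained user-space toy — map_and_submit() here is a print stub, not the kernel function:

    #include <stdio.h>

    static void map_and_submit(unsigned long first, unsigned long last)
    {
            printf("map and submit pages %lu..%lu\n", first, last);
    }

    int main(void)
    {
            unsigned long dirty[] = { 3, 4, 5, 9, 10, 17 };
            unsigned long first = 0, next = 0;  /* next == first: no open run */
            size_t i, n = sizeof(dirty) / sizeof(dirty[0]);

            for (i = 0; i < n; i++) {
                    if (next != dirty[i] && next != first) {
                            /* page can't extend the run: flush it */
                            map_and_submit(first, next - 1);
                            first = next = 0;
                    }
                    if (next != dirty[i])
                            first = dirty[i];   /* start a new run */
                    next = dirty[i] + 1;
            }
            if (next != first)
                    map_and_submit(first, next - 1);
            return 0;
    }

For the dirty indices {3, 4, 5, 9, 10, 17} this emits three submissions: 3..5, 9..10 and 17..17 — exactly the contiguity the block allocator wants to see in one request.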
@@ -2945,7 +2891,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2945 struct mpage_da_data mpd; 2891 struct mpage_da_data mpd;
2946 struct inode *inode = mapping->host; 2892 struct inode *inode = mapping->host;
2947 int pages_written = 0; 2893 int pages_written = 0;
2948 long pages_skipped;
2949 unsigned int max_pages; 2894 unsigned int max_pages;
2950 int range_cyclic, cycled = 1, io_done = 0; 2895 int range_cyclic, cycled = 1, io_done = 0;
2951 int needed_blocks, ret = 0; 2896 int needed_blocks, ret = 0;
@@ -3028,11 +2973,6 @@ static int ext4_da_writepages(struct address_space *mapping,
3028 wbc->nr_to_write = desired_nr_to_write; 2973 wbc->nr_to_write = desired_nr_to_write;
3029 } 2974 }
3030 2975
3031 mpd.wbc = wbc;
3032 mpd.inode = mapping->host;
3033
3034 pages_skipped = wbc->pages_skipped;
3035
3036retry: 2976retry:
3037 if (wbc->sync_mode == WB_SYNC_ALL) 2977 if (wbc->sync_mode == WB_SYNC_ALL)
3038 tag_pages_for_writeback(mapping, index, end); 2978 tag_pages_for_writeback(mapping, index, end);
@@ -3059,22 +2999,10 @@ retry:
3059 } 2999 }
3060 3000
3061 /* 3001 /*
3062 * Now call __mpage_da_writepage to find the next 3002 * Now call write_cache_pages_da() to find the next
3063 * contiguous region of logical blocks that need 3003 * contiguous region of logical blocks that need
3064 * blocks to be allocated by ext4. We don't actually 3004 * blocks to be allocated by ext4 and submit them.
3065 * submit the blocks for I/O here, even though
3066 * write_cache_pages thinks it will, and will set the
3067 * pages as clean for write before calling
3068 * __mpage_da_writepage().
3069 */ 3005 */
3070 mpd.b_size = 0;
3071 mpd.b_state = 0;
3072 mpd.b_blocknr = 0;
3073 mpd.first_page = 0;
3074 mpd.next_page = 0;
3075 mpd.io_done = 0;
3076 mpd.pages_written = 0;
3077 mpd.retval = 0;
3078 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); 3006 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3079 /* 3007 /*
3080 * If we have a contiguous extent of pages and we 3008 * If we have a contiguous extent of pages and we
@@ -3096,7 +3024,6 @@ retry:
3096 * and try again 3024 * and try again
3097 */ 3025 */
3098 jbd2_journal_force_commit_nested(sbi->s_journal); 3026 jbd2_journal_force_commit_nested(sbi->s_journal);
3099 wbc->pages_skipped = pages_skipped;
3100 ret = 0; 3027 ret = 0;
3101 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 3028 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3102 /* 3029 /*
@@ -3104,7 +3031,6 @@ retry:
3104 * rest of the pages 3031 * rest of the pages
3105 */ 3032 */
3106 pages_written += mpd.pages_written; 3033 pages_written += mpd.pages_written;
3107 wbc->pages_skipped = pages_skipped;
3108 ret = 0; 3034 ret = 0;
3109 io_done = 1; 3035 io_done = 1;
3110 } else if (wbc->nr_to_write) 3036 } else if (wbc->nr_to_write)
@@ -3122,11 +3048,6 @@ retry:
3122 wbc->range_end = mapping->writeback_index - 1; 3048 wbc->range_end = mapping->writeback_index - 1;
3123 goto retry; 3049 goto retry;
3124 } 3050 }
3125 if (pages_skipped != wbc->pages_skipped)
3126 ext4_msg(inode->i_sb, KERN_CRIT,
3127 "This should not happen leaving %s "
3128 "with nr_to_write = %ld ret = %d",
3129 __func__, wbc->nr_to_write, ret);
3130 3051
3131 /* Update index */ 3052 /* Update index */
3132 wbc->range_cyclic = range_cyclic; 3053 wbc->range_cyclic = range_cyclic;
@@ -3460,6 +3381,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3460 3381
3461static int ext4_readpage(struct file *file, struct page *page) 3382static int ext4_readpage(struct file *file, struct page *page)
3462{ 3383{
3384 trace_ext4_readpage(page);
3463 return mpage_readpage(page, ext4_get_block); 3385 return mpage_readpage(page, ext4_get_block);
3464} 3386}
3465 3387
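This hunk and several that follow add ext4 tracepoints (readpage, invalidatepage, releasepage, direct-IO enter/exit, truncate enter/exit, load_inode, unlink enter/exit) on the common paths. For orientation, such an event is declared roughly as below — a simplified sketch in the style of include/trace/events/ext4.h, not the verbatim definition:

    TRACE_EVENT(ext4_readpage,
            TP_PROTO(struct page *page),
            TP_ARGS(page),

            TP_STRUCT__entry(
                    __field(dev_t,   dev)
                    __field(ino_t,   ino)
                    __field(pgoff_t, index)
            ),

            TP_fast_assign(
                    __entry->dev   = page->mapping->host->i_sb->s_dev;
                    __entry->ino   = page->mapping->host->i_ino;
                    __entry->index = page->index;
            ),

            TP_printk("dev %d,%d ino %lu page_index %lu",
                      MAJOR(__entry->dev), MINOR(__entry->dev),
                      (unsigned long)__entry->ino,
                      (unsigned long)__entry->index)
    );

Once compiled in, each event can be toggled at runtime under /sys/kernel/debug/tracing/events/ext4/ and costs close to nothing while disabled.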
@@ -3494,6 +3416,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
3494{ 3416{
3495 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3417 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3496 3418
3419 trace_ext4_invalidatepage(page, offset);
3420
3497 /* 3421 /*
3498 * free any io_end structure allocated for buffers to be discarded 3422 * free any io_end structure allocated for buffers to be discarded
3499 */ 3423 */
@@ -3515,6 +3439,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3515{ 3439{
3516 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3440 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3517 3441
3442 trace_ext4_releasepage(page);
3443
3518 WARN_ON(PageChecked(page)); 3444 WARN_ON(PageChecked(page));
3519 if (!page_has_buffers(page)) 3445 if (!page_has_buffers(page))
3520 return 0; 3446 return 0;
@@ -3873,11 +3799,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3873{ 3799{
3874 struct file *file = iocb->ki_filp; 3800 struct file *file = iocb->ki_filp;
3875 struct inode *inode = file->f_mapping->host; 3801 struct inode *inode = file->f_mapping->host;
3802 ssize_t ret;
3876 3803
3804 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3877 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3805 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3878 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3806 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3879 3807 else
3880 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3808 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3809 trace_ext4_direct_IO_exit(inode, offset,
3810 iov_length(iov, nr_segs), rw, ret);
3811 return ret;
3881} 3812}
3882 3813
3883/* 3814/*
@@ -4173,6 +4104,9 @@ no_top:
4173 * 4104 *
4174 * We release `count' blocks on disk, but (last - first) may be greater 4105 * We release `count' blocks on disk, but (last - first) may be greater
4175 * than `count' because there can be holes in there. 4106 * than `count' because there can be holes in there.
4107 *
4108 * Return 0 on success, 1 on invalid block range
4109 * and < 0 on fatal error.
4176 */ 4110 */
4177static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4111static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4178 struct buffer_head *bh, 4112 struct buffer_head *bh,
@@ -4199,33 +4133,32 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4199 if (bh) { 4133 if (bh) {
4200 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4134 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4201 err = ext4_handle_dirty_metadata(handle, inode, bh); 4135 err = ext4_handle_dirty_metadata(handle, inode, bh);
4202 if (unlikely(err)) { 4136 if (unlikely(err))
4203 ext4_std_error(inode->i_sb, err); 4137 goto out_err;
4204 return 1;
4205 }
4206 } 4138 }
4207 err = ext4_mark_inode_dirty(handle, inode); 4139 err = ext4_mark_inode_dirty(handle, inode);
4208 if (unlikely(err)) { 4140 if (unlikely(err))
4209 ext4_std_error(inode->i_sb, err); 4141 goto out_err;
4210 return 1;
4211 }
4212 err = ext4_truncate_restart_trans(handle, inode, 4142 err = ext4_truncate_restart_trans(handle, inode,
4213 blocks_for_truncate(inode)); 4143 blocks_for_truncate(inode));
4214 if (unlikely(err)) { 4144 if (unlikely(err))
4215 ext4_std_error(inode->i_sb, err); 4145 goto out_err;
4216 return 1;
4217 }
4218 if (bh) { 4146 if (bh) {
4219 BUFFER_TRACE(bh, "retaking write access"); 4147 BUFFER_TRACE(bh, "retaking write access");
4220 ext4_journal_get_write_access(handle, bh); 4148 err = ext4_journal_get_write_access(handle, bh);
4149 if (unlikely(err))
4150 goto out_err;
4221 } 4151 }
4222 } 4152 }
4223 4153
4224 for (p = first; p < last; p++) 4154 for (p = first; p < last; p++)
4225 *p = 0; 4155 *p = 0;
4226 4156
4227 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4157 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4228 return 0; 4158 return 0;
4159out_err:
4160 ext4_std_error(inode->i_sb, err);
4161 return err;
4229} 4162}
4230 4163
4231/** 4164/**
@@ -4259,7 +4192,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4259 ext4_fsblk_t nr; /* Current block # */ 4192 ext4_fsblk_t nr; /* Current block # */
4260 __le32 *p; /* Pointer into inode/ind 4193 __le32 *p; /* Pointer into inode/ind
4261 for current block */ 4194 for current block */
4262 int err; 4195 int err = 0;
4263 4196
4264 if (this_bh) { /* For indirect block */ 4197 if (this_bh) { /* For indirect block */
4265 BUFFER_TRACE(this_bh, "get_write_access"); 4198 BUFFER_TRACE(this_bh, "get_write_access");
@@ -4281,9 +4214,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4281 } else if (nr == block_to_free + count) { 4214 } else if (nr == block_to_free + count) {
4282 count++; 4215 count++;
4283 } else { 4216 } else {
4284 if (ext4_clear_blocks(handle, inode, this_bh, 4217 err = ext4_clear_blocks(handle, inode, this_bh,
4285 block_to_free, count, 4218 block_to_free, count,
4286 block_to_free_p, p)) 4219 block_to_free_p, p);
4220 if (err)
4287 break; 4221 break;
4288 block_to_free = nr; 4222 block_to_free = nr;
4289 block_to_free_p = p; 4223 block_to_free_p = p;
@@ -4292,9 +4226,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4292 } 4226 }
4293 } 4227 }
4294 4228
4295 if (count > 0) 4229 if (!err && count > 0)
4296 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4230 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4297 count, block_to_free_p, p); 4231 count, block_to_free_p, p);
4232 if (err < 0)
4233 /* fatal error */
4234 return;
4298 4235
4299 if (this_bh) { 4236 if (this_bh) {
4300 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4237 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
@@ -4412,7 +4349,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4412 * transaction where the data blocks are 4349 * transaction where the data blocks are
4413 * actually freed. 4350 * actually freed.
4414 */ 4351 */
4415 ext4_free_blocks(handle, inode, 0, nr, 1, 4352 ext4_free_blocks(handle, inode, NULL, nr, 1,
4416 EXT4_FREE_BLOCKS_METADATA| 4353 EXT4_FREE_BLOCKS_METADATA|
4417 EXT4_FREE_BLOCKS_FORGET); 4354 EXT4_FREE_BLOCKS_FORGET);
4418 4355
@@ -4496,6 +4433,8 @@ void ext4_truncate(struct inode *inode)
4496 ext4_lblk_t last_block; 4433 ext4_lblk_t last_block;
4497 unsigned blocksize = inode->i_sb->s_blocksize; 4434 unsigned blocksize = inode->i_sb->s_blocksize;
4498 4435
4436 trace_ext4_truncate_enter(inode);
4437
4499 if (!ext4_can_truncate(inode)) 4438 if (!ext4_can_truncate(inode))
4500 return; 4439 return;
4501 4440
@@ -4506,6 +4445,7 @@ void ext4_truncate(struct inode *inode)
4506 4445
4507 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4446 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4508 ext4_ext_truncate(inode); 4447 ext4_ext_truncate(inode);
4448 trace_ext4_truncate_exit(inode);
4509 return; 4449 return;
4510 } 4450 }
4511 4451
@@ -4635,6 +4575,7 @@ out_stop:
4635 ext4_orphan_del(handle, inode); 4575 ext4_orphan_del(handle, inode);
4636 4576
4637 ext4_journal_stop(handle); 4577 ext4_journal_stop(handle);
4578 trace_ext4_truncate_exit(inode);
4638} 4579}
4639 4580
4640/* 4581/*
@@ -4766,6 +4707,7 @@ make_io:
4766 * has in-inode xattrs, or we don't have this inode in memory. 4707 * has in-inode xattrs, or we don't have this inode in memory.
4767 * Read the block from disk. 4708 * Read the block from disk.
4768 */ 4709 */
4710 trace_ext4_load_inode(inode);
4769 get_bh(bh); 4711 get_bh(bh);
4770 bh->b_end_io = end_buffer_read_sync; 4712 bh->b_end_io = end_buffer_read_sync;
4771 submit_bh(READ_META, bh); 4713 submit_bh(READ_META, bh);
@@ -4871,7 +4813,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4871 return inode; 4813 return inode;
4872 4814
4873 ei = EXT4_I(inode); 4815 ei = EXT4_I(inode);
4874 iloc.bh = 0; 4816 iloc.bh = NULL;
4875 4817
4876 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4818 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4877 if (ret < 0) 4819 if (ret < 0)
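The ext4_clear_blocks() rework above deserves a second look: three copies of "ext4_std_error(); return 1;" collapse into a single out_err label, the previously ignored return of ext4_journal_get_write_access() is now checked, and the real error code is propagated so that ext4_free_data() can tell fatal (< 0) results from the old catch-all 1. The shape of that refactor, as a self-contained sketch with stub helpers (names are illustrative, not the ext4 symbols):

    #include <stdio.h>

    static int dirty_metadata(void)   { return 0; }
    static int mark_inode_dirty(void) { return 0; }
    static int restart_trans(void)    { return -5; }    /* pretend -EIO */

    static void std_error(int err)
    {
            fprintf(stderr, "ext4 error: %d\n", err);
    }

    static int clear_blocks(void)
    {
            int err;

            err = dirty_metadata();
            if (err)
                    goto out_err;
            err = mark_inode_dirty();
            if (err)
                    goto out_err;
            err = restart_trans();
            if (err)
                    goto out_err;
            return 0;
    out_err:
            std_error(err);         /* report once, in one place */
            return err;             /* the real errno, not a bare 1 */
    }

    int main(void)
    {
            return clear_blocks() < 0;
    }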
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a84faa110bcd..808c554e773f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -334,16 +334,22 @@ mext_out:
334 case FITRIM: 334 case FITRIM:
335 { 335 {
336 struct super_block *sb = inode->i_sb; 336 struct super_block *sb = inode->i_sb;
337 struct request_queue *q = bdev_get_queue(sb->s_bdev);
337 struct fstrim_range range; 338 struct fstrim_range range;
338 int ret = 0; 339 int ret = 0;
339 340
340 if (!capable(CAP_SYS_ADMIN)) 341 if (!capable(CAP_SYS_ADMIN))
341 return -EPERM; 342 return -EPERM;
342 343
344 if (!blk_queue_discard(q))
345 return -EOPNOTSUPP;
346
343 if (copy_from_user(&range, (struct fstrim_range *)arg, 347 if (copy_from_user(&range, (struct fstrim_range *)arg,
344 sizeof(range))) 348 sizeof(range)))
345 return -EFAULT; 349 return -EFAULT;
346 350
351 range.minlen = max((unsigned int)range.minlen,
352 q->limits.discard_granularity);
347 ret = ext4_trim_fs(sb, &range); 353 ret = ext4_trim_fs(sb, &range);
348 if (ret < 0) 354 if (ret < 0)
349 return ret; 355 return ret;
@@ -421,6 +427,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
421 return err; 427 return err;
422 } 428 }
423 case EXT4_IOC_MOVE_EXT: 429 case EXT4_IOC_MOVE_EXT:
430 case FITRIM:
424 break; 431 break;
425 default: 432 default:
426 return -ENOIOCTLCMD; 433 return -ENOIOCTLCMD;
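The FITRIM handler gains two guards: it rejects devices whose queue does not advertise discard support, and it clamps the caller's minlen up to the device's discard granularity before calling ext4_trim_fs(). From user space the ioctl is driven roughly like this — a plausible minimal caller, not part of the patch; the fd must refer to a file or directory on the mounted ext4 filesystem:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>           /* FITRIM, struct fstrim_range */

    int main(int argc, char **argv)
    {
            struct fstrim_range range = {
                    .start  = 0,
                    .len    = UINT64_MAX,   /* whole filesystem */
                    .minlen = 0,            /* kernel clamps this upward */
            };
            int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

            if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
                    perror("FITRIM");       /* EOPNOTSUPP without discard */
                    return 1;
            }
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            return 0;
    }

On return, range.len is updated by ext4_trim_fs() to the number of bytes actually trimmed.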
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d1fe09aea73d..a5837a837a8b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -432,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
432 } 432 }
433 433
434 /* at order 0 we see each particular block */ 434 /* at order 0 we see each particular block */
435 *max = 1 << (e4b->bd_blkbits + 3); 435 if (order == 0) {
436 if (order == 0) 436 *max = 1 << (e4b->bd_blkbits + 3);
437 return EXT4_MB_BITMAP(e4b); 437 return EXT4_MB_BITMAP(e4b);
438 }
438 439
439 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 440 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
440 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 441 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
@@ -616,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
616 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 617 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
617 618
618 grp = ext4_get_group_info(sb, e4b->bd_group); 619 grp = ext4_get_group_info(sb, e4b->bd_group);
619 buddy = mb_find_buddy(e4b, 0, &max);
620 list_for_each(cur, &grp->bb_prealloc_list) { 620 list_for_each(cur, &grp->bb_prealloc_list) {
621 ext4_group_t groupnr; 621 ext4_group_t groupnr;
622 struct ext4_prealloc_space *pa; 622 struct ext4_prealloc_space *pa;
@@ -635,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
635#define mb_check_buddy(e4b) 635#define mb_check_buddy(e4b)
636#endif 636#endif
637 637
638/* FIXME!! need more doc */ 638/*
639 * Divide the blocks starting at @first, of length @len, into
640 * smaller chunks with power-of-2 block counts.
641 * Clear the bits in the bitmap covered by the blocks of each chunk,
642 * then increase bb_counters[] for the corresponding chunk size.
643 */
639static void ext4_mb_mark_free_simple(struct super_block *sb, 644static void ext4_mb_mark_free_simple(struct super_block *sb,
640 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 645 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
641 struct ext4_group_info *grp) 646 struct ext4_group_info *grp)
@@ -2381,7 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2381 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2386 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2382 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2387 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2383 * So a two level scheme suffices for now. */ 2388 * So a two level scheme suffices for now. */
2384 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); 2389 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
2385 if (sbi->s_group_info == NULL) { 2390 if (sbi->s_group_info == NULL) {
2386 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2391 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2387 return -ENOMEM; 2392 return -ENOMEM;
@@ -3208,7 +3213,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3208 cur_distance = abs(goal_block - cpa->pa_pstart); 3213 cur_distance = abs(goal_block - cpa->pa_pstart);
3209 new_distance = abs(goal_block - pa->pa_pstart); 3214 new_distance = abs(goal_block - pa->pa_pstart);
3210 3215
3211 if (cur_distance < new_distance) 3216 if (cur_distance <= new_distance)
3212 return cpa; 3217 return cpa;
3213 3218
3214 /* drop the previous reference */ 3219 /* drop the previous reference */
@@ -3907,7 +3912,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3907 struct super_block *sb = ac->ac_sb; 3912 struct super_block *sb = ac->ac_sb;
3908 ext4_group_t ngroups, i; 3913 ext4_group_t ngroups, i;
3909 3914
3910 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) 3915 if (!mb_enable_debug ||
3916 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3911 return; 3917 return;
3912 3918
3913 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3919 printk(KERN_ERR "EXT4-fs: Can't allocate:"
@@ -4753,7 +4759,8 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4753 * bitmap. Then issue a TRIM command on this extent and free the extent in 4759 * bitmap. Then issue a TRIM command on this extent and free the extent in
4754 * the group buddy bitmap. This is done until whole group is scanned. 4760 * the group buddy bitmap. This is done until whole group is scanned.
4755 */ 4761 */
4756ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4762static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4757 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4758{ 4765{
4759 void *bitmap; 4766 void *bitmap;
@@ -4863,10 +4870,15 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4863 break; 4870 break;
4864 } 4871 }
4865 4872
4866 if (len >= EXT4_BLOCKS_PER_GROUP(sb)) 4873 /*
4867 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); 4874 * For all the groups except the last one, last block will
4868 else 4875 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
4876 * change it for the last group in which case start +
4877 * len < EXT4_BLOCKS_PER_GROUP(sb).
4878 */
4879 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
4869 last_block = first_block + len; 4880 last_block = first_block + len;
4881 len -= last_block - first_block;
4870 4882
4871 if (e4b.bd_info->bb_free >= minlen) { 4883 if (e4b.bd_info->bb_free >= minlen) {
4872 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4884 cnt = ext4_trim_all_free(sb, &e4b, first_block,
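The new comment over ext4_mb_mark_free_simple() describes a classic buddy decomposition: split the range [first, first + len) into naturally aligned power-of-two chunks. The arithmetic alone, as a stand-alone toy — the kernel version additionally clears the bitmap bits each chunk covers and bumps bb_counters[] for the chunk's order:

    #include <stdio.h>

    static void mark_free_simple(unsigned long first, unsigned long len)
    {
            while (len) {
                    unsigned long chunk = 1;

                    /* grow while aligned to 'first' and still fitting */
                    while (!(first & chunk) && (chunk << 1) <= len)
                            chunk <<= 1;

                    printf("chunk at %lu, %lu block(s)\n", first, chunk);
                    first += chunk;
                    len   -= chunk;
            }
    }

    int main(void)
    {
            mark_free_simple(5, 6);   /* -> 5(1), 6(2), 8(2), 10(1) */
            return 0;
    }

Each chunk is aligned to its own size, which is exactly the invariant a buddy bitmap at that order requires.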
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b619322c76f0..22bd4d7f289b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -169,7 +169,7 @@ struct ext4_allocation_context {
169 /* original request */ 169 /* original request */
170 struct ext4_free_extent ac_o_ex; 170 struct ext4_free_extent ac_o_ex;
171 171
172 /* goal request (after normalization) */ 172 /* goal request (normalized ac_o_ex) */
173 struct ext4_free_extent ac_g_ex; 173 struct ext4_free_extent ac_g_ex;
174 174
175 /* the best found extent */ 175 /* the best found extent */
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b0a126f23c20..d1bafa57f483 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -263,7 +263,7 @@ static int free_dind_blocks(handle_t *handle,
263 for (i = 0; i < max_entries; i++) { 263 for (i = 0; i < max_entries; i++) {
264 if (tmp_idata[i]) { 264 if (tmp_idata[i]) {
265 extend_credit_for_blkdel(handle, inode); 265 extend_credit_for_blkdel(handle, inode);
266 ext4_free_blocks(handle, inode, 0, 266 ext4_free_blocks(handle, inode, NULL,
267 le32_to_cpu(tmp_idata[i]), 1, 267 le32_to_cpu(tmp_idata[i]), 1,
268 EXT4_FREE_BLOCKS_METADATA | 268 EXT4_FREE_BLOCKS_METADATA |
269 EXT4_FREE_BLOCKS_FORGET); 269 EXT4_FREE_BLOCKS_FORGET);
@@ -271,7 +271,7 @@ static int free_dind_blocks(handle_t *handle,
271 } 271 }
272 put_bh(bh); 272 put_bh(bh);
273 extend_credit_for_blkdel(handle, inode); 273 extend_credit_for_blkdel(handle, inode);
274 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 274 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
275 EXT4_FREE_BLOCKS_METADATA | 275 EXT4_FREE_BLOCKS_METADATA |
276 EXT4_FREE_BLOCKS_FORGET); 276 EXT4_FREE_BLOCKS_FORGET);
277 return 0; 277 return 0;
@@ -302,7 +302,7 @@ static int free_tind_blocks(handle_t *handle,
302 } 302 }
303 put_bh(bh); 303 put_bh(bh);
304 extend_credit_for_blkdel(handle, inode); 304 extend_credit_for_blkdel(handle, inode);
305 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 305 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
306 EXT4_FREE_BLOCKS_METADATA | 306 EXT4_FREE_BLOCKS_METADATA |
307 EXT4_FREE_BLOCKS_FORGET); 307 EXT4_FREE_BLOCKS_FORGET);
308 return 0; 308 return 0;
@@ -315,7 +315,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
315 /* ei->i_data[EXT4_IND_BLOCK] */ 315 /* ei->i_data[EXT4_IND_BLOCK] */
316 if (i_data[0]) { 316 if (i_data[0]) {
317 extend_credit_for_blkdel(handle, inode); 317 extend_credit_for_blkdel(handle, inode);
318 ext4_free_blocks(handle, inode, 0, 318 ext4_free_blocks(handle, inode, NULL,
319 le32_to_cpu(i_data[0]), 1, 319 le32_to_cpu(i_data[0]), 1,
320 EXT4_FREE_BLOCKS_METADATA | 320 EXT4_FREE_BLOCKS_METADATA |
321 EXT4_FREE_BLOCKS_FORGET); 321 EXT4_FREE_BLOCKS_FORGET);
@@ -428,7 +428,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
428 } 428 }
429 put_bh(bh); 429 put_bh(bh);
430 extend_credit_for_blkdel(handle, inode); 430 extend_credit_for_blkdel(handle, inode);
431 ext4_free_blocks(handle, inode, 0, block, 1, 431 ext4_free_blocks(handle, inode, NULL, block, 1,
432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
433 return retval; 433 return retval;
434} 434}
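The migrate.c hunks are mechanical: the third argument of ext4_free_blocks() is a buffer_head pointer, and passing a literal 0 there is legal C but draws a "Using plain integer as NULL pointer" warning from sparse. A toy illustration (free_blocks() is a stand-in, not the ext4 symbol):

    #include <stddef.h>

    static void free_blocks(void *bh, unsigned long block)
    {
            (void)bh;               /* the buffer_head slot in question */
            (void)block;
    }

    int main(void)
    {
            free_blocks(0, 42);     /* legal C, but sparse complains */
            free_blocks(NULL, 42);  /* same call, intent is explicit */
            return 0;
    }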
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e781b7ea5630..67fd0b025858 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -40,6 +40,7 @@
40#include "xattr.h" 40#include "xattr.h"
41#include "acl.h" 41#include "acl.h"
42 42
43#include <trace/events/ext4.h>
43/* 44/*
44 * define how far ahead to read directories while searching them. 45 * define how far ahead to read directories while searching them.
45 */ 46 */
@@ -2183,6 +2184,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2183 struct ext4_dir_entry_2 *de; 2184 struct ext4_dir_entry_2 *de;
2184 handle_t *handle; 2185 handle_t *handle;
2185 2186
2187 trace_ext4_unlink_enter(dir, dentry);
2186 /* Initialize quotas before so that eventual writes go 2188 /* Initialize quotas before so that eventual writes go
2187 * in separate transaction */ 2189 * in separate transaction */
2188 dquot_initialize(dir); 2190 dquot_initialize(dir);
@@ -2228,6 +2230,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2228end_unlink: 2230end_unlink:
2229 ext4_journal_stop(handle); 2231 ext4_journal_stop(handle);
2230 brelse(bh); 2232 brelse(bh);
2233 trace_ext4_unlink_exit(dentry, retval);
2231 return retval; 2234 return retval;
2232} 2235}
2233 2236
@@ -2402,6 +2405,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2402 if (!new_inode && new_dir != old_dir && 2405 if (!new_inode && new_dir != old_dir &&
2403 EXT4_DIR_LINK_MAX(new_dir)) 2406 EXT4_DIR_LINK_MAX(new_dir))
2404 goto end_rename; 2407 goto end_rename;
2408 BUFFER_TRACE(dir_bh, "get_write_access");
2409 retval = ext4_journal_get_write_access(handle, dir_bh);
2410 if (retval)
2411 goto end_rename;
2405 } 2412 }
2406 if (!new_bh) { 2413 if (!new_bh) {
2407 retval = ext4_add_entry(handle, new_dentry, old_inode); 2414 retval = ext4_add_entry(handle, new_dentry, old_inode);
@@ -2409,7 +2416,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2409 goto end_rename; 2416 goto end_rename;
2410 } else { 2417 } else {
2411 BUFFER_TRACE(new_bh, "get write access"); 2418 BUFFER_TRACE(new_bh, "get write access");
2412 ext4_journal_get_write_access(handle, new_bh); 2419 retval = ext4_journal_get_write_access(handle, new_bh);
2420 if (retval)
2421 goto end_rename;
2413 new_de->inode = cpu_to_le32(old_inode->i_ino); 2422 new_de->inode = cpu_to_le32(old_inode->i_ino);
2414 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2423 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2415 EXT4_FEATURE_INCOMPAT_FILETYPE)) 2424 EXT4_FEATURE_INCOMPAT_FILETYPE))
@@ -2470,8 +2479,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2470 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 2479 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2471 ext4_update_dx_flag(old_dir); 2480 ext4_update_dx_flag(old_dir);
2472 if (dir_bh) { 2481 if (dir_bh) {
2473 BUFFER_TRACE(dir_bh, "get_write_access");
2474 ext4_journal_get_write_access(handle, dir_bh);
2475 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2476 cpu_to_le32(new_dir->i_ino); 2483 cpu_to_le32(new_dir->i_ino);
2477 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
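The namei.c fix restores the jbd2 contract: write access to a buffer must be obtained — and its result checked — before the buffer is modified. In the old ext4_rename(), the return of ext4_journal_get_write_access() was ignored in two places; the fix checks it and bails out to end_rename before any buffer is touched. The pattern in miniature, as a compilable sketch with invented helpers:

    #include <stdio.h>

    static int journal_get_write_access(const char *what)
    {
            printf("journal: write access to %s\n", what);
            return 0;               /* nonzero would mean failure */
    }

    static int update_parent_pointer(void)
    {
            int retval;

            /* 1. reserve the right to modify the block -- and check it */
            retval = journal_get_write_access("dir_bh");
            if (retval)
                    return retval;  /* previously this error was dropped */

            /* 2. only now touch the buffer contents (PARENT_INO here) */
            /* 3. then hand it back via the dirty-metadata call */
            return 0;
    }

    int main(void)
    {
            return update_parent_pointer();
    }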
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index e2cd90e4bb7c..b6dbd056fcb1 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -259,6 +259,11 @@ static void ext4_end_bio(struct bio *bio, int error)
259 bi_sector >> (inode->i_blkbits - 9)); 259 bi_sector >> (inode->i_blkbits - 9));
260 } 260 }
261 261
262 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
263 ext4_free_io_end(io_end);
264 return;
265 }
266
262 /* Add the io_end to per-inode completed io list*/ 267 /* Add the io_end to per-inode completed io list*/
263 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 268 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
264 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 269 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
@@ -279,9 +284,9 @@ void ext4_io_submit(struct ext4_io_submit *io)
279 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); 284 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
280 bio_put(io->io_bio); 285 bio_put(io->io_bio);
281 } 286 }
282 io->io_bio = 0; 287 io->io_bio = NULL;
283 io->io_op = 0; 288 io->io_op = 0;
284 io->io_end = 0; 289 io->io_end = NULL;
285} 290}
286 291
287static int io_submit_init(struct ext4_io_submit *io, 292static int io_submit_init(struct ext4_io_submit *io,
@@ -380,8 +385,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
380 385
381 BUG_ON(!PageLocked(page)); 386 BUG_ON(!PageLocked(page));
382 BUG_ON(PageWriteback(page)); 387 BUG_ON(PageWriteback(page));
383 set_page_writeback(page);
384 ClearPageError(page);
385 388
386 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); 389 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
387 if (!io_page) { 390 if (!io_page) {
@@ -392,6 +395,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
392 io_page->p_page = page; 395 io_page->p_page = page;
393 atomic_set(&io_page->p_count, 1); 396 atomic_set(&io_page->p_count, 1);
394 get_page(page); 397 get_page(page);
398 set_page_writeback(page);
399 ClearPageError(page);
395 400
396 for (bh = head = page_buffers(page), block_start = 0; 401 for (bh = head = page_buffers(page), block_start = 0;
397 bh != head || !block_start; 402 bh != head || !block_start;
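The ext4_bio_write_page() change is an ordering fix: set_page_writeback() and ClearPageError() move below the io_page allocation, so a failed kmem_cache_alloc() no longer leaves the page flagged as under writeback with nobody left to end it. The rule — publish state only after every fallible allocation has succeeded — in a stand-alone sketch:

    #include <stdlib.h>

    struct page_state {
            int writeback;          /* PageWriteback(), roughly */
    };

    static int write_page(struct page_state *pg)
    {
            void *io_page = malloc(64);     /* io_page_cachep stand-in */

            if (!io_page)
                    return -1;      /* page state still untouched */

            pg->writeback = 1;      /* set_page_writeback(), moved here */
            /* ... attach buffers, build and submit the bio ... */
            free(io_page);
            return 0;
    }

    int main(void)
    {
            struct page_state pg = { 0 };

            return write_page(&pg) ? 1 : 0;
    }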
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3ecc6e45d2f9..80bbc9c60c24 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -230,7 +230,7 @@ static int setup_new_group_blocks(struct super_block *sb,
230 } 230 }
231 231
232 /* Zero out all of the reserved backup group descriptor table blocks */ 232 /* Zero out all of the reserved backup group descriptor table blocks */
233 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", 233 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
234 block, sbi->s_itb_per_group); 234 block, sbi->s_itb_per_group);
235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, 235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
236 GFP_NOFS); 236 GFP_NOFS);
@@ -248,7 +248,7 @@ static int setup_new_group_blocks(struct super_block *sb,
248 248
249 /* Zero out all of the inode table blocks */ 249 /* Zero out all of the inode table blocks */
250 block = input->inode_table; 250 block = input->inode_table;
251 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", 251 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
252 block, sbi->s_itb_per_group); 252 block, sbi->s_itb_per_group);
253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
254 if (err) 254 if (err)
@@ -499,12 +499,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
499 return err; 499 return err;
500 500
501exit_inode: 501exit_inode:
502 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_handle_release_buffer(handle, iloc.bh); */
503 brelse(iloc.bh); 503 brelse(iloc.bh);
504exit_dindj: 504exit_dindj:
505 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_handle_release_buffer(handle, dind); */
506exit_sbh: 506exit_sbh:
507 /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ 507 /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
508exit_dind: 508exit_dind:
509 brelse(dind); 509 brelse(dind);
510exit_bh: 510exit_bh:
@@ -586,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
586 /* 586 /*
587 int j; 587 int j;
588 for (j = 0; j < i; j++) 588 for (j = 0; j < i; j++)
589 ext4_journal_release_buffer(handle, primary[j]); 589 ext4_handle_release_buffer(handle, primary[j]);
590 */ 590 */
591 goto exit_bh; 591 goto exit_bh;
592 } 592 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 203f9e4a70be..22546ad7f0ae 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -54,9 +54,9 @@
54 54
55static struct proc_dir_entry *ext4_proc_root; 55static struct proc_dir_entry *ext4_proc_root;
56static struct kset *ext4_kset; 56static struct kset *ext4_kset;
57struct ext4_lazy_init *ext4_li_info; 57static struct ext4_lazy_init *ext4_li_info;
58struct mutex ext4_li_mtx; 58static struct mutex ext4_li_mtx;
59struct ext4_features *ext4_feat; 59static struct ext4_features *ext4_feat;
60 60
61static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 61static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
62 unsigned long journal_devnum); 62 unsigned long journal_devnum);
@@ -75,6 +75,7 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 77 const char *dev_name, void *data);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly);
78static void ext4_destroy_lazyinit_thread(void); 79static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb); 80static void ext4_unregister_li_request(struct super_block *sb);
80static void ext4_clear_request_list(void); 81static void ext4_clear_request_list(void);
@@ -594,7 +595,7 @@ __acquires(bitlock)
594 595
595 vaf.fmt = fmt; 596 vaf.fmt = fmt;
596 vaf.va = &args; 597 vaf.va = &args;
597 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 598 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
598 sb->s_id, function, line, grp); 599 sb->s_id, function, line, grp);
599 if (ino) 600 if (ino)
600 printk(KERN_CONT "inode %lu: ", ino); 601 printk(KERN_CONT "inode %lu: ", ino);
@@ -997,13 +998,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
997 if (test_opt(sb, OLDALLOC)) 998 if (test_opt(sb, OLDALLOC))
998 seq_puts(seq, ",oldalloc"); 999 seq_puts(seq, ",oldalloc");
999#ifdef CONFIG_EXT4_FS_XATTR 1000#ifdef CONFIG_EXT4_FS_XATTR
1000 if (test_opt(sb, XATTR_USER) && 1001 if (test_opt(sb, XATTR_USER))
1001 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
1002 seq_puts(seq, ",user_xattr"); 1002 seq_puts(seq, ",user_xattr");
1003 if (!test_opt(sb, XATTR_USER) && 1003 if (!test_opt(sb, XATTR_USER))
1004 (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
1005 seq_puts(seq, ",nouser_xattr"); 1004 seq_puts(seq, ",nouser_xattr");
1006 }
1007#endif 1005#endif
1008#ifdef CONFIG_EXT4_FS_POSIX_ACL 1006#ifdef CONFIG_EXT4_FS_POSIX_ACL
1009 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) 1007 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
@@ -1041,8 +1039,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1041 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1039 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1042 seq_puts(seq, ",nodelalloc"); 1040 seq_puts(seq, ",nodelalloc");
1043 1041
1044 if (test_opt(sb, MBLK_IO_SUBMIT)) 1042 if (!test_opt(sb, MBLK_IO_SUBMIT))
1045 seq_puts(seq, ",mblk_io_submit"); 1043 seq_puts(seq, ",nomblk_io_submit");
1046 if (sbi->s_stripe) 1044 if (sbi->s_stripe)
1047 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1045 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1048 /* 1046 /*
@@ -1451,7 +1449,7 @@ static int parse_options(char *options, struct super_block *sb,
1451 * Initialize args struct so we know whether arg was 1449 * Initialize args struct so we know whether arg was
1452 * found; some options take optional arguments. 1450 * found; some options take optional arguments.
1453 */ 1451 */
1454 args[0].to = args[0].from = 0; 1452 args[0].to = args[0].from = NULL;
1455 token = match_token(p, tokens, args); 1453 token = match_token(p, tokens, args);
1456 switch (token) { 1454 switch (token) {
1457 case Opt_bsd_df: 1455 case Opt_bsd_df:
@@ -1771,7 +1769,7 @@ set_qf_format:
1771 return 0; 1769 return 0;
1772 if (option < 0 || option > (1 << 30)) 1770 if (option < 0 || option > (1 << 30))
1773 return 0; 1771 return 0;
1774 if (!is_power_of_2(option)) { 1772 if (option && !is_power_of_2(option)) {
1775 ext4_msg(sb, KERN_ERR, 1773 ext4_msg(sb, KERN_ERR,
1776 "EXT4-fs: inode_readahead_blks" 1774 "EXT4-fs: inode_readahead_blks"
1777 " must be a power of 2"); 1775 " must be a power of 2");
@@ -2120,6 +2118,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2120 return; 2118 return;
2121 } 2119 }
2122 2120
2121 /* Check if feature set would not allow a r/w mount */
2122 if (!ext4_feature_set_ok(sb, 0)) {
2123 ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2124 "unknown ROCOMPAT features");
2125 return;
2126 }
2127
2123 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2128 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2124 if (es->s_last_orphan) 2129 if (es->s_last_orphan)
2125 jbd_debug(1, "Errors on filesystem, " 2130 jbd_debug(1, "Errors on filesystem, "
@@ -2412,7 +2417,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2412 if (parse_strtoul(buf, 0x40000000, &t)) 2417 if (parse_strtoul(buf, 0x40000000, &t))
2413 return -EINVAL; 2418 return -EINVAL;
2414 2419
2415 if (!is_power_of_2(t)) 2420 if (t && !is_power_of_2(t))
2416 return -EINVAL; 2421 return -EINVAL;
2417 2422
2418 sbi->s_inode_readahead_blks = t; 2423 sbi->s_inode_readahead_blks = t;
@@ -3095,14 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3095 } 3100 }
3096 if (def_mount_opts & EXT4_DEFM_UID16) 3101 if (def_mount_opts & EXT4_DEFM_UID16)
3097 set_opt(sb, NO_UID32); 3102 set_opt(sb, NO_UID32);
3103 /* xattr user namespace & acls are now defaulted on */
3098#ifdef CONFIG_EXT4_FS_XATTR 3104#ifdef CONFIG_EXT4_FS_XATTR
3099 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3105 set_opt(sb, XATTR_USER);
3100 set_opt(sb, XATTR_USER);
3101#endif 3106#endif
3102#ifdef CONFIG_EXT4_FS_POSIX_ACL 3107#ifdef CONFIG_EXT4_FS_POSIX_ACL
3103 if (def_mount_opts & EXT4_DEFM_ACL) 3108 set_opt(sb, POSIX_ACL);
3104 set_opt(sb, POSIX_ACL);
3105#endif 3109#endif
3110 set_opt(sb, MBLK_IO_SUBMIT);
3106 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3111 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3107 set_opt(sb, JOURNAL_DATA); 3112 set_opt(sb, JOURNAL_DATA);
3108 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3113 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3516,7 +3521,7 @@ no_journal:
3516 * concurrency isn't really necessary. Limit it to 1. 3521 * concurrency isn't really necessary. Limit it to 1.
3517 */ 3522 */
3518 EXT4_SB(sb)->dio_unwritten_wq = 3523 EXT4_SB(sb)->dio_unwritten_wq =
3519 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1); 3524 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3520 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3525 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3521 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3526 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3522 goto failed_mount_wq; 3527 goto failed_mount_wq;
@@ -3531,17 +3536,16 @@ no_journal:
3531 if (IS_ERR(root)) { 3536 if (IS_ERR(root)) {
3532 ext4_msg(sb, KERN_ERR, "get root inode failed"); 3537 ext4_msg(sb, KERN_ERR, "get root inode failed");
3533 ret = PTR_ERR(root); 3538 ret = PTR_ERR(root);
3539 root = NULL;
3534 goto failed_mount4; 3540 goto failed_mount4;
3535 } 3541 }
3536 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 3542 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
3537 iput(root);
3538 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 3543 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
3539 goto failed_mount4; 3544 goto failed_mount4;
3540 } 3545 }
3541 sb->s_root = d_alloc_root(root); 3546 sb->s_root = d_alloc_root(root);
3542 if (!sb->s_root) { 3547 if (!sb->s_root) {
3543 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 3548 ext4_msg(sb, KERN_ERR, "get root dentry failed");
3544 iput(root);
3545 ret = -ENOMEM; 3549 ret = -ENOMEM;
3546 goto failed_mount4; 3550 goto failed_mount4;
3547 } 3551 }
@@ -3657,6 +3661,8 @@ cantfind_ext4:
3657 goto failed_mount; 3661 goto failed_mount;
3658 3662
3659failed_mount4: 3663failed_mount4:
3664 iput(root);
3665 sb->s_root = NULL;
3660 ext4_msg(sb, KERN_ERR, "mount failed"); 3666 ext4_msg(sb, KERN_ERR, "mount failed");
3661 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3667 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3662failed_mount_wq: 3668failed_mount_wq:
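Among the super.c changes, the inode_readahead_blks validation now rejects a value only when it is nonzero and not a power of two, so 0 becomes acceptable (presumably to switch inode readahead off; the old check refused 0 because is_power_of_2(0) is false). The check in isolation, using the kernel's definition of is_power_of_2():

    #include <stdbool.h>
    #include <stdio.h>

    static bool is_power_of_2(unsigned long n)
    {
            return n != 0 && (n & (n - 1)) == 0;
    }

    static bool valid_readahead(unsigned long t)
    {
            return t == 0 || is_power_of_2(t);  /* the new acceptance rule */
    }

    int main(void)
    {
            unsigned long tests[] = { 0, 1, 3, 32 };
            int i;

            for (i = 0; i < 4; i++)
                    printf("%lu -> %s\n", tests[i],
                           valid_readahead(tests[i]) ? "ok" : "EINVAL");
            return 0;
    }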
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fc32176eee39..b545ca1c459c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -735,7 +735,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
735 int offset = (char *)s->here - bs->bh->b_data; 735 int offset = (char *)s->here - bs->bh->b_data;
736 736
737 unlock_buffer(bs->bh); 737 unlock_buffer(bs->bh);
738 jbd2_journal_release_buffer(handle, bs->bh); 738 ext4_handle_release_buffer(handle, bs->bh);
739 if (ce) { 739 if (ce) {
740 mb_cache_entry_release(ce); 740 mb_cache_entry_release(ce);
741 ce = NULL; 741 ce = NULL;
@@ -833,7 +833,7 @@ inserted:
833 new_bh = sb_getblk(sb, block); 833 new_bh = sb_getblk(sb, block);
834 if (!new_bh) { 834 if (!new_bh) {
835getblk_failed: 835getblk_failed:
836 ext4_free_blocks(handle, inode, 0, block, 1, 836 ext4_free_blocks(handle, inode, NULL, block, 1,
837 EXT4_FREE_BLOCKS_METADATA); 837 EXT4_FREE_BLOCKS_METADATA);
838 error = -EIO; 838 error = -EIO;
839 goto cleanup; 839 goto cleanup;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c6e4956786..b5ed541fb137 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -176,6 +176,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
176} 176}
177 177
178/* 178/*
179 * Remove the inode from the writeback list it is on.
180 */
181void inode_wb_list_del(struct inode *inode)
182{
183 spin_lock(&inode_wb_list_lock);
184 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock);
186}
187
188
189/*
179 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
180 * furthest end of its superblock's dirty-inode list. 191 * furthest end of its superblock's dirty-inode list.
181 * 192 *
@@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
188{ 199{
189 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
190 201
202 assert_spin_locked(&inode_wb_list_lock);
191 if (!list_empty(&wb->b_dirty)) { 203 if (!list_empty(&wb->b_dirty)) {
192 struct inode *tail; 204 struct inode *tail;
193 205
@@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
205{ 217{
206 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
207 219
220 assert_spin_locked(&inode_wb_list_lock);
208 list_move(&inode->i_wb_list, &wb->b_more_io); 221 list_move(&inode->i_wb_list, &wb->b_more_io);
209} 222}
210 223
211static void inode_sync_complete(struct inode *inode) 224static void inode_sync_complete(struct inode *inode)
212{ 225{
213 /* 226 /*
214 * Prevent speculative execution through spin_unlock(&inode_lock); 227 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock);
215 */ 229 */
230
216 smp_mb(); 231 smp_mb();
217 wake_up_bit(&inode->i_state, __I_SYNC); 232 wake_up_bit(&inode->i_state, __I_SYNC);
218} 233}
@@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
286 */ 301 */
287static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 302static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
288{ 303{
304 assert_spin_locked(&inode_wb_list_lock);
289 list_splice_init(&wb->b_more_io, &wb->b_io); 305 list_splice_init(&wb->b_more_io, &wb->b_io);
290 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
291} 307}
@@ -306,25 +322,25 @@ static void inode_wait_for_writeback(struct inode *inode)
306 wait_queue_head_t *wqh; 322 wait_queue_head_t *wqh;
307 323
308 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
309 while (inode->i_state & I_SYNC) { 325 while (inode->i_state & I_SYNC) {
310 spin_unlock(&inode_lock); 326 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock);
311 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
312 spin_lock(&inode_lock); 329 spin_lock(&inode_wb_list_lock);
330 spin_lock(&inode->i_lock);
313 } 331 }
314} 332}
315 333
316/* 334/*
317 * Write out an inode's dirty pages. Called under inode_lock. Either the 335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and
318 * caller has ref on the inode (either via __iget or via syscall against an fd) 336 * inode->i_lock. Either the caller has an active reference on the inode or
319 * or the inode has I_WILL_FREE set (via generic_forget_inode) 337 * the inode has I_WILL_FREE set.
320 * 338 *
321 * If `wait' is set, wait on the writeout. 339 * If `wait' is set, wait on the writeout.
322 * 340 *
323 * The whole writeout design is quite complex and fragile. We want to avoid 341 * The whole writeout design is quite complex and fragile. We want to avoid
324 * starvation of particular inodes when others are being redirtied, prevent 342 * starvation of particular inodes when others are being redirtied, prevent
325 * livelocks, etc. 343 * livelocks, etc.
326 *
327 * Called under inode_lock.
328 */ 344 */
329static int 345static int
330writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 346writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -333,6 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
333 unsigned dirty; 349 unsigned dirty;
334 int ret; 350 int ret;
335 351
352 assert_spin_locked(&inode_wb_list_lock);
353 assert_spin_locked(&inode->i_lock);
354
336 if (!atomic_read(&inode->i_count)) 355 if (!atomic_read(&inode->i_count))
337 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 356 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
338 else 357 else
@@ -363,7 +382,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
363 /* Set I_SYNC, reset I_DIRTY_PAGES */ 382 /* Set I_SYNC, reset I_DIRTY_PAGES */
364 inode->i_state |= I_SYNC; 383 inode->i_state |= I_SYNC;
365 inode->i_state &= ~I_DIRTY_PAGES; 384 inode->i_state &= ~I_DIRTY_PAGES;
366 spin_unlock(&inode_lock); 385 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock);
367 387
368 ret = do_writepages(mapping, wbc); 388 ret = do_writepages(mapping, wbc);
369 389
@@ -383,10 +403,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
383 * due to delalloc, clear dirty metadata flags right before 403 * due to delalloc, clear dirty metadata flags right before
384 * write_inode() 404 * write_inode()
385 */ 405 */
386 spin_lock(&inode_lock); 406 spin_lock(&inode->i_lock);
387 dirty = inode->i_state & I_DIRTY; 407 dirty = inode->i_state & I_DIRTY;
388 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 408 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
389 spin_unlock(&inode_lock); 409 spin_unlock(&inode->i_lock);
390 /* Don't write the inode if only I_DIRTY_PAGES was set */ 410 /* Don't write the inode if only I_DIRTY_PAGES was set */
391 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 411 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
392 int err = write_inode(inode, wbc); 412 int err = write_inode(inode, wbc);
@@ -394,7 +414,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
394 ret = err; 414 ret = err;
395 } 415 }
396 416
397 spin_lock(&inode_lock); 417 spin_lock(&inode_wb_list_lock);
418 spin_lock(&inode->i_lock);
398 inode->i_state &= ~I_SYNC; 419 inode->i_state &= ~I_SYNC;
399 if (!(inode->i_state & I_FREEING)) { 420 if (!(inode->i_state & I_FREEING)) {
400 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
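These fs-writeback.c hunks are part of the inode_lock breakup: list membership is now guarded by the global inode_wb_list_lock while i_state is guarded by the per-inode i_lock, taken in that order, and both are dropped across the actual do_writepages() call. The locking choreography of writeback_single_inode(), mimicked with pthread mutexes in user space (types and names stubbed out; build with -pthread):

    #include <pthread.h>

    static pthread_mutex_t wb_list_lock = PTHREAD_MUTEX_INITIALIZER;

    struct inode {
            pthread_mutex_t i_lock;
            int i_state;
    };

    static void do_writepages(struct inode *inode)
    {
            (void)inode;            /* the actual page I/O happens here */
    }

    /* Caller holds both locks, mirroring the new assert_spin_locked()s. */
    static void writeback_single_inode(struct inode *inode)
    {
            inode->i_state |= 1;    /* I_SYNC, roughly */

            pthread_mutex_unlock(&inode->i_lock);
            pthread_mutex_unlock(&wb_list_lock);

            do_writepages(inode);   /* no global lock held during I/O */

            pthread_mutex_lock(&wb_list_lock);  /* reacquire: list lock */
            pthread_mutex_lock(&inode->i_lock); /* ... then i_lock */
            inode->i_state &= ~1;
    }

    int main(void)
    {
            struct inode in = { PTHREAD_MUTEX_INITIALIZER, 0 };

            pthread_mutex_lock(&wb_list_lock);
            pthread_mutex_lock(&in.i_lock);
            writeback_single_inode(&in);
            pthread_mutex_unlock(&in.i_lock);
            pthread_mutex_unlock(&wb_list_lock);
            return 0;
    }

Splitting one global lock into a list lock plus per-object locks is what lets unrelated inodes dirty and sync concurrently.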
@@ -506,7 +527,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
506 * kind does not need periodic writeout yet, and for the latter 527 * kind does not need periodic writeout yet, and for the latter
507 * kind writeout is handled by the freer. 528 * kind writeout is handled by the freer.
508 */ 529 */
530 spin_lock(&inode->i_lock);
509 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 531 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
532 spin_unlock(&inode->i_lock);
510 requeue_io(inode); 533 requeue_io(inode);
511 continue; 534 continue;
512 } 535 }
@@ -515,10 +538,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
515 * Was this inode dirtied after sync_sb_inodes was called? 538 * Was this inode dirtied after sync_sb_inodes was called?
516 * This keeps sync from extra jobs and livelock. 539 * This keeps sync from extra jobs and livelock.
517 */ 540 */
518 if (inode_dirtied_after(inode, wbc->wb_start)) 541 if (inode_dirtied_after(inode, wbc->wb_start)) {
542 spin_unlock(&inode->i_lock);
519 return 1; 543 return 1;
544 }
520 545
521 __iget(inode); 546 __iget(inode);
547
522 pages_skipped = wbc->pages_skipped; 548 pages_skipped = wbc->pages_skipped;
523 writeback_single_inode(inode, wbc); 549 writeback_single_inode(inode, wbc);
524 if (wbc->pages_skipped != pages_skipped) { 550 if (wbc->pages_skipped != pages_skipped) {
@@ -528,10 +554,11 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
528 */ 554 */
529 redirty_tail(inode); 555 redirty_tail(inode);
530 } 556 }
531 spin_unlock(&inode_lock); 557 spin_unlock(&inode->i_lock);
558 spin_unlock(&inode_wb_list_lock);
532 iput(inode); 559 iput(inode);
533 cond_resched(); 560 cond_resched();
534 spin_lock(&inode_lock); 561 spin_lock(&inode_wb_list_lock);
535 if (wbc->nr_to_write <= 0) { 562 if (wbc->nr_to_write <= 0) {
536 wbc->more_io = 1; 563 wbc->more_io = 1;
537 return 1; 564 return 1;
@@ -550,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
550 577
551 if (!wbc->wb_start) 578 if (!wbc->wb_start)
552 wbc->wb_start = jiffies; /* livelock avoidance */ 579 wbc->wb_start = jiffies; /* livelock avoidance */
553 spin_lock(&inode_lock); 580 spin_lock(&inode_wb_list_lock);
554 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 581 if (!wbc->for_kupdate || list_empty(&wb->b_io))
555 queue_io(wb, wbc->older_than_this); 582 queue_io(wb, wbc->older_than_this);
556 583
@@ -568,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
568 if (ret) 595 if (ret)
569 break; 596 break;
570 } 597 }
571 spin_unlock(&inode_lock); 598 spin_unlock(&inode_wb_list_lock);
572 /* Leave any unwritten inodes on b_io */ 599 /* Leave any unwritten inodes on b_io */
573} 600}
574 601
@@ -577,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
577{ 604{
578 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 605 WARN_ON(!rwsem_is_locked(&sb->s_umount));
579 606
580 spin_lock(&inode_lock); 607 spin_lock(&inode_wb_list_lock);
581 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 608 if (!wbc->for_kupdate || list_empty(&wb->b_io))
582 queue_io(wb, wbc->older_than_this); 609 queue_io(wb, wbc->older_than_this);
583 writeback_sb_inodes(sb, wb, wbc, true); 610 writeback_sb_inodes(sb, wb, wbc, true);
584 spin_unlock(&inode_lock); 611 spin_unlock(&inode_wb_list_lock);
585} 612}
586 613
587/* 614/*
@@ -720,13 +747,15 @@ static long wb_writeback(struct bdi_writeback *wb,
720 * become available for writeback. Otherwise 747 * become available for writeback. Otherwise
721 * we'll just busyloop. 748 * we'll just busyloop.
722 */ 749 */
723 spin_lock(&inode_lock); 750 spin_lock(&inode_wb_list_lock);
724 if (!list_empty(&wb->b_more_io)) { 751 if (!list_empty(&wb->b_more_io)) {
725 inode = wb_inode(wb->b_more_io.prev); 752 inode = wb_inode(wb->b_more_io.prev);
726 trace_wbc_writeback_wait(&wbc, wb->bdi); 753 trace_wbc_writeback_wait(&wbc, wb->bdi);
754 spin_lock(&inode->i_lock);
727 inode_wait_for_writeback(inode); 755 inode_wait_for_writeback(inode);
756 spin_unlock(&inode->i_lock);
728 } 757 }
729 spin_unlock(&inode_lock); 758 spin_unlock(&inode_wb_list_lock);
730 } 759 }
731 760
732 return wrote; 761 return wrote;
@@ -992,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
992{ 1021{
993 struct super_block *sb = inode->i_sb; 1022 struct super_block *sb = inode->i_sb;
994 struct backing_dev_info *bdi = NULL; 1023 struct backing_dev_info *bdi = NULL;
995 bool wakeup_bdi = false;
996 1024
997 /* 1025 /*
998 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1026 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1016,7 +1044,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1016 if (unlikely(block_dump)) 1044 if (unlikely(block_dump))
1017 block_dump___mark_inode_dirty(inode); 1045 block_dump___mark_inode_dirty(inode);
1018 1046
1019 spin_lock(&inode_lock); 1047 spin_lock(&inode->i_lock);
1020 if ((inode->i_state & flags) != flags) { 1048 if ((inode->i_state & flags) != flags) {
1021 const int was_dirty = inode->i_state & I_DIRTY; 1049 const int was_dirty = inode->i_state & I_DIRTY;
1022 1050
@@ -1028,7 +1056,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1028 * superblock list, based upon its state. 1056 * superblock list, based upon its state.
1029 */ 1057 */
1030 if (inode->i_state & I_SYNC) 1058 if (inode->i_state & I_SYNC)
1031 goto out; 1059 goto out_unlock_inode;
1032 1060
1033 /* 1061 /*
1034 * Only add valid (hashed) inodes to the superblock's 1062 * Only add valid (hashed) inodes to the superblock's
@@ -1036,16 +1064,17 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1036 */ 1064 */
1037 if (!S_ISBLK(inode->i_mode)) { 1065 if (!S_ISBLK(inode->i_mode)) {
1038 if (inode_unhashed(inode)) 1066 if (inode_unhashed(inode))
1039 goto out; 1067 goto out_unlock_inode;
1040 } 1068 }
1041 if (inode->i_state & I_FREEING) 1069 if (inode->i_state & I_FREEING)
1042 goto out; 1070 goto out_unlock_inode;
1043 1071
1044 /* 1072 /*
1045 * If the inode was already on b_dirty/b_io/b_more_io, don't 1073 * If the inode was already on b_dirty/b_io/b_more_io, don't
1046 * reposition it (that would break b_dirty time-ordering). 1074 * reposition it (that would break b_dirty time-ordering).
1047 */ 1075 */
1048 if (!was_dirty) { 1076 if (!was_dirty) {
1077 bool wakeup_bdi = false;
1049 bdi = inode_to_bdi(inode); 1078 bdi = inode_to_bdi(inode);
1050 1079
1051 if (bdi_cap_writeback_dirty(bdi)) { 1080 if (bdi_cap_writeback_dirty(bdi)) {
@@ -1062,15 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1062 wakeup_bdi = true; 1091 wakeup_bdi = true;
1063 } 1092 }
1064 1093
1094 spin_unlock(&inode->i_lock);
1095 spin_lock(&inode_wb_list_lock);
1065 inode->dirtied_when = jiffies; 1096 inode->dirtied_when = jiffies;
1066 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1097 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1098 spin_unlock(&inode_wb_list_lock);
1099
1100 if (wakeup_bdi)
1101 bdi_wakeup_thread_delayed(bdi);
1102 return;
1067 } 1103 }
1068 } 1104 }
1069out: 1105out_unlock_inode:
1070 spin_unlock(&inode_lock); 1106 spin_unlock(&inode->i_lock);
1071 1107
1072 if (wakeup_bdi)
1073 bdi_wakeup_thread_delayed(bdi);
1074} 1108}
1075EXPORT_SYMBOL(__mark_inode_dirty); 1109EXPORT_SYMBOL(__mark_inode_dirty);
1076 1110
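Two shape changes are worth noting in this hunk: wakeup_bdi becomes local to the only branch that uses it, and because inode_wb_list_lock nests outside inode->i_lock in the documented ordering, the code drops the inode lock before taking the list lock instead of acquiring them in the forbidden order. A user-space sketch of that hand-over (names illustrative, not from this patch):

    #include <pthread.h>

    static pthread_mutex_t wb_list_lock = PTHREAD_MUTEX_INITIALIZER;

    struct obj {
        pthread_mutex_t lock;       /* nests inside wb_list_lock */
        long dirtied_when;
        int on_dirty_list;
    };

    /* Called with o->lock held; returns with no locks held. */
    static void move_to_dirty_list(struct obj *o, long now)
    {
        /* wb_list_lock is the outer lock, so it must not be taken
         * while o->lock is still held: drop, then acquire. */
        pthread_mutex_unlock(&o->lock);
        pthread_mutex_lock(&wb_list_lock);
        o->dirtied_when = now;
        o->on_dirty_list = 1;       /* list_move() analogue */
        pthread_mutex_unlock(&wb_list_lock);
    }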
@@ -1101,7 +1135,7 @@ static void wait_sb_inodes(struct super_block *sb)
1101 */ 1135 */
1102 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1136 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1103 1137
1104 spin_lock(&inode_lock); 1138 spin_lock(&inode_sb_list_lock);
1105 1139
1106 /* 1140 /*
1107 * Data integrity sync. Must wait for all pages under writeback, 1141 * Data integrity sync. Must wait for all pages under writeback,
@@ -1111,22 +1145,25 @@ static void wait_sb_inodes(struct super_block *sb)
1111 * we still have to wait for that writeout. 1145 * we still have to wait for that writeout.
1112 */ 1146 */
1113 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1147 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1114 struct address_space *mapping; 1148 struct address_space *mapping = inode->i_mapping;
1115 1149
1116 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 1150 spin_lock(&inode->i_lock);
1117 continue; 1151 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1118 mapping = inode->i_mapping; 1152 (mapping->nrpages == 0)) {
1119 if (mapping->nrpages == 0) 1153 spin_unlock(&inode->i_lock);
1120 continue; 1154 continue;
1155 }
1121 __iget(inode); 1156 __iget(inode);
1122 spin_unlock(&inode_lock); 1157 spin_unlock(&inode->i_lock);
1158 spin_unlock(&inode_sb_list_lock);
1159
1123 /* 1160 /*
1124 * We hold a reference to 'inode' so it couldn't have 1161 * We hold a reference to 'inode' so it couldn't have been
1125 * been removed from s_inodes list while we dropped the 1162 * removed from s_inodes list while we dropped the
1126 * inode_lock. We cannot iput the inode now as we can 1163 * inode_sb_list_lock. We cannot iput the inode now as we can
1127 * be holding the last reference and we cannot iput it 1164 * be holding the last reference and we cannot iput it under
1128 * under inode_lock. So we keep the reference and iput 1165 * inode_sb_list_lock. So we keep the reference and iput it
1129 * it later. 1166 * later.
1130 */ 1167 */
1131 iput(old_inode); 1168 iput(old_inode);
1132 old_inode = inode; 1169 old_inode = inode;
@@ -1135,9 +1172,9 @@ static void wait_sb_inodes(struct super_block *sb)
1135 1172
1136 cond_resched(); 1173 cond_resched();
1137 1174
1138 spin_lock(&inode_lock); 1175 spin_lock(&inode_sb_list_lock);
1139 } 1176 }
1140 spin_unlock(&inode_lock); 1177 spin_unlock(&inode_sb_list_lock);
1141 iput(old_inode); 1178 iput(old_inode);
1142} 1179}
1143 1180
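The rewritten wait_sb_inodes() loop above leans on a reference-counting idiom worth spelling out: pin the current inode under the lock, drop the lock for the blocking work, and defer the previous inode's iput() until no lock is held, because a final iput() may itself need these locks. Roughly, in portable C (obj_get, obj_put and do_blocking_work are stand-ins):

    #include <pthread.h>
    #include <stddef.h>

    struct obj { struct obj *next; int refs; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct obj *head;

    static void obj_get(struct obj *o) { o->refs++; }        /* called under list_lock */
    static void obj_put(struct obj *o) { (void)o; }          /* final put may take list_lock */
    static void do_blocking_work(struct obj *o) { (void)o; } /* may sleep; no locks held */

    static void walk(void)
    {
        struct obj *o, *prev = NULL;

        pthread_mutex_lock(&list_lock);
        for (o = head; o; o = o->next) {
            obj_get(o);                      /* pin o so it cannot be freed */
            pthread_mutex_unlock(&list_lock);

            /* the previous object's put was deferred to here, where
             * no lock is held that the put might need */
            if (prev)
                obj_put(prev);
            do_blocking_work(o);
            prev = o;

            pthread_mutex_lock(&list_lock);  /* o is pinned, so o->next stays valid */
        }
        pthread_mutex_unlock(&list_lock);
        if (prev)
            obj_put(prev);
    }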
@@ -1271,9 +1308,11 @@ int write_inode_now(struct inode *inode, int sync)
1271 wbc.nr_to_write = 0; 1308 wbc.nr_to_write = 0;
1272 1309
1273 might_sleep(); 1310 might_sleep();
1274 spin_lock(&inode_lock); 1311 spin_lock(&inode_wb_list_lock);
1312 spin_lock(&inode->i_lock);
1275 ret = writeback_single_inode(inode, &wbc); 1313 ret = writeback_single_inode(inode, &wbc);
1276 spin_unlock(&inode_lock); 1314 spin_unlock(&inode->i_lock);
1315 spin_unlock(&inode_wb_list_lock);
1277 if (sync) 1316 if (sync)
1278 inode_sync_wait(inode); 1317 inode_sync_wait(inode);
1279 return ret; 1318 return ret;
@@ -1295,9 +1334,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1295{ 1334{
1296 int ret; 1335 int ret;
1297 1336
1298 spin_lock(&inode_lock); 1337 spin_lock(&inode_wb_list_lock);
1338 spin_lock(&inode->i_lock);
1299 ret = writeback_single_inode(inode, wbc); 1339 ret = writeback_single_inode(inode, wbc);
1300 spin_unlock(&inode_lock); 1340 spin_unlock(&inode->i_lock);
1341 spin_unlock(&inode_wb_list_lock);
1301 return ret; 1342 return ret;
1302} 1343}
1303EXPORT_SYMBOL(sync_inode); 1344EXPORT_SYMBOL(sync_inode);
diff --git a/fs/inode.c b/fs/inode.c
index 0b3da4a77704..5f4e11aaeb5c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -26,6 +26,38 @@
26#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/cred.h> 28#include <linux/cred.h>
29#include "internal.h"
30
31/*
32 * inode locking rules.
33 *
34 * inode->i_lock protects:
35 * inode->i_state, inode->i_hash, __iget()
36 * inode_lru_lock protects:
37 * inode_lru, inode->i_lru
38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash
44 *
45 * Lock ordering:
46 *
47 * inode_sb_list_lock
48 * inode->i_lock
49 * inode_lru_lock
50 *
51 * inode_wb_list_lock
52 * inode->i_lock
53 *
54 * inode_hash_lock
55 * inode_sb_list_lock
56 * inode->i_lock
57 *
58 * iunique_lock
59 * inode_hash_lock
60 */
29 61
30/* 62/*
31 * This is needed for the following functions: 63 * This is needed for the following functions:
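The ordering table added above is the contract the rest of this series enforces: wherever two of these locks nest, the outer one is taken first (inode_sb_list_lock before inode->i_lock, inode->i_lock before inode_lru_lock, inode_hash_lock before inode_sb_list_lock, and so on). A minimal user-space sketch of that discipline, with pthread mutexes and hypothetical names standing in for the kernel locks:

    #include <pthread.h>

    /* Analogue of the documented nesting: list_lock is the outer lock and
     * obj->lock the inner one, so every path takes them in that order. */
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    struct obj {
        pthread_mutex_t lock;
        int state;
    };

    static void mark_busy(struct obj *o)
    {
        pthread_mutex_lock(&list_lock);    /* outer lock first */
        pthread_mutex_lock(&o->lock);      /* then the inner lock */
        o->state = 1;
        pthread_mutex_unlock(&o->lock);
        pthread_mutex_unlock(&list_lock);
    }

Any path that already holds the inner lock and needs the outer one must either drop and re-take, or fall back to a trylock, as several hunks below do.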
@@ -60,6 +92,8 @@
60 92
61static unsigned int i_hash_mask __read_mostly; 93static unsigned int i_hash_mask __read_mostly;
62static unsigned int i_hash_shift __read_mostly; 94static unsigned int i_hash_shift __read_mostly;
95static struct hlist_head *inode_hashtable __read_mostly;
96static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
63 97
64/* 98/*
65 * Each inode can be on two separate lists. One is 99 * Each inode can be on two separate lists. One is
@@ -74,15 +108,10 @@ static unsigned int i_hash_shift __read_mostly;
74 */ 108 */
75 109
76static LIST_HEAD(inode_lru); 110static LIST_HEAD(inode_lru);
77static struct hlist_head *inode_hashtable __read_mostly; 111static DEFINE_SPINLOCK(inode_lru_lock);
78 112
79/* 113__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
80 * A simple spinlock to protect the list manipulations. 114__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
81 *
82 * NOTE! You also have to own the lock if you change
83 * the i_state of an inode while it is in use..
84 */
85DEFINE_SPINLOCK(inode_lock);
86 115
87/* 116/*
88 * iprune_sem provides exclusion between the icache shrinking and the 117 * iprune_sem provides exclusion between the icache shrinking and the
@@ -137,15 +166,6 @@ int proc_nr_inodes(ctl_table *table, int write,
137} 166}
138#endif 167#endif
139 168
140static void wake_up_inode(struct inode *inode)
141{
142 /*
143 * Prevent speculative execution through spin_unlock(&inode_lock);
144 */
145 smp_mb();
146 wake_up_bit(&inode->i_state, __I_NEW);
147}
148
149/** 169/**
150 * inode_init_always - perform inode structure initialisation 170 * inode_init_always - perform inode structure initialisation
151 * @sb: superblock inode belongs to 171 * @sb: superblock inode belongs to
@@ -336,7 +356,7 @@ static void init_once(void *foo)
336} 356}
337 357
338/* 358/*
339 * inode_lock must be held 359 * inode->i_lock must be held
340 */ 360 */
341void __iget(struct inode *inode) 361void __iget(struct inode *inode)
342{ 362{
@@ -354,23 +374,22 @@ EXPORT_SYMBOL(ihold);
354 374
355static void inode_lru_list_add(struct inode *inode) 375static void inode_lru_list_add(struct inode *inode)
356{ 376{
377 spin_lock(&inode_lru_lock);
357 if (list_empty(&inode->i_lru)) { 378 if (list_empty(&inode->i_lru)) {
358 list_add(&inode->i_lru, &inode_lru); 379 list_add(&inode->i_lru, &inode_lru);
359 inodes_stat.nr_unused++; 380 inodes_stat.nr_unused++;
360 } 381 }
382 spin_unlock(&inode_lru_lock);
361} 383}
362 384
363static void inode_lru_list_del(struct inode *inode) 385static void inode_lru_list_del(struct inode *inode)
364{ 386{
387 spin_lock(&inode_lru_lock);
365 if (!list_empty(&inode->i_lru)) { 388 if (!list_empty(&inode->i_lru)) {
366 list_del_init(&inode->i_lru); 389 list_del_init(&inode->i_lru);
367 inodes_stat.nr_unused--; 390 inodes_stat.nr_unused--;
368 } 391 }
369} 392 spin_unlock(&inode_lru_lock);
370
371static inline void __inode_sb_list_add(struct inode *inode)
372{
373 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
374} 393}
375 394
376/** 395/**
@@ -379,15 +398,17 @@ static inline void __inode_sb_list_add(struct inode *inode)
379 */ 398 */
380void inode_sb_list_add(struct inode *inode) 399void inode_sb_list_add(struct inode *inode)
381{ 400{
382 spin_lock(&inode_lock); 401 spin_lock(&inode_sb_list_lock);
383 __inode_sb_list_add(inode); 402 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
384 spin_unlock(&inode_lock); 403 spin_unlock(&inode_sb_list_lock);
385} 404}
386EXPORT_SYMBOL_GPL(inode_sb_list_add); 405EXPORT_SYMBOL_GPL(inode_sb_list_add);
387 406
388static inline void __inode_sb_list_del(struct inode *inode) 407static inline void inode_sb_list_del(struct inode *inode)
389{ 408{
409 spin_lock(&inode_sb_list_lock);
390 list_del_init(&inode->i_sb_list); 410 list_del_init(&inode->i_sb_list);
411 spin_unlock(&inode_sb_list_lock);
391} 412}
392 413
393static unsigned long hash(struct super_block *sb, unsigned long hashval) 414static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -412,24 +433,15 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
412{ 433{
413 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 434 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
414 435
415 spin_lock(&inode_lock); 436 spin_lock(&inode_hash_lock);
437 spin_lock(&inode->i_lock);
416 hlist_add_head(&inode->i_hash, b); 438 hlist_add_head(&inode->i_hash, b);
417 spin_unlock(&inode_lock); 439 spin_unlock(&inode->i_lock);
440 spin_unlock(&inode_hash_lock);
418} 441}
419EXPORT_SYMBOL(__insert_inode_hash); 442EXPORT_SYMBOL(__insert_inode_hash);
420 443
421/** 444/**
422 * __remove_inode_hash - remove an inode from the hash
423 * @inode: inode to unhash
424 *
425 * Remove an inode from the superblock.
426 */
427static void __remove_inode_hash(struct inode *inode)
428{
429 hlist_del_init(&inode->i_hash);
430}
431
432/**
433 * remove_inode_hash - remove an inode from the hash 445 * remove_inode_hash - remove an inode from the hash
434 * @inode: inode to unhash 446 * @inode: inode to unhash
435 * 447 *
@@ -437,9 +449,11 @@ static void __remove_inode_hash(struct inode *inode)
437 */ 449 */
438void remove_inode_hash(struct inode *inode) 450void remove_inode_hash(struct inode *inode)
439{ 451{
440 spin_lock(&inode_lock); 452 spin_lock(&inode_hash_lock);
453 spin_lock(&inode->i_lock);
441 hlist_del_init(&inode->i_hash); 454 hlist_del_init(&inode->i_hash);
442 spin_unlock(&inode_lock); 455 spin_unlock(&inode->i_lock);
456 spin_unlock(&inode_hash_lock);
443} 457}
444EXPORT_SYMBOL(remove_inode_hash); 458EXPORT_SYMBOL(remove_inode_hash);
445 459
@@ -456,10 +470,29 @@ void end_writeback(struct inode *inode)
456} 470}
457EXPORT_SYMBOL(end_writeback); 471EXPORT_SYMBOL(end_writeback);
458 472
473/*
474 * Free the inode passed in, removing it from the lists it is still connected
475 * to. We remove any pages still attached to the inode and wait for any IO that
476 * is still in progress before finally destroying the inode.
477 *
478 * An inode must already be marked I_FREEING so that we avoid the inode being
479 * moved back onto lists if we race with other code that manipulates the lists
480 * (e.g. writeback_single_inode). The caller is responsible for setting this.
481 *
482 * An inode must already be removed from the LRU list before being evicted from
483 * the cache. This should occur atomically with setting the I_FREEING state
484 * flag, so no inodes here should ever be on the LRU when being evicted.
485 */
459static void evict(struct inode *inode) 486static void evict(struct inode *inode)
460{ 487{
461 const struct super_operations *op = inode->i_sb->s_op; 488 const struct super_operations *op = inode->i_sb->s_op;
462 489
490 BUG_ON(!(inode->i_state & I_FREEING));
491 BUG_ON(!list_empty(&inode->i_lru));
492
493 inode_wb_list_del(inode);
494 inode_sb_list_del(inode);
495
463 if (op->evict_inode) { 496 if (op->evict_inode) {
464 op->evict_inode(inode); 497 op->evict_inode(inode);
465 } else { 498 } else {
@@ -471,6 +504,15 @@ static void evict(struct inode *inode)
471 bd_forget(inode); 504 bd_forget(inode);
472 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 505 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
473 cd_forget(inode); 506 cd_forget(inode);
507
508 remove_inode_hash(inode);
509
510 spin_lock(&inode->i_lock);
511 wake_up_bit(&inode->i_state, __I_NEW);
512 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
513 spin_unlock(&inode->i_lock);
514
515 destroy_inode(inode);
474} 516}
475 517
476/* 518/*
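The new comment on evict() turns two previously implicit rules into stated preconditions: I_FREEING must already be set, and the inode must already be off the LRU, established atomically with respect to the inode lock so list walkers can never resurrect the inode. A sketch of the caller-side sequence in user-space form (names illustrative):

    #include <pthread.h>

    #define FREEING 0x1

    static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

    struct obj {
        pthread_mutex_t lock;   /* taken before lru_lock, per the ordering */
        unsigned int state;
        int on_lru;
    };

    static void start_eviction(struct obj *o)
    {
        pthread_mutex_lock(&o->lock);
        o->state |= FREEING;    /* walkers re-check this under o->lock */
        pthread_mutex_lock(&lru_lock);
        o->on_lru = 0;          /* list_del_init() analogue */
        pthread_mutex_unlock(&lru_lock);
        pthread_mutex_unlock(&o->lock);
        /* both preconditions now hold, so an evict(o) analogue is safe */
    }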
@@ -489,14 +531,6 @@ static void dispose_list(struct list_head *head)
489 list_del_init(&inode->i_lru); 531 list_del_init(&inode->i_lru);
490 532
491 evict(inode); 533 evict(inode);
492
493 spin_lock(&inode_lock);
494 __remove_inode_hash(inode);
495 __inode_sb_list_del(inode);
496 spin_unlock(&inode_lock);
497
498 wake_up_inode(inode);
499 destroy_inode(inode);
500 } 534 }
501} 535}
502 536
@@ -514,25 +548,23 @@ void evict_inodes(struct super_block *sb)
514 struct inode *inode, *next; 548 struct inode *inode, *next;
515 LIST_HEAD(dispose); 549 LIST_HEAD(dispose);
516 550
517 spin_lock(&inode_lock); 551 spin_lock(&inode_sb_list_lock);
518 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 552 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
519 if (atomic_read(&inode->i_count)) 553 if (atomic_read(&inode->i_count))
520 continue; 554 continue;
521 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 555
556 spin_lock(&inode->i_lock);
557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
558 spin_unlock(&inode->i_lock);
522 continue; 559 continue;
560 }
523 561
524 inode->i_state |= I_FREEING; 562 inode->i_state |= I_FREEING;
525 563 inode_lru_list_del(inode);
526 /* 564 spin_unlock(&inode->i_lock);
527 * Move the inode off the IO lists and LRU once I_FREEING is 565 list_add(&inode->i_lru, &dispose);
528 * set so that it won't get moved back on there if it is dirty.
529 */
530 list_move(&inode->i_lru, &dispose);
531 list_del_init(&inode->i_wb_list);
532 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
533 inodes_stat.nr_unused--;
534 } 566 }
535 spin_unlock(&inode_lock); 567 spin_unlock(&inode_sb_list_lock);
536 568
537 dispose_list(&dispose); 569 dispose_list(&dispose);
538 570
@@ -561,31 +593,30 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
561 struct inode *inode, *next; 593 struct inode *inode, *next;
562 LIST_HEAD(dispose); 594 LIST_HEAD(dispose);
563 595
564 spin_lock(&inode_lock); 596 spin_lock(&inode_sb_list_lock);
565 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 597 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
566 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 598 spin_lock(&inode->i_lock);
599 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
600 spin_unlock(&inode->i_lock);
567 continue; 601 continue;
602 }
568 if (inode->i_state & I_DIRTY && !kill_dirty) { 603 if (inode->i_state & I_DIRTY && !kill_dirty) {
604 spin_unlock(&inode->i_lock);
569 busy = 1; 605 busy = 1;
570 continue; 606 continue;
571 } 607 }
572 if (atomic_read(&inode->i_count)) { 608 if (atomic_read(&inode->i_count)) {
609 spin_unlock(&inode->i_lock);
573 busy = 1; 610 busy = 1;
574 continue; 611 continue;
575 } 612 }
576 613
577 inode->i_state |= I_FREEING; 614 inode->i_state |= I_FREEING;
578 615 inode_lru_list_del(inode);
579 /* 616 spin_unlock(&inode->i_lock);
580 * Move the inode off the IO lists and LRU once I_FREEING is 617 list_add(&inode->i_lru, &dispose);
581 * set so that it won't get moved back on there if it is dirty.
582 */
583 list_move(&inode->i_lru, &dispose);
584 list_del_init(&inode->i_wb_list);
585 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
586 inodes_stat.nr_unused--;
587 } 618 }
588 spin_unlock(&inode_lock); 619 spin_unlock(&inode_sb_list_lock);
589 620
590 dispose_list(&dispose); 621 dispose_list(&dispose);
591 622
@@ -607,7 +638,7 @@ static int can_unuse(struct inode *inode)
607 638
608/* 639/*
609 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a 640 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
610 * temporary list and then are freed outside inode_lock by dispose_list(). 641 * temporary list and then are freed outside inode_lru_lock by dispose_list().
611 * 642 *
612 * Any inodes which are pinned purely because of attached pagecache have their 643 * Any inodes which are pinned purely because of attached pagecache have their
613 * pagecache removed. If the inode has metadata buffers attached to 644 * pagecache removed. If the inode has metadata buffers attached to
@@ -628,7 +659,7 @@ static void prune_icache(int nr_to_scan)
628 unsigned long reap = 0; 659 unsigned long reap = 0;
629 660
630 down_read(&iprune_sem); 661 down_read(&iprune_sem);
631 spin_lock(&inode_lock); 662 spin_lock(&inode_lru_lock);
632 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 663 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
633 struct inode *inode; 664 struct inode *inode;
634 665
@@ -638,53 +669,67 @@ static void prune_icache(int nr_to_scan)
638 inode = list_entry(inode_lru.prev, struct inode, i_lru); 669 inode = list_entry(inode_lru.prev, struct inode, i_lru);
639 670
640 /* 671 /*
672 * we are inverting the inode_lru_lock/inode->i_lock here,
673 * so use a trylock. If we fail to get the lock, just move the
674 * inode to the back of the list so we don't spin on it.
675 */
676 if (!spin_trylock(&inode->i_lock)) {
677 list_move(&inode->i_lru, &inode_lru);
678 continue;
679 }
680
681 /*
641 * Referenced or dirty inodes are still in use. Give them 682 * Referenced or dirty inodes are still in use. Give them
642 * another pass through the LRU as we cannot reclaim them now. 683 * another pass through the LRU as we cannot reclaim them now.
643 */ 684 */
644 if (atomic_read(&inode->i_count) || 685 if (atomic_read(&inode->i_count) ||
645 (inode->i_state & ~I_REFERENCED)) { 686 (inode->i_state & ~I_REFERENCED)) {
646 list_del_init(&inode->i_lru); 687 list_del_init(&inode->i_lru);
688 spin_unlock(&inode->i_lock);
647 inodes_stat.nr_unused--; 689 inodes_stat.nr_unused--;
648 continue; 690 continue;
649 } 691 }
650 692
651 /* recently referenced inodes get one more pass */ 693 /* recently referenced inodes get one more pass */
652 if (inode->i_state & I_REFERENCED) { 694 if (inode->i_state & I_REFERENCED) {
653 list_move(&inode->i_lru, &inode_lru);
654 inode->i_state &= ~I_REFERENCED; 695 inode->i_state &= ~I_REFERENCED;
696 list_move(&inode->i_lru, &inode_lru);
697 spin_unlock(&inode->i_lock);
655 continue; 698 continue;
656 } 699 }
657 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 700 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
658 __iget(inode); 701 __iget(inode);
659 spin_unlock(&inode_lock); 702 spin_unlock(&inode->i_lock);
703 spin_unlock(&inode_lru_lock);
660 if (remove_inode_buffers(inode)) 704 if (remove_inode_buffers(inode))
661 reap += invalidate_mapping_pages(&inode->i_data, 705 reap += invalidate_mapping_pages(&inode->i_data,
662 0, -1); 706 0, -1);
663 iput(inode); 707 iput(inode);
664 spin_lock(&inode_lock); 708 spin_lock(&inode_lru_lock);
665 709
666 if (inode != list_entry(inode_lru.next, 710 if (inode != list_entry(inode_lru.next,
667 struct inode, i_lru)) 711 struct inode, i_lru))
668 continue; /* wrong inode or list_empty */ 712 continue; /* wrong inode or list_empty */
669 if (!can_unuse(inode)) 713 /* avoid lock inversions with trylock */
714 if (!spin_trylock(&inode->i_lock))
715 continue;
716 if (!can_unuse(inode)) {
717 spin_unlock(&inode->i_lock);
670 continue; 718 continue;
719 }
671 } 720 }
672 WARN_ON(inode->i_state & I_NEW); 721 WARN_ON(inode->i_state & I_NEW);
673 inode->i_state |= I_FREEING; 722 inode->i_state |= I_FREEING;
723 spin_unlock(&inode->i_lock);
674 724
675 /*
676 * Move the inode off the IO lists and LRU once I_FREEING is
677 * set so that it won't get moved back on there if it is dirty.
678 */
679 list_move(&inode->i_lru, &freeable); 725 list_move(&inode->i_lru, &freeable);
680 list_del_init(&inode->i_wb_list);
681 inodes_stat.nr_unused--; 726 inodes_stat.nr_unused--;
682 } 727 }
683 if (current_is_kswapd()) 728 if (current_is_kswapd())
684 __count_vm_events(KSWAPD_INODESTEAL, reap); 729 __count_vm_events(KSWAPD_INODESTEAL, reap);
685 else 730 else
686 __count_vm_events(PGINODESTEAL, reap); 731 __count_vm_events(PGINODESTEAL, reap);
687 spin_unlock(&inode_lock); 732 spin_unlock(&inode_lru_lock);
688 733
689 dispose_list(&freeable); 734 dispose_list(&freeable);
690 up_read(&iprune_sem); 735 up_read(&iprune_sem);
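prune_icache() walks the LRU with inode_lru_lock held, but the documented order puts inode->i_lock outside inode_lru_lock, so taking i_lock here inverts the hierarchy. The patch resolves it the standard way, as the new comment says: trylock, and on failure rotate the inode to the other end of the list instead of spinning on it. The same trick in a small runnable form (lru_move_tail is a hypothetical helper):

    #include <pthread.h>
    #include <stdbool.h>

    struct obj {
        pthread_mutex_t lock;   /* normally taken *before* lru_lock */
    };

    static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

    static void lru_move_tail(struct obj *o) { (void)o; /* relink under lru_lock */ }

    /* Called with lru_lock held; true means o->lock was acquired. */
    static bool lock_object_inverted(struct obj *o)
    {
        if (pthread_mutex_trylock(&o->lock) == 0)
            return true;
        /* Lock-order inversion: back off instead of spinning, and
         * rotate the object away so the next pass makes progress. */
        lru_move_tail(o);
        return false;
    }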
@@ -733,15 +778,21 @@ static struct inode *find_inode(struct super_block *sb,
733 778
734repeat: 779repeat:
735 hlist_for_each_entry(inode, node, head, i_hash) { 780 hlist_for_each_entry(inode, node, head, i_hash) {
736 if (inode->i_sb != sb) 781 spin_lock(&inode->i_lock);
782 if (inode->i_sb != sb) {
783 spin_unlock(&inode->i_lock);
737 continue; 784 continue;
738 if (!test(inode, data)) 785 }
786 if (!test(inode, data)) {
787 spin_unlock(&inode->i_lock);
739 continue; 788 continue;
789 }
740 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 790 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
741 __wait_on_freeing_inode(inode); 791 __wait_on_freeing_inode(inode);
742 goto repeat; 792 goto repeat;
743 } 793 }
744 __iget(inode); 794 __iget(inode);
795 spin_unlock(&inode->i_lock);
745 return inode; 796 return inode;
746 } 797 }
747 return NULL; 798 return NULL;
@@ -759,15 +810,21 @@ static struct inode *find_inode_fast(struct super_block *sb,
759 810
760repeat: 811repeat:
761 hlist_for_each_entry(inode, node, head, i_hash) { 812 hlist_for_each_entry(inode, node, head, i_hash) {
762 if (inode->i_ino != ino) 813 spin_lock(&inode->i_lock);
814 if (inode->i_ino != ino) {
815 spin_unlock(&inode->i_lock);
763 continue; 816 continue;
764 if (inode->i_sb != sb) 817 }
818 if (inode->i_sb != sb) {
819 spin_unlock(&inode->i_lock);
765 continue; 820 continue;
821 }
766 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 822 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
767 __wait_on_freeing_inode(inode); 823 __wait_on_freeing_inode(inode);
768 goto repeat; 824 goto repeat;
769 } 825 }
770 __iget(inode); 826 __iget(inode);
827 spin_unlock(&inode->i_lock);
771 return inode; 828 return inode;
772 } 829 }
773 return NULL; 830 return NULL;
@@ -827,19 +884,26 @@ struct inode *new_inode(struct super_block *sb)
827{ 884{
828 struct inode *inode; 885 struct inode *inode;
829 886
830 spin_lock_prefetch(&inode_lock); 887 spin_lock_prefetch(&inode_sb_list_lock);
831 888
832 inode = alloc_inode(sb); 889 inode = alloc_inode(sb);
833 if (inode) { 890 if (inode) {
834 spin_lock(&inode_lock); 891 spin_lock(&inode->i_lock);
835 __inode_sb_list_add(inode);
836 inode->i_state = 0; 892 inode->i_state = 0;
837 spin_unlock(&inode_lock); 893 spin_unlock(&inode->i_lock);
894 inode_sb_list_add(inode);
838 } 895 }
839 return inode; 896 return inode;
840} 897}
841EXPORT_SYMBOL(new_inode); 898EXPORT_SYMBOL(new_inode);
842 899
900/**
901 * unlock_new_inode - clear the I_NEW state and wake up any waiters
902 * @inode: new inode to unlock
903 *
904 * Called when the inode is fully initialised to clear the new state of the
905 * inode and wake up anyone waiting for the inode to finish initialisation.
906 */
843void unlock_new_inode(struct inode *inode) 907void unlock_new_inode(struct inode *inode)
844{ 908{
845#ifdef CONFIG_DEBUG_LOCK_ALLOC 909#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -859,51 +923,67 @@ void unlock_new_inode(struct inode *inode)
859 } 923 }
860 } 924 }
861#endif 925#endif
862 /* 926 spin_lock(&inode->i_lock);
863 * This is special! We do not need the spinlock when clearing I_NEW,
864 * because we're guaranteed that nobody else tries to do anything about
865 * the state of the inode when it is locked, as we just created it (so
866 * there can be no old holders that haven't tested I_NEW).
867 * However we must emit the memory barrier so that other CPUs reliably
868 * see the clearing of I_NEW after the other inode initialisation has
869 * completed.
870 */
871 smp_mb();
872 WARN_ON(!(inode->i_state & I_NEW)); 927 WARN_ON(!(inode->i_state & I_NEW));
873 inode->i_state &= ~I_NEW; 928 inode->i_state &= ~I_NEW;
874 wake_up_inode(inode); 929 wake_up_bit(&inode->i_state, __I_NEW);
930 spin_unlock(&inode->i_lock);
875} 931}
876EXPORT_SYMBOL(unlock_new_inode); 932EXPORT_SYMBOL(unlock_new_inode);
877 933
878/* 934/**
879 * This is called without the inode lock held.. Be careful. 935 * iget5_locked - obtain an inode from a mounted file system
936 * @sb: super block of file system
937 * @hashval: hash value (usually inode number) to get
938 * @test: callback used for comparisons between inodes
939 * @set: callback used to initialize a new struct inode
940 * @data: opaque data pointer to pass to @test and @set
941 *
942 * Search for the inode specified by @hashval and @data in the inode cache,
943 * and if present it is returned with an increased reference count. This is
944 * a generalized version of iget_locked() for file systems where the inode
945 * number is not sufficient for unique identification of an inode.
880 * 946 *
881 * We no longer cache the sb_flags in i_flags - see fs.h 947 * If the inode is not in cache, allocate a new inode and return it locked,
882 * -- rmk@arm.uk.linux.org 948 * hashed, and with the I_NEW flag set. The file system gets to fill it in
949 * before unlocking it via unlock_new_inode().
950 *
951 * Note both @test and @set are called with the inode_hash_lock held, so can't
952 * sleep.
883 */ 953 */
884static struct inode *get_new_inode(struct super_block *sb, 954struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
885 struct hlist_head *head, 955 int (*test)(struct inode *, void *),
886 int (*test)(struct inode *, void *), 956 int (*set)(struct inode *, void *), void *data)
887 int (*set)(struct inode *, void *),
888 void *data)
889{ 957{
958 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
890 struct inode *inode; 959 struct inode *inode;
891 960
961 spin_lock(&inode_hash_lock);
962 inode = find_inode(sb, head, test, data);
963 spin_unlock(&inode_hash_lock);
964
965 if (inode) {
966 wait_on_inode(inode);
967 return inode;
968 }
969
892 inode = alloc_inode(sb); 970 inode = alloc_inode(sb);
893 if (inode) { 971 if (inode) {
894 struct inode *old; 972 struct inode *old;
895 973
896 spin_lock(&inode_lock); 974 spin_lock(&inode_hash_lock);
897 /* We released the lock, so.. */ 975 /* We released the lock, so.. */
898 old = find_inode(sb, head, test, data); 976 old = find_inode(sb, head, test, data);
899 if (!old) { 977 if (!old) {
900 if (set(inode, data)) 978 if (set(inode, data))
901 goto set_failed; 979 goto set_failed;
902 980
903 hlist_add_head(&inode->i_hash, head); 981 spin_lock(&inode->i_lock);
904 __inode_sb_list_add(inode);
905 inode->i_state = I_NEW; 982 inode->i_state = I_NEW;
906 spin_unlock(&inode_lock); 983 hlist_add_head(&inode->i_hash, head);
984 spin_unlock(&inode->i_lock);
985 inode_sb_list_add(inode);
986 spin_unlock(&inode_hash_lock);
907 987
908 /* Return the locked inode with I_NEW set, the 988 /* Return the locked inode with I_NEW set, the
909 * caller is responsible for filling in the contents 989 * caller is responsible for filling in the contents
@@ -916,7 +996,7 @@ static struct inode *get_new_inode(struct super_block *sb,
916 * us. Use the old inode instead of the one we just 996 * us. Use the old inode instead of the one we just
917 * allocated. 997 * allocated.
918 */ 998 */
919 spin_unlock(&inode_lock); 999 spin_unlock(&inode_hash_lock);
920 destroy_inode(inode); 1000 destroy_inode(inode);
921 inode = old; 1001 inode = old;
922 wait_on_inode(inode); 1002 wait_on_inode(inode);
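iget5_locked() is now the open-coded "lookup, allocate unlocked, re-lookup under the lock" pattern rather than a wrapper around get_new_inode(). Reduced to portable C, the race handling looks roughly like this (all names hypothetical; a real version would also wait on an I_NEW analogue before returning a cache hit):

    #include <pthread.h>
    #include <stdlib.h>

    struct obj { int key; struct obj *next; };

    static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct obj *bucket;              /* one bucket, for brevity */

    static struct obj *find_locked(int key)
    {
        struct obj *o;

        for (o = bucket; o; o = o->next)
            if (o->key == key)
                return o;
        return NULL;
    }

    static struct obj *get_obj(int key)
    {
        struct obj *o, *old;

        pthread_mutex_lock(&hash_lock);
        o = find_locked(key);
        pthread_mutex_unlock(&hash_lock);
        if (o)
            return o;                       /* cache hit */

        o = malloc(sizeof(*o));             /* allocate with no locks held */
        if (!o)
            return NULL;
        o->key = key;

        pthread_mutex_lock(&hash_lock);
        old = find_locked(key);             /* the lock was dropped: re-check */
        if (!old) {
            o->next = bucket;               /* we won: publish the new object */
            bucket = o;
            pthread_mutex_unlock(&hash_lock);
            return o;
        }
        pthread_mutex_unlock(&hash_lock);
        free(o);                            /* we lost: discard and use the winner */
        return old;
    }

Losing the race is deliberately cheap: the new object was never visible to anyone else, so it can be discarded without ceremony.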
@@ -924,33 +1004,53 @@ static struct inode *get_new_inode(struct super_block *sb,
924 return inode; 1004 return inode;
925 1005
926set_failed: 1006set_failed:
927 spin_unlock(&inode_lock); 1007 spin_unlock(&inode_hash_lock);
928 destroy_inode(inode); 1008 destroy_inode(inode);
929 return NULL; 1009 return NULL;
930} 1010}
1011EXPORT_SYMBOL(iget5_locked);
931 1012
932/* 1013/**
933 * get_new_inode_fast is the fast path version of get_new_inode, see the 1014 * iget_locked - obtain an inode from a mounted file system
934 * comment at iget_locked for details. 1015 * @sb: super block of file system
1016 * @ino: inode number to get
1017 *
1018 * Search for the inode specified by @ino in the inode cache and if present
1019 * return it with an increased reference count. This is for file systems
1020 * where the inode number is sufficient for unique identification of an inode.
1021 *
1022 * If the inode is not in cache, allocate a new inode and return it locked,
1023 * hashed, and with the I_NEW flag set. The file system gets to fill it in
1024 * before unlocking it via unlock_new_inode().
935 */ 1025 */
936static struct inode *get_new_inode_fast(struct super_block *sb, 1026struct inode *iget_locked(struct super_block *sb, unsigned long ino)
937 struct hlist_head *head, unsigned long ino)
938{ 1027{
1028 struct hlist_head *head = inode_hashtable + hash(sb, ino);
939 struct inode *inode; 1029 struct inode *inode;
940 1030
1031 spin_lock(&inode_hash_lock);
1032 inode = find_inode_fast(sb, head, ino);
1033 spin_unlock(&inode_hash_lock);
1034 if (inode) {
1035 wait_on_inode(inode);
1036 return inode;
1037 }
1038
941 inode = alloc_inode(sb); 1039 inode = alloc_inode(sb);
942 if (inode) { 1040 if (inode) {
943 struct inode *old; 1041 struct inode *old;
944 1042
945 spin_lock(&inode_lock); 1043 spin_lock(&inode_hash_lock);
946 /* We released the lock, so.. */ 1044 /* We released the lock, so.. */
947 old = find_inode_fast(sb, head, ino); 1045 old = find_inode_fast(sb, head, ino);
948 if (!old) { 1046 if (!old) {
949 inode->i_ino = ino; 1047 inode->i_ino = ino;
950 hlist_add_head(&inode->i_hash, head); 1048 spin_lock(&inode->i_lock);
951 __inode_sb_list_add(inode);
952 inode->i_state = I_NEW; 1049 inode->i_state = I_NEW;
953 spin_unlock(&inode_lock); 1050 hlist_add_head(&inode->i_hash, head);
1051 spin_unlock(&inode->i_lock);
1052 inode_sb_list_add(inode);
1053 spin_unlock(&inode_hash_lock);
954 1054
955 /* Return the locked inode with I_NEW set, the 1055 /* Return the locked inode with I_NEW set, the
956 * caller is responsible for filling in the contents 1056 * caller is responsible for filling in the contents
@@ -963,13 +1063,14 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
963 * us. Use the old inode instead of the one we just 1063 * us. Use the old inode instead of the one we just
964 * allocated. 1064 * allocated.
965 */ 1065 */
966 spin_unlock(&inode_lock); 1066 spin_unlock(&inode_hash_lock);
967 destroy_inode(inode); 1067 destroy_inode(inode);
968 inode = old; 1068 inode = old;
969 wait_on_inode(inode); 1069 wait_on_inode(inode);
970 } 1070 }
971 return inode; 1071 return inode;
972} 1072}
1073EXPORT_SYMBOL(iget_locked);
973 1074
974/* 1075/*
975 * search the inode cache for a matching inode number. 1076 * search the inode cache for a matching inode number.
@@ -984,10 +1085,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
984 struct hlist_node *node; 1085 struct hlist_node *node;
985 struct inode *inode; 1086 struct inode *inode;
986 1087
1088 spin_lock(&inode_hash_lock);
987 hlist_for_each_entry(inode, node, b, i_hash) { 1089 hlist_for_each_entry(inode, node, b, i_hash) {
988 if (inode->i_ino == ino && inode->i_sb == sb) 1090 if (inode->i_ino == ino && inode->i_sb == sb) {
1091 spin_unlock(&inode_hash_lock);
989 return 0; 1092 return 0;
1093 }
990 } 1094 }
1095 spin_unlock(&inode_hash_lock);
991 1096
992 return 1; 1097 return 1;
993} 1098}
@@ -1017,7 +1122,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1017 static unsigned int counter; 1122 static unsigned int counter;
1018 ino_t res; 1123 ino_t res;
1019 1124
1020 spin_lock(&inode_lock);
1021 spin_lock(&iunique_lock); 1125 spin_lock(&iunique_lock);
1022 do { 1126 do {
1023 if (counter <= max_reserved) 1127 if (counter <= max_reserved)
@@ -1025,7 +1129,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1025 res = counter++; 1129 res = counter++;
1026 } while (!test_inode_iunique(sb, res)); 1130 } while (!test_inode_iunique(sb, res));
1027 spin_unlock(&iunique_lock); 1131 spin_unlock(&iunique_lock);
1028 spin_unlock(&inode_lock);
1029 1132
1030 return res; 1133 return res;
1031} 1134}
@@ -1033,116 +1136,50 @@ EXPORT_SYMBOL(iunique);
1033 1136
1034struct inode *igrab(struct inode *inode) 1137struct inode *igrab(struct inode *inode)
1035{ 1138{
1036 spin_lock(&inode_lock); 1139 spin_lock(&inode->i_lock);
1037 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) 1140 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1038 __iget(inode); 1141 __iget(inode);
1039 else 1142 spin_unlock(&inode->i_lock);
1143 } else {
1144 spin_unlock(&inode->i_lock);
1040 /* 1145 /*
1041 * Handle the case where s_op->clear_inode has not been 1146 * Handle the case where s_op->clear_inode has not been
1042 * called yet, and somebody is calling igrab 1147 * called yet, and somebody is calling igrab
1043 * while the inode is getting freed. 1148 * while the inode is getting freed.
1044 */ 1149 */
1045 inode = NULL; 1150 inode = NULL;
1046 spin_unlock(&inode_lock); 1151 }
1047 return inode; 1152 return inode;
1048} 1153}
1049EXPORT_SYMBOL(igrab); 1154EXPORT_SYMBOL(igrab);
1050 1155
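igrab() now decides entirely under the per-inode lock whether a new reference may be taken: if the teardown flags are already set, the caller gets NULL instead of a reference to a dying inode. A compact analogue of that conditional-grab shape (names illustrative):

    #include <pthread.h>
    #include <stddef.h>

    #define DYING 0x1

    struct obj {
        pthread_mutex_t lock;
        unsigned int state;
        int refs;
    };

    /* Returns o with an extra reference, or NULL if o is being freed. */
    static struct obj *obj_grab(struct obj *o)
    {
        struct obj *ret = o;

        pthread_mutex_lock(&o->lock);
        if (o->state & DYING)
            ret = NULL;         /* too late: the freer owns it now */
        else
            o->refs++;
        pthread_mutex_unlock(&o->lock);
        return ret;
    }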
1051/** 1156/**
1052 * ifind - internal function, you want ilookup5() or iget5().
1053 * @sb: super block of file system to search
1054 * @head: the head of the list to search
1055 * @test: callback used for comparisons between inodes
1056 * @data: opaque data pointer to pass to @test
1057 * @wait: if true wait for the inode to be unlocked, if false do not
1058 *
1059 * ifind() searches for the inode specified by @data in the inode
1060 * cache. This is a generalized version of ifind_fast() for file systems where
1061 * the inode number is not sufficient for unique identification of an inode.
1062 *
1063 * If the inode is in the cache, the inode is returned with an incremented
1064 * reference count.
1065 *
1066 * Otherwise NULL is returned.
1067 *
1068 * Note, @test is called with the inode_lock held, so can't sleep.
1069 */
1070static struct inode *ifind(struct super_block *sb,
1071 struct hlist_head *head, int (*test)(struct inode *, void *),
1072 void *data, const int wait)
1073{
1074 struct inode *inode;
1075
1076 spin_lock(&inode_lock);
1077 inode = find_inode(sb, head, test, data);
1078 if (inode) {
1079 spin_unlock(&inode_lock);
1080 if (likely(wait))
1081 wait_on_inode(inode);
1082 return inode;
1083 }
1084 spin_unlock(&inode_lock);
1085 return NULL;
1086}
1087
1088/**
1089 * ifind_fast - internal function, you want ilookup() or iget().
1090 * @sb: super block of file system to search
1091 * @head: head of the list to search
1092 * @ino: inode number to search for
1093 *
1094 * ifind_fast() searches for the inode @ino in the inode cache. This is for
1095 * file systems where the inode number is sufficient for unique identification
1096 * of an inode.
1097 *
1098 * If the inode is in the cache, the inode is returned with an incremented
1099 * reference count.
1100 *
1101 * Otherwise NULL is returned.
1102 */
1103static struct inode *ifind_fast(struct super_block *sb,
1104 struct hlist_head *head, unsigned long ino)
1105{
1106 struct inode *inode;
1107
1108 spin_lock(&inode_lock);
1109 inode = find_inode_fast(sb, head, ino);
1110 if (inode) {
1111 spin_unlock(&inode_lock);
1112 wait_on_inode(inode);
1113 return inode;
1114 }
1115 spin_unlock(&inode_lock);
1116 return NULL;
1117}
1118
1119/**
1120 * ilookup5_nowait - search for an inode in the inode cache 1157 * ilookup5_nowait - search for an inode in the inode cache
1121 * @sb: super block of file system to search 1158 * @sb: super block of file system to search
1122 * @hashval: hash value (usually inode number) to search for 1159 * @hashval: hash value (usually inode number) to search for
1123 * @test: callback used for comparisons between inodes 1160 * @test: callback used for comparisons between inodes
1124 * @data: opaque data pointer to pass to @test 1161 * @data: opaque data pointer to pass to @test
1125 * 1162 *
1126 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1163 * Search for the inode specified by @hashval and @data in the inode cache.
1127 * @data in the inode cache. This is a generalized version of ilookup() for
1128 * file systems where the inode number is not sufficient for unique
1129 * identification of an inode.
1130 *
1131 * If the inode is in the cache, the inode is returned with an incremented 1164 * If the inode is in the cache, the inode is returned with an incremented
1132 * reference count. Note, the inode lock is not waited upon so you have to be 1165 * reference count.
1133 * very careful what you do with the returned inode. You probably should be
1134 * using ilookup5() instead.
1135 * 1166 *
1136 * Otherwise NULL is returned. 1167 * Note: I_NEW is not waited upon so you have to be very careful what you do
1168 * with the returned inode. You probably should be using ilookup5() instead.
1137 * 1169 *
1138 * Note, @test is called with the inode_lock held, so can't sleep. 1170 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
1139 */ 1171 */
1140struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 1172struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1141 int (*test)(struct inode *, void *), void *data) 1173 int (*test)(struct inode *, void *), void *data)
1142{ 1174{
1143 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1175 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1176 struct inode *inode;
1177
1178 spin_lock(&inode_hash_lock);
1179 inode = find_inode(sb, head, test, data);
1180 spin_unlock(&inode_hash_lock);
1144 1181
1145 return ifind(sb, head, test, data, 0); 1182 return inode;
1146} 1183}
1147EXPORT_SYMBOL(ilookup5_nowait); 1184EXPORT_SYMBOL(ilookup5_nowait);
1148 1185
@@ -1153,24 +1190,24 @@ EXPORT_SYMBOL(ilookup5_nowait);
1153 * @test: callback used for comparisons between inodes 1190 * @test: callback used for comparisons between inodes
1154 * @data: opaque data pointer to pass to @test 1191 * @data: opaque data pointer to pass to @test
1155 * 1192 *
1156 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1193 * Search for the inode specified by @hashval and @data in the inode cache,
1157 * @data in the inode cache. This is a generalized version of ilookup() for 1194 * and if the inode is in the cache, return the inode with an incremented
1158 * file systems where the inode number is not sufficient for unique 1195 * reference count. Waits on I_NEW before returning the inode.
1159 * identification of an inode.
1160 *
1161 * If the inode is in the cache, the inode lock is waited upon and the inode is
1162 * returned with an incremented reference count. 1196 * returned with an incremented reference count.
1163 * 1197 *
1164 * Otherwise NULL is returned. 1198 * This is a generalized version of ilookup() for file systems where the
1199 * inode number is not sufficient for unique identification of an inode.
1165 * 1200 *
1166 * Note, @test is called with the inode_lock held, so can't sleep. 1201 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1167 */ 1202 */
1168struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1203struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1169 int (*test)(struct inode *, void *), void *data) 1204 int (*test)(struct inode *, void *), void *data)
1170{ 1205{
1171 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1206 struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
1172 1207
1173 return ifind(sb, head, test, data, 1); 1208 if (inode)
1209 wait_on_inode(inode);
1210 return inode;
1174} 1211}
1175EXPORT_SYMBOL(ilookup5); 1212EXPORT_SYMBOL(ilookup5);
1176 1213
@@ -1179,91 +1216,23 @@ EXPORT_SYMBOL(ilookup5);
1179 * @sb: super block of file system to search 1216 * @sb: super block of file system to search
1180 * @ino: inode number to search for 1217 * @ino: inode number to search for
1181 * 1218 *
1182 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. 1219 * Search for the inode @ino in the inode cache, and if the inode is in the
1183 * This is for file systems where the inode number is sufficient for unique 1220 * cache, the inode is returned with an incremented reference count.
1184 * identification of an inode.
1185 *
1186 * If the inode is in the cache, the inode is returned with an incremented
1187 * reference count.
1188 *
1189 * Otherwise NULL is returned.
1190 */ 1221 */
1191struct inode *ilookup(struct super_block *sb, unsigned long ino) 1222struct inode *ilookup(struct super_block *sb, unsigned long ino)
1192{ 1223{
1193 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1224 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1194
1195 return ifind_fast(sb, head, ino);
1196}
1197EXPORT_SYMBOL(ilookup);
1198
1199/**
1200 * iget5_locked - obtain an inode from a mounted file system
1201 * @sb: super block of file system
1202 * @hashval: hash value (usually inode number) to get
1203 * @test: callback used for comparisons between inodes
1204 * @set: callback used to initialize a new struct inode
1205 * @data: opaque data pointer to pass to @test and @set
1206 *
1207 * iget5_locked() uses ifind() to search for the inode specified by @hashval
1208 * and @data in the inode cache and if present it is returned with an increased
1209 * reference count. This is a generalized version of iget_locked() for file
1210 * systems where the inode number is not sufficient for unique identification
1211 * of an inode.
1212 *
1213 * If the inode is not in cache, get_new_inode() is called to allocate a new
1214 * inode and this is returned locked, hashed, and with the I_NEW flag set. The
1215 * file system gets to fill it in before unlocking it via unlock_new_inode().
1216 *
1217 * Note both @test and @set are called with the inode_lock held, so can't sleep.
1218 */
1219struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
1220 int (*test)(struct inode *, void *),
1221 int (*set)(struct inode *, void *), void *data)
1222{
1223 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1224 struct inode *inode; 1225 struct inode *inode;
1225 1226
1226 inode = ifind(sb, head, test, data, 1); 1227 spin_lock(&inode_hash_lock);
1227 if (inode) 1228 inode = find_inode_fast(sb, head, ino);
1228 return inode; 1229 spin_unlock(&inode_hash_lock);
1229 /*
1230 * get_new_inode() will do the right thing, re-trying the search
1231 * in case it had to block at any point.
1232 */
1233 return get_new_inode(sb, head, test, set, data);
1234}
1235EXPORT_SYMBOL(iget5_locked);
1236
1237/**
1238 * iget_locked - obtain an inode from a mounted file system
1239 * @sb: super block of file system
1240 * @ino: inode number to get
1241 *
1242 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
1243 * the inode cache and if present it is returned with an increased reference
1244 * count. This is for file systems where the inode number is sufficient for
1245 * unique identification of an inode.
1246 *
1247 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
1248 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
1249 * The file system gets to fill it in before unlocking it via
1250 * unlock_new_inode().
1251 */
1252struct inode *iget_locked(struct super_block *sb, unsigned long ino)
1253{
1254 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1255 struct inode *inode;
1256 1230
1257 inode = ifind_fast(sb, head, ino);
1258 if (inode) 1231 if (inode)
1259 return inode; 1232 wait_on_inode(inode);
1260 /* 1233 return inode;
1261 * get_new_inode_fast() will do the right thing, re-trying the search
1262 * in case it had to block at any point.
1263 */
1264 return get_new_inode_fast(sb, head, ino);
1265} 1234}
1266EXPORT_SYMBOL(iget_locked); 1235EXPORT_SYMBOL(ilookup);
1267 1236
1268int insert_inode_locked(struct inode *inode) 1237int insert_inode_locked(struct inode *inode)
1269{ 1238{
@@ -1271,27 +1240,33 @@ int insert_inode_locked(struct inode *inode)
1271 ino_t ino = inode->i_ino; 1240 ino_t ino = inode->i_ino;
1272 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1241 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1273 1242
1274 inode->i_state |= I_NEW;
1275 while (1) { 1243 while (1) {
1276 struct hlist_node *node; 1244 struct hlist_node *node;
1277 struct inode *old = NULL; 1245 struct inode *old = NULL;
1278 spin_lock(&inode_lock); 1246 spin_lock(&inode_hash_lock);
1279 hlist_for_each_entry(old, node, head, i_hash) { 1247 hlist_for_each_entry(old, node, head, i_hash) {
1280 if (old->i_ino != ino) 1248 if (old->i_ino != ino)
1281 continue; 1249 continue;
1282 if (old->i_sb != sb) 1250 if (old->i_sb != sb)
1283 continue; 1251 continue;
1284 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1252 spin_lock(&old->i_lock);
1253 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1254 spin_unlock(&old->i_lock);
1285 continue; 1255 continue;
1256 }
1286 break; 1257 break;
1287 } 1258 }
1288 if (likely(!node)) { 1259 if (likely(!node)) {
1260 spin_lock(&inode->i_lock);
1261 inode->i_state |= I_NEW;
1289 hlist_add_head(&inode->i_hash, head); 1262 hlist_add_head(&inode->i_hash, head);
1290 spin_unlock(&inode_lock); 1263 spin_unlock(&inode->i_lock);
1264 spin_unlock(&inode_hash_lock);
1291 return 0; 1265 return 0;
1292 } 1266 }
1293 __iget(old); 1267 __iget(old);
1294 spin_unlock(&inode_lock); 1268 spin_unlock(&old->i_lock);
1269 spin_unlock(&inode_hash_lock);
1295 wait_on_inode(old); 1270 wait_on_inode(old);
1296 if (unlikely(!inode_unhashed(old))) { 1271 if (unlikely(!inode_unhashed(old))) {
1297 iput(old); 1272 iput(old);
@@ -1308,29 +1283,34 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1308 struct super_block *sb = inode->i_sb; 1283 struct super_block *sb = inode->i_sb;
1309 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1284 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1310 1285
1311 inode->i_state |= I_NEW;
1312
1313 while (1) { 1286 while (1) {
1314 struct hlist_node *node; 1287 struct hlist_node *node;
1315 struct inode *old = NULL; 1288 struct inode *old = NULL;
1316 1289
1317 spin_lock(&inode_lock); 1290 spin_lock(&inode_hash_lock);
1318 hlist_for_each_entry(old, node, head, i_hash) { 1291 hlist_for_each_entry(old, node, head, i_hash) {
1319 if (old->i_sb != sb) 1292 if (old->i_sb != sb)
1320 continue; 1293 continue;
1321 if (!test(old, data)) 1294 if (!test(old, data))
1322 continue; 1295 continue;
1323 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1296 spin_lock(&old->i_lock);
1297 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1298 spin_unlock(&old->i_lock);
1324 continue; 1299 continue;
1300 }
1325 break; 1301 break;
1326 } 1302 }
1327 if (likely(!node)) { 1303 if (likely(!node)) {
1304 spin_lock(&inode->i_lock);
1305 inode->i_state |= I_NEW;
1328 hlist_add_head(&inode->i_hash, head); 1306 hlist_add_head(&inode->i_hash, head);
1329 spin_unlock(&inode_lock); 1307 spin_unlock(&inode->i_lock);
1308 spin_unlock(&inode_hash_lock);
1330 return 0; 1309 return 0;
1331 } 1310 }
1332 __iget(old); 1311 __iget(old);
1333 spin_unlock(&inode_lock); 1312 spin_unlock(&old->i_lock);
1313 spin_unlock(&inode_hash_lock);
1334 wait_on_inode(old); 1314 wait_on_inode(old);
1335 if (unlikely(!inode_unhashed(old))) { 1315 if (unlikely(!inode_unhashed(old))) {
1336 iput(old); 1316 iput(old);
@@ -1375,47 +1355,35 @@ static void iput_final(struct inode *inode)
1375 const struct super_operations *op = inode->i_sb->s_op; 1355 const struct super_operations *op = inode->i_sb->s_op;
1376 int drop; 1356 int drop;
1377 1357
1358 WARN_ON(inode->i_state & I_NEW);
1359
1378 if (op && op->drop_inode) 1360 if (op && op->drop_inode)
1379 drop = op->drop_inode(inode); 1361 drop = op->drop_inode(inode);
1380 else 1362 else
1381 drop = generic_drop_inode(inode); 1363 drop = generic_drop_inode(inode);
1382 1364
1365 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1366 inode->i_state |= I_REFERENCED;
1367 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1368 inode_lru_list_add(inode);
1369 spin_unlock(&inode->i_lock);
1370 return;
1371 }
1372
1383 if (!drop) { 1373 if (!drop) {
1384 if (sb->s_flags & MS_ACTIVE) {
1385 inode->i_state |= I_REFERENCED;
1386 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1387 inode_lru_list_add(inode);
1388 }
1389 spin_unlock(&inode_lock);
1390 return;
1391 }
1392 WARN_ON(inode->i_state & I_NEW);
1393 inode->i_state |= I_WILL_FREE; 1374 inode->i_state |= I_WILL_FREE;
1394 spin_unlock(&inode_lock); 1375 spin_unlock(&inode->i_lock);
1395 write_inode_now(inode, 1); 1376 write_inode_now(inode, 1);
1396 spin_lock(&inode_lock); 1377 spin_lock(&inode->i_lock);
1397 WARN_ON(inode->i_state & I_NEW); 1378 WARN_ON(inode->i_state & I_NEW);
1398 inode->i_state &= ~I_WILL_FREE; 1379 inode->i_state &= ~I_WILL_FREE;
1399 __remove_inode_hash(inode);
1400 } 1380 }
1401 1381
1402 WARN_ON(inode->i_state & I_NEW);
1403 inode->i_state |= I_FREEING; 1382 inode->i_state |= I_FREEING;
1404
1405 /*
1406 * Move the inode off the IO lists and LRU once I_FREEING is
1407 * set so that it won't get moved back on there if it is dirty.
1408 */
1409 inode_lru_list_del(inode); 1383 inode_lru_list_del(inode);
1410 list_del_init(&inode->i_wb_list); 1384 spin_unlock(&inode->i_lock);
1411 1385
1412 __inode_sb_list_del(inode);
1413 spin_unlock(&inode_lock);
1414 evict(inode); 1386 evict(inode);
1415 remove_inode_hash(inode);
1416 wake_up_inode(inode);
1417 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1418 destroy_inode(inode);
1419} 1387}
1420 1388
1421/** 1389/**
@@ -1432,7 +1400,7 @@ void iput(struct inode *inode)
1432 if (inode) { 1400 if (inode) {
1433 BUG_ON(inode->i_state & I_CLEAR); 1401 BUG_ON(inode->i_state & I_CLEAR);
1434 1402
1435 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1403 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1436 iput_final(inode); 1404 iput_final(inode);
1437 } 1405 }
1438} 1406}
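iput() keeps the atomic_dec_and_lock() idiom, retargeted from inode_lock to inode->i_lock: drop the count locklessly unless this put could be the last one, and in that case take the lock so the zero transition and iput_final() happen atomically. A C11 approximation of the primitive (illustrative only, not the kernel's implementation):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    /* Decrement *cnt; if it reached zero, return true with *lock held. */
    static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
    {
        int v = atomic_load(cnt);

        /* Fast path: clearly not the last reference, no lock needed. */
        while (v > 1) {
            if (atomic_compare_exchange_weak(cnt, &v, v - 1))
                return false;
        }
        /* This may be the last put: serialise the zero transition. */
        pthread_mutex_lock(lock);
        if (atomic_fetch_sub(cnt, 1) == 1)
            return true;        /* caller tears down, then unlocks */
        pthread_mutex_unlock(lock);
        return false;
    }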
@@ -1611,9 +1579,8 @@ EXPORT_SYMBOL(inode_wait);
1611 * to recheck inode state. 1579 * to recheck inode state.
1612 * 1580 *
1613 * It doesn't matter if I_NEW is not set initially, a call to 1581 * It doesn't matter if I_NEW is not set initially, a call to
1614 * wake_up_inode() after removing from the hash list will DTRT. 1582 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
1615 * 1583 * will DTRT.
1616 * This is called with inode_lock held.
1617 */ 1584 */
1618static void __wait_on_freeing_inode(struct inode *inode) 1585static void __wait_on_freeing_inode(struct inode *inode)
1619{ 1586{
@@ -1621,10 +1588,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
1621 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1588 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1622 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1589 wq = bit_waitqueue(&inode->i_state, __I_NEW);
1623 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1590 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1624 spin_unlock(&inode_lock); 1591 spin_unlock(&inode->i_lock);
1592 spin_unlock(&inode_hash_lock);
1625 schedule(); 1593 schedule();
1626 finish_wait(wq, &wait.wait); 1594 finish_wait(wq, &wait.wait);
1627 spin_lock(&inode_lock); 1595 spin_lock(&inode_hash_lock);
1628} 1596}
1629 1597
1630static __initdata unsigned long ihash_entries; 1598static __initdata unsigned long ihash_entries;
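
The hunk above drops both inode->i_lock and inode_hash_lock before sleeping on __I_NEW, which is why hash-walkers must revalidate after waking. The waker half referred to in the rewritten comment pairs like this (a sketch; in this series the wake-up is issued from the eviction path once the inode is unhashed):

	/* Sketch: waking __wait_on_freeing_inode() waiters. */
	remove_inode_hash(inode);		/* unhash first ... */
	smp_mb();				/* ... publish before waking */
	wake_up_bit(&inode->i_state, __I_NEW);	/* woken walkers now re-miss */
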
diff --git a/fs/internal.h b/fs/internal.h
index 8318059b42c6..b29c46e4e32f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -125,6 +125,13 @@ extern long do_handle_open(int mountdirfd,
125/* 125/*
126 * inode.c 126 * inode.c
127 */ 127 */
128extern spinlock_t inode_sb_list_lock;
129
130/*
131 * fs-writeback.c
132 */
133extern void inode_wb_list_del(struct inode *inode);
134
128extern int get_nr_dirty_inodes(void); 135extern int get_nr_dirty_inodes(void);
129extern void evict_inodes(struct super_block *); 136extern void evict_inodes(struct super_block *);
130extern int invalidate_inodes(struct super_block *, bool); 137extern int invalidate_inodes(struct super_block *, bool);
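
The inode_sb_list_lock exported above takes over the slice of inode_lock that guarded the per-super inode list. A sketch of the iteration pattern callers such as evict_inodes() follow after the split (trimmed to the locking; the per-inode checks vary by caller):

	/* Sketch: walking sb->s_inodes under inode_sb_list_lock. */
	spin_lock(&inode_sb_list_lock);
	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		/* ... per-caller disposal work ... */
		spin_unlock(&inode->i_lock);
	}
	spin_unlock(&inode_sb_list_lock);
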
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 4f9cc0482949..3e93cdd19005 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
31 * is used to release xattr name/value pair and detach from c->xattrindex. 31 * is used to release xattr name/value pair and detach from c->xattrindex.
32 * reclaim_xattr_datum(c) 32 * reclaim_xattr_datum(c)
33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when 33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
34 * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold 34 * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
35 * is hard coded as 32KiB. 35 * is hard coded as 32KiB.
36 * do_verify_xattr_datum(c, xd) 36 * do_verify_xattr_datum(c, xd)
37 * is used to load the xdatum information without name/value pair from the medium. 37 * is used to load the xdatum information without name/value pair from the medium.
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 03b8c240aeda..edfea7a3a747 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
293 return ret; 293 return ret;
294} 294}
295 295
296/* called with inode_lock held */ 296/* called with inode->i_lock held */
297static int logfs_drop_inode(struct inode *inode) 297static int logfs_drop_inode(struct inode *inode)
298{ 298{
299 struct logfs_super *super = logfs_super(inode->i_sb); 299 struct logfs_super *super = logfs_super(inode->i_sb);
diff --git a/fs/namei.c b/fs/namei.c
index d0066e17d45d..3cb616d38d9c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -992,6 +992,12 @@ int follow_down_one(struct path *path)
992 return 0; 992 return 0;
993} 993}
994 994
995static inline bool managed_dentry_might_block(struct dentry *dentry)
996{
997 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
998 dentry->d_op->d_manage(dentry, true) < 0);
999}
1000
995/* 1001/*
996 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we 1002 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
997 * meet a managed dentry and we're not walking to "..". True is returned to 1003 * meet a managed dentry and we're not walking to "..". True is returned to
@@ -1000,19 +1006,26 @@ int follow_down_one(struct path *path)
1000static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 1006static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1001 struct inode **inode, bool reverse_transit) 1007 struct inode **inode, bool reverse_transit)
1002{ 1008{
1003 while (d_mountpoint(path->dentry)) { 1009 for (;;) {
1004 struct vfsmount *mounted; 1010 struct vfsmount *mounted;
1005 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && 1011 /*
1006 !reverse_transit && 1012 * Don't forget we might have a non-mountpoint managed dentry
1007 path->dentry->d_op->d_manage(path->dentry, true) < 0) 1013 * that wants to block transit.
1014 */
1015 *inode = path->dentry->d_inode;
1016 if (!reverse_transit &&
1017 unlikely(managed_dentry_might_block(path->dentry)))
1008 return false; 1018 return false;
1019
1020 if (!d_mountpoint(path->dentry))
1021 break;
1022
1009 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 1023 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1010 if (!mounted) 1024 if (!mounted)
1011 break; 1025 break;
1012 path->mnt = mounted; 1026 path->mnt = mounted;
1013 path->dentry = mounted->mnt_root; 1027 path->dentry = mounted->mnt_root;
1014 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 1028 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1015 *inode = path->dentry->d_inode;
1016 } 1029 }
1017 1030
1018 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) 1031 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
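
managed_dentry_might_block() above gives d_manage() its veto before __lookup_mnt() is even consulted, and the new loop samples *inode on every iteration so the caller never sees a stale inode for a managed, non-mountpoint dentry. For flavour, a hypothetical d_manage() that blocks transit during an expiry might look like this (example_info, in_expiry and expiry_wq are made up; autofs4 is the real in-tree user):

	/* Sketch: a d_manage() implementation that can block transit. */
	static int example_d_manage(struct dentry *dentry, bool rcu_walk)
	{
		struct example_info *info = dentry->d_fsdata;	/* hypothetical */

		if (rcu_walk)	/* may not sleep: punt to ref-walk */
			return info->in_expiry ? -ECHILD : 0;
		if (info->in_expiry)
			wait_event(info->expiry_wq, !info->in_expiry);
		return 0;
	}
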
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index abdf38d5971d..7237672216c8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -44,6 +44,7 @@
44/* #define NFS_DEBUG_VERBOSE 1 */ 44/* #define NFS_DEBUG_VERBOSE 1 */
45 45
46static int nfs_opendir(struct inode *, struct file *); 46static int nfs_opendir(struct inode *, struct file *);
47static int nfs_closedir(struct inode *, struct file *);
47static int nfs_readdir(struct file *, void *, filldir_t); 48static int nfs_readdir(struct file *, void *, filldir_t);
48static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); 49static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *);
49static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); 50static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *);
@@ -64,7 +65,7 @@ const struct file_operations nfs_dir_operations = {
64 .read = generic_read_dir, 65 .read = generic_read_dir,
65 .readdir = nfs_readdir, 66 .readdir = nfs_readdir,
66 .open = nfs_opendir, 67 .open = nfs_opendir,
67 .release = nfs_release, 68 .release = nfs_closedir,
68 .fsync = nfs_fsync_dir, 69 .fsync = nfs_fsync_dir,
69}; 70};
70 71
@@ -133,13 +134,35 @@ const struct inode_operations nfs4_dir_inode_operations = {
133 134
134#endif /* CONFIG_NFS_V4 */ 135#endif /* CONFIG_NFS_V4 */
135 136
137static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred)
138{
139 struct nfs_open_dir_context *ctx;
140 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
141 if (ctx != NULL) {
142 ctx->duped = 0;
143 ctx->dir_cookie = 0;
144 ctx->dup_cookie = 0;
145 ctx->cred = get_rpccred(cred);
146 } else
147 ctx = ERR_PTR(-ENOMEM);
148 return ctx;
149}
150
151static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
152{
153 put_rpccred(ctx->cred);
154 kfree(ctx);
155}
156
136/* 157/*
137 * Open file 158 * Open file
138 */ 159 */
139static int 160static int
140nfs_opendir(struct inode *inode, struct file *filp) 161nfs_opendir(struct inode *inode, struct file *filp)
141{ 162{
142 int res; 163 int res = 0;
164 struct nfs_open_dir_context *ctx;
165 struct rpc_cred *cred;
143 166
144 dfprintk(FILE, "NFS: open dir(%s/%s)\n", 167 dfprintk(FILE, "NFS: open dir(%s/%s)\n",
145 filp->f_path.dentry->d_parent->d_name.name, 168 filp->f_path.dentry->d_parent->d_name.name,
@@ -147,8 +170,15 @@ nfs_opendir(struct inode *inode, struct file *filp)
147 170
148 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 171 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
149 172
150 /* Call generic open code in order to cache credentials */ 173 cred = rpc_lookup_cred();
151 res = nfs_open(inode, filp); 174 if (IS_ERR(cred))
175 return PTR_ERR(cred);
176 ctx = alloc_nfs_open_dir_context(cred);
177 if (IS_ERR(ctx)) {
178 res = PTR_ERR(ctx);
179 goto out;
180 }
181 filp->private_data = ctx;
152 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { 182 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
153 /* This is a mountpoint, so d_revalidate will never 183 /* This is a mountpoint, so d_revalidate will never
154 * have been called, so we need to refresh the 184 * have been called, so we need to refresh the
@@ -156,9 +186,18 @@ nfs_opendir(struct inode *inode, struct file *filp)
156 */ 186 */
157 __nfs_revalidate_inode(NFS_SERVER(inode), inode); 187 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
158 } 188 }
189out:
190 put_rpccred(cred);
159 return res; 191 return res;
160} 192}
161 193
194static int
195nfs_closedir(struct inode *inode, struct file *filp)
196{
197 put_nfs_open_dir_context(filp->private_data);
198 return 0;
199}
200
162struct nfs_cache_array_entry { 201struct nfs_cache_array_entry {
163 u64 cookie; 202 u64 cookie;
164 u64 ino; 203 u64 ino;
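
The context allocated in nfs_opendir() above replaces the dir_cookie that previously lived in the shared struct nfs_open_context (see the fs/nfs/inode.c hunk later in this diff). Its shape, as implied by the allocator and the readdir hunks below (the authoritative definition lands in include/linux/nfs_fs.h; this is a sketch):

	/* Sketch: per-open directory state introduced by this patch. */
	struct nfs_open_dir_context {
		struct rpc_cred *cred;	/* credential cached at open time */
		__u64 dir_cookie;	/* current READDIR position */
		__u64 dup_cookie;	/* cookie at a suspected loop */
		int duped;		/* set when f_pos moved backwards */
	};
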
@@ -284,19 +323,20 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
284{ 323{
285 loff_t diff = desc->file->f_pos - desc->current_index; 324 loff_t diff = desc->file->f_pos - desc->current_index;
286 unsigned int index; 325 unsigned int index;
326 struct nfs_open_dir_context *ctx = desc->file->private_data;
287 327
288 if (diff < 0) 328 if (diff < 0)
289 goto out_eof; 329 goto out_eof;
290 if (diff >= array->size) { 330 if (diff >= array->size) {
291 if (array->eof_index >= 0) 331 if (array->eof_index >= 0)
292 goto out_eof; 332 goto out_eof;
293 desc->current_index += array->size;
294 return -EAGAIN; 333 return -EAGAIN;
295 } 334 }
296 335
297 index = (unsigned int)diff; 336 index = (unsigned int)diff;
298 *desc->dir_cookie = array->array[index].cookie; 337 *desc->dir_cookie = array->array[index].cookie;
299 desc->cache_entry_index = index; 338 desc->cache_entry_index = index;
339 ctx->duped = 0;
300 return 0; 340 return 0;
301out_eof: 341out_eof:
302 desc->eof = 1; 342 desc->eof = 1;
@@ -307,10 +347,18 @@ static
307int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 347int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
308{ 348{
309 int i; 349 int i;
350 loff_t new_pos;
310 int status = -EAGAIN; 351 int status = -EAGAIN;
352 struct nfs_open_dir_context *ctx = desc->file->private_data;
311 353
312 for (i = 0; i < array->size; i++) { 354 for (i = 0; i < array->size; i++) {
313 if (array->array[i].cookie == *desc->dir_cookie) { 355 if (array->array[i].cookie == *desc->dir_cookie) {
356 new_pos = desc->current_index + i;
357 if (new_pos < desc->file->f_pos) {
358 ctx->dup_cookie = *desc->dir_cookie;
359 ctx->duped = 1;
360 }
361 desc->file->f_pos = new_pos;
314 desc->cache_entry_index = i; 362 desc->cache_entry_index = i;
315 return 0; 363 return 0;
316 } 364 }
@@ -342,6 +390,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
342 390
343 if (status == -EAGAIN) { 391 if (status == -EAGAIN) {
344 desc->last_cookie = array->last_cookie; 392 desc->last_cookie = array->last_cookie;
393 desc->current_index += array->size;
345 desc->page_index++; 394 desc->page_index++;
346 } 395 }
347 nfs_readdir_release_array(desc->page); 396 nfs_readdir_release_array(desc->page);
@@ -354,7 +403,8 @@ static
354int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, 403int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
355 struct nfs_entry *entry, struct file *file, struct inode *inode) 404 struct nfs_entry *entry, struct file *file, struct inode *inode)
356{ 405{
357 struct rpc_cred *cred = nfs_file_cred(file); 406 struct nfs_open_dir_context *ctx = file->private_data;
407 struct rpc_cred *cred = ctx->cred;
358 unsigned long timestamp, gencount; 408 unsigned long timestamp, gencount;
359 int error; 409 int error;
360 410
@@ -693,6 +743,20 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
693 int i = 0; 743 int i = 0;
694 int res = 0; 744 int res = 0;
695 struct nfs_cache_array *array = NULL; 745 struct nfs_cache_array *array = NULL;
746 struct nfs_open_dir_context *ctx = file->private_data;
747
748 if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) {
749 if (printk_ratelimit()) {
750 pr_notice("NFS: directory %s/%s contains a readdir loop. "
751 "Please contact your server vendor. "
752 "Offending cookie: %llu\n",
753 file->f_dentry->d_parent->d_name.name,
754 file->f_dentry->d_name.name,
755 *desc->dir_cookie);
756 }
757 res = -ELOOP;
758 goto out;
759 }
696 760
697 array = nfs_readdir_get_array(desc->page); 761 array = nfs_readdir_get_array(desc->page);
698 if (IS_ERR(array)) { 762 if (IS_ERR(array)) {
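
The -ELOOP path above fires when a cookie the client already consumed comes back around: nfs_readdir_search_for_cookie() records dup_cookie whenever resolving a cookie moves f_pos backwards, and nfs_do_filldir() then refuses to emit entries for that cookie a second time. A hypothetical trace (cookie values made up):

	/* Trace of the loop detector, hypothetical values:
	 *   f_pos = 30, *dir_cookie = C9	(resume after cache flush)
	 *   search_for_cookie finds C9 at new_pos = 10 < 30
	 *	-> dup_cookie = C9, duped = 1, f_pos = 10
	 *   nfs_do_filldir sees duped && dup_cookie == *dir_cookie
	 *	-> pr_notice("... readdir loop ..."), return -ELOOP
	 * A deliberate rewind via llseek clears duped (nfs_llseek_dir below),
	 * so only server-side cookie reuse trips the check. */
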
@@ -785,6 +849,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
785 struct inode *inode = dentry->d_inode; 849 struct inode *inode = dentry->d_inode;
786 nfs_readdir_descriptor_t my_desc, 850 nfs_readdir_descriptor_t my_desc,
787 *desc = &my_desc; 851 *desc = &my_desc;
852 struct nfs_open_dir_context *dir_ctx = filp->private_data;
788 int res; 853 int res;
789 854
790 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 855 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -801,7 +866,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
801 memset(desc, 0, sizeof(*desc)); 866 memset(desc, 0, sizeof(*desc));
802 867
803 desc->file = filp; 868 desc->file = filp;
804 desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; 869 desc->dir_cookie = &dir_ctx->dir_cookie;
805 desc->decode = NFS_PROTO(inode)->decode_dirent; 870 desc->decode = NFS_PROTO(inode)->decode_dirent;
806 desc->plus = NFS_USE_READDIRPLUS(inode); 871 desc->plus = NFS_USE_READDIRPLUS(inode);
807 872
@@ -853,6 +918,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
853{ 918{
854 struct dentry *dentry = filp->f_path.dentry; 919 struct dentry *dentry = filp->f_path.dentry;
855 struct inode *inode = dentry->d_inode; 920 struct inode *inode = dentry->d_inode;
921 struct nfs_open_dir_context *dir_ctx = filp->private_data;
856 922
857 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", 923 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
858 dentry->d_parent->d_name.name, 924 dentry->d_parent->d_name.name,
@@ -872,7 +938,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
872 } 938 }
873 if (offset != filp->f_pos) { 939 if (offset != filp->f_pos) {
874 filp->f_pos = offset; 940 filp->f_pos = offset;
875 nfs_file_open_context(filp)->dir_cookie = 0; 941 dir_ctx->dir_cookie = 0;
942 dir_ctx->duped = 0;
876 } 943 }
877out: 944out:
878 mutex_unlock(&inode->i_mutex); 945 mutex_unlock(&inode->i_mutex);
@@ -1068,7 +1135,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
1068 if (fhandle == NULL || fattr == NULL) 1135 if (fhandle == NULL || fattr == NULL)
1069 goto out_error; 1136 goto out_error;
1070 1137
1071 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1138 error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
1072 if (error) 1139 if (error)
1073 goto out_bad; 1140 goto out_bad;
1074 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1141 if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1224,7 +1291,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1224 parent = dentry->d_parent; 1291 parent = dentry->d_parent;
1225 /* Protect against concurrent sillydeletes */ 1292 /* Protect against concurrent sillydeletes */
1226 nfs_block_sillyrename(parent); 1293 nfs_block_sillyrename(parent);
1227 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1294 error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
1228 if (error == -ENOENT) 1295 if (error == -ENOENT)
1229 goto no_entry; 1296 goto no_entry;
1230 if (error < 0) { 1297 if (error < 0) {
@@ -1562,7 +1629,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1562 if (dentry->d_inode) 1629 if (dentry->d_inode)
1563 goto out; 1630 goto out;
1564 if (fhandle->size == 0) { 1631 if (fhandle->size == 0) {
1565 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1632 error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
1566 if (error) 1633 if (error)
1567 goto out_error; 1634 goto out_error;
1568 } 1635 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d85a534b15cd..3ac5bd695e5e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -326,6 +326,9 @@ nfs_file_fsync(struct file *file, int datasync)
326 ret = xchg(&ctx->error, 0); 326 ret = xchg(&ctx->error, 0);
327 if (!ret && status < 0) 327 if (!ret && status < 0)
328 ret = status; 328 ret = status;
329 if (!ret && !datasync)
330 /* application has asked for meta-data sync */
331 ret = pnfs_layoutcommit_inode(inode, true);
329 return ret; 332 return ret;
330} 333}
331 334
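
Gating the LAYOUTCOMMIT on !datasync above matches fsync semantics: layoutcommit publishes the provisional size/mtime updates, which are metadata, so fdatasync() may skip it. The resulting contract, as a sketch (pnfs_layoutcommit_inode() with sync == true blocks until the MDS replies):

	/* fsync(fd)     -> datasync == 0 -> flush/COMMIT data, then sync
	 *		    LAYOUTCOMMIT to publish size/mtime on the MDS
	 * fdatasync(fd) -> datasync == 1 -> flush/COMMIT data only; the
	 *		    attribute updates stay pending on the client */
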
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 1084792bc0fe..dcb61548887f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -222,6 +222,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
222 goto out; 222 goto out;
223 } 223 }
224 224
225 if (fattr->valid & NFS_ATTR_FATTR_FSID &&
226 !nfs_fsid_equal(&server->fsid, &fattr->fsid))
227 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
228
225 inode = nfs_fhget(sb, mntfh, fattr); 229 inode = nfs_fhget(sb, mntfh, fattr);
226 if (IS_ERR(inode)) { 230 if (IS_ERR(inode)) {
227 dprintk("nfs_get_root: get root inode failed\n"); 231 dprintk("nfs_get_root: get root inode failed\n");
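
Refreshing server->fsid from the root's attributes matters because every later nfs_fhget() (next hunk) compares fattr->fsid against it to detect mountpoint crossings; without this update a root probed with a stale fsid would flag itself as a mountpoint. The comparison is the stock helper (sketch of its shape):

	/* Sketch: the fsid pair and equality test used by these checks. */
	struct nfs_fsid {
		__u64 major;
		__u64 minor;
	};

	static inline int nfs_fsid_equal(const struct nfs_fsid *a,
					 const struct nfs_fsid *b)
	{
		return a->major == b->major && a->minor == b->minor;
	}
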
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 01768e5e2c9b..57bb31ad7a5e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -254,7 +254,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
254 struct inode *inode = ERR_PTR(-ENOENT); 254 struct inode *inode = ERR_PTR(-ENOENT);
255 unsigned long hash; 255 unsigned long hash;
256 256
257 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) 257 nfs_attr_check_mountpoint(sb, fattr);
258
259 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0)
258 goto out_no_inode; 260 goto out_no_inode;
259 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) 261 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
260 goto out_no_inode; 262 goto out_no_inode;
@@ -298,8 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
298 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) 300 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
299 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 301 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
300 /* Deal with crossing mountpoints */ 302 /* Deal with crossing mountpoints */
301 if ((fattr->valid & NFS_ATTR_FATTR_FSID) 303 if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
302 && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { 304 fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
303 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 305 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
304 inode->i_op = &nfs_referral_inode_operations; 306 inode->i_op = &nfs_referral_inode_operations;
305 else 307 else
@@ -639,7 +641,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cr
639 ctx->mode = f_mode; 641 ctx->mode = f_mode;
640 ctx->flags = 0; 642 ctx->flags = 0;
641 ctx->error = 0; 643 ctx->error = 0;
642 ctx->dir_cookie = 0;
643 nfs_init_lock_context(&ctx->lock_context); 644 nfs_init_lock_context(&ctx->lock_context);
644 ctx->lock_context.open_context = ctx; 645 ctx->lock_context.open_context = ctx;
645 INIT_LIST_HEAD(&ctx->list); 646 INIT_LIST_HEAD(&ctx->list);
@@ -1471,6 +1472,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
1471 nfsi->delegation_state = 0; 1472 nfsi->delegation_state = 0;
1472 init_rwsem(&nfsi->rwsem); 1473 init_rwsem(&nfsi->rwsem);
1473 nfsi->layout = NULL; 1474 nfsi->layout = NULL;
1475 atomic_set(&nfsi->commits_outstanding, 0);
1474#endif 1476#endif
1475} 1477}
1476 1478
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 72e0bddf7a2f..ce118ce885dd 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -39,6 +39,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
39 return 0; 39 return 0;
40} 40}
41 41
42static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
43{
44 if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
45 fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;
46}
47
42struct nfs_clone_mount { 48struct nfs_clone_mount {
43 const struct super_block *sb; 49 const struct super_block *sb;
44 const struct dentry *dentry; 50 const struct dentry *dentry;
@@ -214,6 +220,7 @@ extern const u32 nfs41_maxwrite_overhead;
214/* nfs4proc.c */ 220/* nfs4proc.c */
215#ifdef CONFIG_NFS_V4 221#ifdef CONFIG_NFS_V4
216extern struct rpc_procinfo nfs4_procedures[]; 222extern struct rpc_procinfo nfs4_procedures[];
223void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *);
217#endif 224#endif
218 225
219extern int nfs4_init_ds_session(struct nfs_client *clp); 226extern int nfs4_init_ds_session(struct nfs_client *clp);
@@ -276,11 +283,25 @@ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
276extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 283extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
277 284
278/* write.c */ 285/* write.c */
286extern void nfs_commit_free(struct nfs_write_data *p);
279extern int nfs_initiate_write(struct nfs_write_data *data, 287extern int nfs_initiate_write(struct nfs_write_data *data,
280 struct rpc_clnt *clnt, 288 struct rpc_clnt *clnt,
281 const struct rpc_call_ops *call_ops, 289 const struct rpc_call_ops *call_ops,
282 int how); 290 int how);
283extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 291extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
292extern int nfs_initiate_commit(struct nfs_write_data *data,
293 struct rpc_clnt *clnt,
294 const struct rpc_call_ops *call_ops,
295 int how);
296extern void nfs_init_commit(struct nfs_write_data *data,
297 struct list_head *head,
298 struct pnfs_layout_segment *lseg);
299void nfs_retry_commit(struct list_head *page_list,
300 struct pnfs_layout_segment *lseg);
301void nfs_commit_clear_lock(struct nfs_inode *nfsi);
302void nfs_commitdata_release(void *data);
303void nfs_commit_release_pages(struct nfs_write_data *data);
304
284#ifdef CONFIG_MIGRATION 305#ifdef CONFIG_MIGRATION
285extern int nfs_migrate_page(struct address_space *, 306extern int nfs_migrate_page(struct address_space *,
286 struct page *, struct page *); 307 struct page *, struct page *);
@@ -296,12 +317,14 @@ extern int nfs4_init_client(struct nfs_client *clp,
296 rpc_authflavor_t authflavour, 317 rpc_authflavor_t authflavour,
297 int noresvport); 318 int noresvport);
298extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); 319extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
299extern int _nfs4_call_sync(struct nfs_server *server, 320extern int _nfs4_call_sync(struct rpc_clnt *clnt,
321 struct nfs_server *server,
300 struct rpc_message *msg, 322 struct rpc_message *msg,
301 struct nfs4_sequence_args *args, 323 struct nfs4_sequence_args *args,
302 struct nfs4_sequence_res *res, 324 struct nfs4_sequence_res *res,
303 int cache_reply); 325 int cache_reply);
304extern int _nfs4_call_sync_session(struct nfs_server *server, 326extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
327 struct nfs_server *server,
305 struct rpc_message *msg, 328 struct rpc_message *msg,
306 struct nfs4_sequence_args *args, 329 struct nfs4_sequence_args *args,
307 struct nfs4_sequence_res *res, 330 struct nfs4_sequence_res *res,
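
The newly exported write.c helpers above form a small pipeline that the file layout driver drives later in this diff: allocate commit data, bind a page list (optionally tagged with a layout segment), then fire the COMMIT at the MDS or a data server. A condensed usage sketch (mirrors filelayout_commit_pagelist() below; error handling trimmed):

	/* Sketch: issuing one COMMIT through the exported helpers. */
	struct nfs_write_data *data = nfs_commitdata_alloc();

	if (!data) {
		nfs_retry_commit(pages, lseg);	/* re-mark for a later pass */
		nfs_commit_clear_lock(NFS_I(inode));
		return -ENOMEM;
	}
	nfs_init_commit(data, pages, lseg);	/* lseg == NULL: commit via MDS */
	return nfs_initiate_commit(data, clnt, data->mds_ops, how);
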
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index bf1c68009ffd..ad92bf731ff5 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -15,6 +15,7 @@
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
17#include <linux/vfs.h> 17#include <linux/vfs.h>
18#include <linux/sunrpc/gss_api.h>
18#include "internal.h" 19#include "internal.h"
19 20
20#define NFSDBG_FACILITY NFSDBG_VFS 21#define NFSDBG_FACILITY NFSDBG_VFS
@@ -27,7 +28,8 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
27 28
28static struct vfsmount *nfs_do_submount(struct dentry *dentry, 29static struct vfsmount *nfs_do_submount(struct dentry *dentry,
29 struct nfs_fh *fh, 30 struct nfs_fh *fh,
30 struct nfs_fattr *fattr); 31 struct nfs_fattr *fattr,
32 rpc_authflavor_t authflavor);
31 33
32/* 34/*
33 * nfs_path - reconstruct the path given an arbitrary dentry 35 * nfs_path - reconstruct the path given an arbitrary dentry
@@ -116,6 +118,100 @@ Elong:
116 return ERR_PTR(-ENAMETOOLONG); 118 return ERR_PTR(-ENAMETOOLONG);
117} 119}
118 120
121#ifdef CONFIG_NFS_V4
122static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors, struct inode *inode)
123{
124 struct gss_api_mech *mech;
125 struct xdr_netobj oid;
126 int i;
127 rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
128
129 for (i = 0; i < flavors->num_flavors; i++) {
130 struct nfs4_secinfo_flavor *flavor;
131 flavor = &flavors->flavors[i];
132
133 if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) {
134 pseudoflavor = flavor->flavor;
135 break;
136 } else if (flavor->flavor == RPC_AUTH_GSS) {
137 oid.len = flavor->gss.sec_oid4.len;
138 oid.data = flavor->gss.sec_oid4.data;
139 mech = gss_mech_get_by_OID(&oid);
140 if (!mech)
141 continue;
142 pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service);
143 gss_mech_put(mech);
144 break;
145 }
146 }
147
148 return pseudoflavor;
149}
150
151static rpc_authflavor_t nfs_negotiate_security(const struct dentry *parent, const struct dentry *dentry)
152{
153 int status = 0;
154 struct page *page;
155 struct nfs4_secinfo_flavors *flavors;
156 int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
157 rpc_authflavor_t flavor = RPC_AUTH_UNIX;
158
159 secinfo = NFS_PROTO(parent->d_inode)->secinfo;
160 if (secinfo != NULL) {
161 page = alloc_page(GFP_KERNEL);
162 if (!page) {
163 status = -ENOMEM;
164 goto out;
165 }
166 flavors = page_address(page);
167 status = secinfo(parent->d_inode, &dentry->d_name, flavors);
168 flavor = nfs_find_best_sec(flavors, dentry->d_inode);
169 put_page(page);
170 }
171
172 return flavor;
173
174out:
175 status = -ENOMEM;
176 return status;
177}
178
179static rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent,
180 struct dentry *dentry, struct path *path,
181 struct nfs_fh *fh, struct nfs_fattr *fattr)
182{
183 rpc_authflavor_t flavor;
184 struct rpc_clnt *clone;
185 struct rpc_auth *auth;
186 int err;
187
188 flavor = nfs_negotiate_security(parent, path->dentry);
189 if (flavor < 0)
190 goto out;
191 clone = rpc_clone_client(server->client);
192 auth = rpcauth_create(flavor, clone);
193 if (!auth) {
194 flavor = -EIO;
195 goto out;
196 }
197 err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode,
198 &path->dentry->d_name,
199 fh, fattr);
200 if (err < 0)
201 flavor = err;
202out:
203 return flavor;
204}
205#else /* CONFIG_NFS_V4 */
206static inline rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server,
207 struct dentry *parent, struct dentry *dentry,
208 struct path *path, struct nfs_fh *fh,
209 struct nfs_fattr *fattr)
210{
211 return -EPERM;
212}
213#endif /* CONFIG_NFS_V4 */
214
119/* 215/*
120 * nfs_d_automount - Handle crossing a mountpoint on the server 216 * nfs_d_automount - Handle crossing a mountpoint on the server
121 * @path - The mountpoint 217 * @path - The mountpoint
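
nfs_find_best_sec() above takes the first AUTH_NULL/AUTH_UNIX entry it meets, or else the first GSS triple whose mechanism OID the client can resolve, so the server's ordering decides. Hypothetical SECINFO replies and the outcome of the loop:

	/* [AUTH_UNIX, GSS(krb5p)]	-> RPC_AUTH_UNIX (first sys/null wins)
	 * [GSS(krb5i), AUTH_UNIX]	-> krb5i pseudoflavor (first usable GSS)
	 * [GSS(unknown OID)]		-> RPC_AUTH_UNIX (the default fallback) */
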
@@ -136,6 +232,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
136 struct nfs_fh *fh = NULL; 232 struct nfs_fh *fh = NULL;
137 struct nfs_fattr *fattr = NULL; 233 struct nfs_fattr *fattr = NULL;
138 int err; 234 int err;
235 rpc_authflavor_t flavor = 1;
139 236
140 dprintk("--> nfs_d_automount()\n"); 237 dprintk("--> nfs_d_automount()\n");
141 238
@@ -153,9 +250,16 @@ struct vfsmount *nfs_d_automount(struct path *path)
153 250
154 /* Look it up again to get its attributes */ 251 /* Look it up again to get its attributes */
155 parent = dget_parent(path->dentry); 252 parent = dget_parent(path->dentry);
156 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 253 err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode,
157 &path->dentry->d_name, 254 &path->dentry->d_name,
158 fh, fattr); 255 fh, fattr);
256 if (err == -EPERM) {
257 flavor = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr);
258 if (flavor < 0)
259 err = flavor;
260 else
261 err = 0;
262 }
159 dput(parent); 263 dput(parent);
160 if (err != 0) { 264 if (err != 0) {
161 mnt = ERR_PTR(err); 265 mnt = ERR_PTR(err);
@@ -165,7 +269,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
165 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 269 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
166 mnt = nfs_do_refmount(path->dentry); 270 mnt = nfs_do_refmount(path->dentry);
167 else 271 else
168 mnt = nfs_do_submount(path->dentry, fh, fattr); 272 mnt = nfs_do_submount(path->dentry, fh, fattr, flavor);
169 if (IS_ERR(mnt)) 273 if (IS_ERR(mnt))
170 goto out; 274 goto out;
171 275
@@ -232,17 +336,20 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
232 * @dentry - parent directory 336 * @dentry - parent directory
233 * @fh - filehandle for new root dentry 337 * @fh - filehandle for new root dentry
234 * @fattr - attributes for new root inode 338 * @fattr - attributes for new root inode
339 * @authflavor - security flavor to use when performing the mount
235 * 340 *
236 */ 341 */
237static struct vfsmount *nfs_do_submount(struct dentry *dentry, 342static struct vfsmount *nfs_do_submount(struct dentry *dentry,
238 struct nfs_fh *fh, 343 struct nfs_fh *fh,
239 struct nfs_fattr *fattr) 344 struct nfs_fattr *fattr,
345 rpc_authflavor_t authflavor)
240{ 346{
241 struct nfs_clone_mount mountdata = { 347 struct nfs_clone_mount mountdata = {
242 .sb = dentry->d_sb, 348 .sb = dentry->d_sb,
243 .dentry = dentry, 349 .dentry = dentry,
244 .fh = fh, 350 .fh = fh,
245 .fattr = fattr, 351 .fattr = fattr,
352 .authflavor = authflavor,
246 }; 353 };
247 struct vfsmount *mnt = ERR_PTR(-ENOMEM); 354 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
248 char *page = (char *) __get_free_page(GFP_USER); 355 char *page = (char *) __get_free_page(GFP_USER);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d0c80d8b3f96..38053d823eb0 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -141,7 +141,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
141} 141}
142 142
143static int 143static int
144nfs3_proc_lookup(struct inode *dir, struct qstr *name, 144nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
145 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 145 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
146{ 146{
147 struct nfs3_diropargs arg = { 147 struct nfs3_diropargs arg = {
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c64be1cff080..e1c261ddd65d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -57,7 +57,8 @@ enum nfs4_session_state {
57struct nfs4_minor_version_ops { 57struct nfs4_minor_version_ops {
58 u32 minor_version; 58 u32 minor_version;
59 59
60 int (*call_sync)(struct nfs_server *server, 60 int (*call_sync)(struct rpc_clnt *clnt,
61 struct nfs_server *server,
61 struct rpc_message *msg, 62 struct rpc_message *msg,
62 struct nfs4_sequence_args *args, 63 struct nfs4_sequence_args *args,
63 struct nfs4_sequence_res *res, 64 struct nfs4_sequence_res *res,
@@ -262,6 +263,8 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
262extern int nfs4_init_session(struct nfs_server *server); 263extern int nfs4_init_session(struct nfs_server *server);
263extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 264extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
264 struct nfs_fsinfo *fsinfo); 265 struct nfs_fsinfo *fsinfo);
266extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
267 bool sync);
265 268
266static inline bool 269static inline bool
267is_ds_only_client(struct nfs_client *clp) 270is_ds_only_client(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 428558464817..6f8192f4cfc7 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -154,6 +154,23 @@ static int filelayout_read_done_cb(struct rpc_task *task,
154} 154}
155 155
156/* 156/*
157 * We reference the rpc_cred of the first WRITE that triggers the need for
158 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
159 * rfc5661 is not clear about which credential should be used.
160 */
161static void
162filelayout_set_layoutcommit(struct nfs_write_data *wdata)
163{
164 if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds ||
165 wdata->res.verf->committed == NFS_FILE_SYNC)
166 return;
167
168 pnfs_set_layoutcommit(wdata);
169 dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
170 (unsigned long) wdata->lseg->pls_end_pos);
171}
172
173/*
157 * Call ops for the async read/write cases 174 * Call ops for the async read/write cases
158 * In the case of dense layouts, the offset needs to be reset to its 175 * In the case of dense layouts, the offset needs to be reset to its
159 * original value. 176 * original value.
@@ -210,6 +227,38 @@ static int filelayout_write_done_cb(struct rpc_task *task,
210 return -EAGAIN; 227 return -EAGAIN;
211 } 228 }
212 229
230 filelayout_set_layoutcommit(data);
231 return 0;
232}
233
234/* Fake up some data that will cause nfs_commit_release to retry the writes. */
235static void prepare_to_resend_writes(struct nfs_write_data *data)
236{
237 struct nfs_page *first = nfs_list_entry(data->pages.next);
238
239 data->task.tk_status = 0;
240 memcpy(data->verf.verifier, first->wb_verf.verifier,
241 sizeof(first->wb_verf.verifier));
242 data->verf.verifier[0]++; /* ensure verifier mismatch */
243}
244
245static int filelayout_commit_done_cb(struct rpc_task *task,
246 struct nfs_write_data *data)
247{
248 int reset = 0;
249
250 if (filelayout_async_handle_error(task, data->args.context->state,
251 data->ds_clp, &reset) == -EAGAIN) {
252 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
253 __func__, data->ds_clp, data->ds_clp->cl_session);
254 if (reset) {
255 prepare_to_resend_writes(data);
256 filelayout_set_lo_fail(data->lseg);
257 } else
258 nfs_restart_rpc(task, data->ds_clp);
259 return -EAGAIN;
260 }
261
213 return 0; 262 return 0;
214} 263}
215 264
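
prepare_to_resend_writes() above recovers from a dead data server by faking a verifier mismatch: the commit release path compares each request's write verifier against the commit's, and a mismatch re-dirties the page so it is rewritten (through the MDS once the lseg is marked failed). A condensed sketch of the consuming check (approximates the write.c release path, not a verbatim copy):

	/* Sketch: why bumping data->verf.verifier[0] forces a resend. */
	if (memcmp(req->wb_verf.verifier, data->verf.verifier,
		   sizeof(data->verf.verifier)) == 0)
		nfs_inode_remove_request(req);	/* stable on the server */
	else
		nfs_mark_request_dirty(req);	/* verifier changed: rewrite */
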
@@ -240,6 +289,16 @@ static void filelayout_write_release(void *data)
240 wdata->mds_ops->rpc_release(data); 289 wdata->mds_ops->rpc_release(data);
241} 290}
242 291
292static void filelayout_commit_release(void *data)
293{
294 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
295
296 nfs_commit_release_pages(wdata);
297 if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding))
298 nfs_commit_clear_lock(NFS_I(wdata->inode));
299 nfs_commitdata_release(wdata);
300}
301
243struct rpc_call_ops filelayout_read_call_ops = { 302struct rpc_call_ops filelayout_read_call_ops = {
244 .rpc_call_prepare = filelayout_read_prepare, 303 .rpc_call_prepare = filelayout_read_prepare,
245 .rpc_call_done = filelayout_read_call_done, 304 .rpc_call_done = filelayout_read_call_done,
@@ -252,6 +311,12 @@ struct rpc_call_ops filelayout_write_call_ops = {
252 .rpc_release = filelayout_write_release, 311 .rpc_release = filelayout_write_release,
253}; 312};
254 313
314struct rpc_call_ops filelayout_commit_call_ops = {
315 .rpc_call_prepare = filelayout_write_prepare,
316 .rpc_call_done = filelayout_write_call_done,
317 .rpc_release = filelayout_commit_release,
318};
319
255static enum pnfs_try_status 320static enum pnfs_try_status
256filelayout_read_pagelist(struct nfs_read_data *data) 321filelayout_read_pagelist(struct nfs_read_data *data)
257{ 322{
@@ -320,10 +385,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
320 data->inode->i_ino, sync, (size_t) data->args.count, offset, 385 data->inode->i_ino, sync, (size_t) data->args.count, offset,
321 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); 386 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
322 387
323 /* We can't handle commit to ds yet */
324 if (!FILELAYOUT_LSEG(lseg)->commit_through_mds)
325 data->args.stable = NFS_FILE_SYNC;
326
327 data->write_done_cb = filelayout_write_done_cb; 388 data->write_done_cb = filelayout_write_done_cb;
328 data->ds_clp = ds->ds_clp; 389 data->ds_clp = ds->ds_clp;
329 fh = nfs4_fl_select_ds_fh(lseg, j); 390 fh = nfs4_fl_select_ds_fh(lseg, j);
@@ -441,12 +502,33 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
441 struct nfs4_layoutget_res *lgr, 502 struct nfs4_layoutget_res *lgr,
442 struct nfs4_deviceid *id) 503 struct nfs4_deviceid *id)
443{ 504{
444 uint32_t *p = (uint32_t *)lgr->layout.buf; 505 struct xdr_stream stream;
506 struct xdr_buf buf = {
507 .pages = lgr->layoutp->pages,
508 .page_len = lgr->layoutp->len,
509 .buflen = lgr->layoutp->len,
510 .len = lgr->layoutp->len,
511 };
512 struct page *scratch;
513 __be32 *p;
445 uint32_t nfl_util; 514 uint32_t nfl_util;
446 int i; 515 int i;
447 516
448 dprintk("%s: set_layout_map Begin\n", __func__); 517 dprintk("%s: set_layout_map Begin\n", __func__);
449 518
519 scratch = alloc_page(GFP_KERNEL);
520 if (!scratch)
521 return -ENOMEM;
522
523 xdr_init_decode(&stream, &buf, NULL);
524 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
525
526 /* 20 = nfl_util (4), first_stripe_index (4), pattern_offset (8),
527 * num_fh (4) */
528 p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20);
529 if (unlikely(!p))
530 goto out_err;
531
450 memcpy(id, p, sizeof(*id)); 532 memcpy(id, p, sizeof(*id));
451 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); 533 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
452 print_deviceid(id); 534 print_deviceid(id);
@@ -468,32 +550,57 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
468 __func__, nfl_util, fl->num_fh, fl->first_stripe_index, 550 __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
469 fl->pattern_offset); 551 fl->pattern_offset);
470 552
553 if (!fl->num_fh)
554 goto out_err;
555
471 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), 556 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
472 GFP_KERNEL); 557 GFP_KERNEL);
473 if (!fl->fh_array) 558 if (!fl->fh_array)
474 return -ENOMEM; 559 goto out_err;
475 560
476 for (i = 0; i < fl->num_fh; i++) { 561 for (i = 0; i < fl->num_fh; i++) {
477 /* Do we want to use a mempool here? */ 562 /* Do we want to use a mempool here? */
478 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); 563 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
479 if (!fl->fh_array[i]) { 564 if (!fl->fh_array[i])
480 filelayout_free_fh_array(fl); 565 goto out_err_free;
481 return -ENOMEM; 566
482 } 567 p = xdr_inline_decode(&stream, 4);
568 if (unlikely(!p))
569 goto out_err_free;
483 fl->fh_array[i]->size = be32_to_cpup(p++); 570 fl->fh_array[i]->size = be32_to_cpup(p++);
484 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { 571 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
485 printk(KERN_ERR "Too big fh %d received %d\n", 572 printk(KERN_ERR "Too big fh %d received %d\n",
486 i, fl->fh_array[i]->size); 573 i, fl->fh_array[i]->size);
487 filelayout_free_fh_array(fl); 574 goto out_err_free;
488 return -EIO;
489 } 575 }
576
577 p = xdr_inline_decode(&stream, fl->fh_array[i]->size);
578 if (unlikely(!p))
579 goto out_err_free;
490 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); 580 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
491 p += XDR_QUADLEN(fl->fh_array[i]->size);
492 dprintk("DEBUG: %s: fh len %d\n", __func__, 581 dprintk("DEBUG: %s: fh len %d\n", __func__,
493 fl->fh_array[i]->size); 582 fl->fh_array[i]->size);
494 } 583 }
495 584
585 __free_page(scratch);
496 return 0; 586 return 0;
587
588out_err_free:
589 filelayout_free_fh_array(fl);
590out_err:
591 __free_page(scratch);
592 return -EIO;
593}
594
595static void
596filelayout_free_lseg(struct pnfs_layout_segment *lseg)
597{
598 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
599
600 dprintk("--> %s\n", __func__);
601 nfs4_fl_put_deviceid(fl->dsaddr);
602 kfree(fl->commit_buckets);
603 _filelayout_free_lseg(fl);
497} 604}
498 605
499static struct pnfs_layout_segment * 606static struct pnfs_layout_segment *
@@ -514,17 +621,28 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
514 _filelayout_free_lseg(fl); 621 _filelayout_free_lseg(fl);
515 return NULL; 622 return NULL;
516 } 623 }
517 return &fl->generic_hdr;
518}
519 624
520static void 625 /* This assumes there is only one IOMODE_RW lseg. What
521filelayout_free_lseg(struct pnfs_layout_segment *lseg) 626 * we really want to do is have a layout_hdr level
522{ 627 * dictionary of <multipath_list4, fh> keys, each
523 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 628 * associated with a struct list_head, populated by calls
524 629 * to filelayout_write_pagelist().
525 dprintk("--> %s\n", __func__); 630 * */
526 nfs4_fl_put_deviceid(fl->dsaddr); 631 if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) {
527 _filelayout_free_lseg(fl); 632 int i;
633 int size = (fl->stripe_type == STRIPE_SPARSE) ?
634 fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
635
636 fl->commit_buckets = kcalloc(size, sizeof(struct list_head), GFP_KERNEL);
637 if (!fl->commit_buckets) {
638 filelayout_free_lseg(&fl->generic_hdr);
639 return NULL;
640 }
641 fl->number_of_buckets = size;
642 for (i = 0; i < size; i++)
643 INIT_LIST_HEAD(&fl->commit_buckets[i]);
644 }
645 return &fl->generic_hdr;
528} 646}
529 647
530/* 648/*
@@ -552,6 +670,191 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
552 return (p_stripe == r_stripe); 670 return (p_stripe == r_stripe);
553} 671}
554 672
673static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)
674{
675 return !FILELAYOUT_LSEG(lseg)->commit_through_mds;
676}
677
678static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
679{
680 if (fl->stripe_type == STRIPE_SPARSE)
681 return nfs4_fl_calc_ds_index(&fl->generic_hdr, j);
682 else
683 return j;
684}
685
686struct list_head *filelayout_choose_commit_list(struct nfs_page *req)
687{
688 struct pnfs_layout_segment *lseg = req->wb_commit_lseg;
689 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
690 u32 i, j;
691 struct list_head *list;
692
693 /* Note that we are calling nfs4_fl_calc_j_index on each page
694 * that ends up being committed to a data server. An attractive
695 * alternative is to add a field to nfs_write_data and nfs_page
696 * to store the value calculated in filelayout_write_pagelist
697 * and just use that here.
698 */
699 j = nfs4_fl_calc_j_index(lseg,
700 (loff_t)req->wb_index << PAGE_CACHE_SHIFT);
701 i = select_bucket_index(fl, j);
702 list = &fl->commit_buckets[i];
703 if (list_empty(list)) {
704 /* Non-empty buckets hold a reference on the lseg */
705 get_lseg(lseg);
706 }
707 return list;
708}
709
710static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
711{
712 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
713
714 if (flseg->stripe_type == STRIPE_SPARSE)
715 return i;
716 else
717 return nfs4_fl_calc_ds_index(lseg, i);
718}
719
720static struct nfs_fh *
721select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
722{
723 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
724
725 if (flseg->stripe_type == STRIPE_SPARSE) {
726 if (flseg->num_fh == 1)
727 i = 0;
728 else if (flseg->num_fh == 0)
729 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
730 return NULL;
731 }
732 return flseg->fh_array[i];
733}
734
735static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
736{
737 struct pnfs_layout_segment *lseg = data->lseg;
738 struct nfs4_pnfs_ds *ds;
739 u32 idx;
740 struct nfs_fh *fh;
741
742 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
743 ds = nfs4_fl_prepare_ds(lseg, idx);
744 if (!ds) {
745 printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
746 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
747 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
748 prepare_to_resend_writes(data);
749 data->mds_ops->rpc_release(data);
750 return -EAGAIN;
751 }
752 dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how);
753 data->write_done_cb = filelayout_commit_done_cb;
754 data->ds_clp = ds->ds_clp;
755 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
756 if (fh)
757 data->args.fh = fh;
758 return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient,
759 &filelayout_commit_call_ops, how);
760}
761
762/*
763 * This is only useful while we are using whole file layouts.
764 */
765static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
766{
767 struct pnfs_layout_segment *lseg, *rv = NULL;
768
769 spin_lock(&inode->i_lock);
770 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
771 if (lseg->pls_range.iomode == IOMODE_RW)
772 rv = get_lseg(lseg);
773 spin_unlock(&inode->i_lock);
774 return rv;
775}
776
777static int alloc_ds_commits(struct inode *inode, struct list_head *list)
778{
779 struct pnfs_layout_segment *lseg;
780 struct nfs4_filelayout_segment *fl;
781 struct nfs_write_data *data;
782 int i, j;
783
784 /* Won't need this when non-whole file layout segments are supported;
785 * instead we will use a pnfs_layout_hdr structure */
786 lseg = find_only_write_lseg(inode);
787 if (!lseg)
788 return 0;
789 fl = FILELAYOUT_LSEG(lseg);
790 for (i = 0; i < fl->number_of_buckets; i++) {
791 if (list_empty(&fl->commit_buckets[i]))
792 continue;
793 data = nfs_commitdata_alloc();
794 if (!data)
795 goto out_bad;
796 data->ds_commit_index = i;
797 data->lseg = lseg;
798 list_add(&data->pages, list);
799 }
800 put_lseg(lseg);
801 return 0;
802
803out_bad:
804 for (j = i; j < fl->number_of_buckets; j++) {
805 if (list_empty(&fl->commit_buckets[i]))
806 continue;
807 nfs_retry_commit(&fl->commit_buckets[i], lseg);
808 put_lseg(lseg); /* associated with emptying bucket */
809 }
810 put_lseg(lseg);
811 /* Caller will clean up entries put on list */
812 return -ENOMEM;
813}
814
815/* This follows nfs_commit_list pretty closely */
816static int
817filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
818 int how)
819{
820 struct nfs_write_data *data, *tmp;
821 LIST_HEAD(list);
822
823 if (!list_empty(mds_pages)) {
824 data = nfs_commitdata_alloc();
825 if (!data)
826 goto out_bad;
827 data->lseg = NULL;
828 list_add(&data->pages, &list);
829 }
830
831 if (alloc_ds_commits(inode, &list))
832 goto out_bad;
833
834 list_for_each_entry_safe(data, tmp, &list, pages) {
835 list_del_init(&data->pages);
836 atomic_inc(&NFS_I(inode)->commits_outstanding);
837 if (!data->lseg) {
838 nfs_init_commit(data, mds_pages, NULL);
839 nfs_initiate_commit(data, NFS_CLIENT(inode),
840 data->mds_ops, how);
841 } else {
842 nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg);
843 filelayout_initiate_commit(data, how);
844 }
845 }
846 return 0;
847 out_bad:
848 list_for_each_entry_safe(data, tmp, &list, pages) {
849 nfs_retry_commit(&data->pages, data->lseg);
850 list_del_init(&data->pages);
851 nfs_commit_free(data);
852 }
853 nfs_retry_commit(mds_pages, NULL);
854 nfs_commit_clear_lock(NFS_I(inode));
855 return -ENOMEM;
856}
857
555static struct pnfs_layoutdriver_type filelayout_type = { 858static struct pnfs_layoutdriver_type filelayout_type = {
556 .id = LAYOUT_NFSV4_1_FILES, 859 .id = LAYOUT_NFSV4_1_FILES,
557 .name = "LAYOUT_NFSV4_1_FILES", 860 .name = "LAYOUT_NFSV4_1_FILES",
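
Bucket selection above depends on the stripe type: a sparse layout gets one bucket per data server (ds_num) and a dense layout one per stripe column (stripe_count), with calc_ds_index_from_commit() undoing the mapping at commit time. A worked example with made-up geometry (pattern_offset 0):

	/* stripe_unit = 64k, stripe_count = 4, ds_num = 2 (STRIPE_SPARSE),
	 * stripe_indices = [0, 1, 0, 1]
	 * page at offset 192k: j = (192k / 64k) % 4 = 3
	 *   bucket:  i = nfs4_fl_calc_ds_index(lseg, 3) = stripe_indices[3] = 1
	 *   commit:  calc_ds_index_from_commit() returns i unchanged -> DS 1
	 * Dense layouts bucket on the column j itself and recompute the DS
	 * index only when the commit is sent. */
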
@@ -559,6 +862,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
559 .alloc_lseg = filelayout_alloc_lseg, 862 .alloc_lseg = filelayout_alloc_lseg,
560 .free_lseg = filelayout_free_lseg, 863 .free_lseg = filelayout_free_lseg,
561 .pg_test = filelayout_pg_test, 864 .pg_test = filelayout_pg_test,
865 .mark_pnfs_commit = filelayout_mark_pnfs_commit,
866 .choose_commit_list = filelayout_choose_commit_list,
867 .commit_pagelist = filelayout_commit_pagelist,
562 .read_pagelist = filelayout_read_pagelist, 868 .read_pagelist = filelayout_read_pagelist,
563 .write_pagelist = filelayout_write_pagelist, 869 .write_pagelist = filelayout_write_pagelist,
564}; 870};
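
With these three ops wired up, the generic pNFS layer can route COMMIT per page instead of forcing NFS_FILE_SYNC writes (note the removed workaround in filelayout_write_pagelist() above). A sketch of the generic-side dispatch (assumed shape of the pnfs wrapper added elsewhere in this series):

	/* Sketch: generic NFS asking the layout driver to take the commit. */
	static enum pnfs_try_status
	pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
	{
		struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;

		if (!ld || !ld->commit_pagelist)
			return PNFS_NOT_ATTEMPTED;	/* fall back to MDS commit */
		ld->commit_pagelist(inode, mds_pages, how);
		return PNFS_ATTEMPTED;
	}
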
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index ee0c907742b5..085a354e0f08 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -79,6 +79,8 @@ struct nfs4_filelayout_segment {
79 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ 79 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
80 unsigned int num_fh; 80 unsigned int num_fh;
81 struct nfs_fh **fh_array; 81 struct nfs_fh **fh_array;
82 struct list_head *commit_buckets; /* Sort commits to ds */
83 int number_of_buckets;
82}; 84};
83 85
84static inline struct nfs4_filelayout_segment * 86static inline struct nfs4_filelayout_segment *
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 68143c162e3b..de5350f2b249 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -261,7 +261,7 @@ out:
261 * Currently only support ipv4, and one multi-path address. 261 * Currently only support ipv4, and one multi-path address.
262 */ 262 */
263static struct nfs4_pnfs_ds * 263static struct nfs4_pnfs_ds *
264decode_and_add_ds(__be32 **pp, struct inode *inode) 264decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode)
265{ 265{
266 struct nfs4_pnfs_ds *ds = NULL; 266 struct nfs4_pnfs_ds *ds = NULL;
267 char *buf; 267 char *buf;
@@ -269,25 +269,34 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
269 u32 ip_addr, port; 269 u32 ip_addr, port;
270 int nlen, rlen, i; 270 int nlen, rlen, i;
271 int tmp[2]; 271 int tmp[2];
272 __be32 *r_netid, *r_addr, *p = *pp; 272 __be32 *p;
273 273
274 /* r_netid */ 274 /* r_netid */
275 p = xdr_inline_decode(streamp, 4);
276 if (unlikely(!p))
277 goto out_err;
275 nlen = be32_to_cpup(p++); 278 nlen = be32_to_cpup(p++);
276 r_netid = p;
277 p += XDR_QUADLEN(nlen);
278 279
279 /* r_addr */ 280 p = xdr_inline_decode(streamp, nlen);
280 rlen = be32_to_cpup(p++); 281 if (unlikely(!p))
281 r_addr = p; 282 goto out_err;
282 p += XDR_QUADLEN(rlen);
283 *pp = p;
284 283
285 /* Check that netid is "tcp" */ 284 /* Check that netid is "tcp" */
286 if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { 285 if (nlen != 3 || memcmp((char *)p, "tcp", 3)) {
287 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); 286 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
288 goto out_err; 287 goto out_err;
289 } 288 }
290 289
290 /* r_addr */
291 p = xdr_inline_decode(streamp, 4);
292 if (unlikely(!p))
293 goto out_err;
294 rlen = be32_to_cpup(p);
295
296 p = xdr_inline_decode(streamp, rlen);
297 if (unlikely(!p))
298 goto out_err;
299
291 /* ipv6 length plus port is legal */ 300 /* ipv6 length plus port is legal */
292 if (rlen > INET6_ADDRSTRLEN + 8) { 301 if (rlen > INET6_ADDRSTRLEN + 8) {
293 dprintk("%s: Invalid address, length %d\n", __func__, 302 dprintk("%s: Invalid address, length %d\n", __func__,
@@ -300,7 +309,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
300 goto out_err; 309 goto out_err;
301 } 310 }
302 buf[rlen] = '\0'; 311 buf[rlen] = '\0';
303 memcpy(buf, r_addr, rlen); 312 memcpy(buf, p, rlen);
304 313
305 /* replace the port dots with dashes for the in4_pton() delimiter*/ 314 /* replace the port dots with dashes for the in4_pton() delimiter*/
306 for (i = 0; i < 2; i++) { 315 for (i = 0; i < 2; i++) {
@@ -336,90 +345,154 @@ out_err:
336static struct nfs4_file_layout_dsaddr* 345static struct nfs4_file_layout_dsaddr*
337decode_device(struct inode *ino, struct pnfs_device *pdev) 346decode_device(struct inode *ino, struct pnfs_device *pdev)
338{ 347{
339 int i, dummy; 348 int i;
340 u32 cnt, num; 349 u32 cnt, num;
341 u8 *indexp; 350 u8 *indexp;
342 __be32 *p = (__be32 *)pdev->area, *indicesp; 351 __be32 *p;
343 struct nfs4_file_layout_dsaddr *dsaddr; 352 u8 *stripe_indices;
353 u8 max_stripe_index;
354 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
355 struct xdr_stream stream;
356 struct xdr_buf buf = {
357 .pages = pdev->pages,
358 .page_len = pdev->pglen,
359 .buflen = pdev->pglen,
360 .len = pdev->pglen,
361 };
362 struct page *scratch;
363
364 /* set up xdr stream */
365 scratch = alloc_page(GFP_KERNEL);
366 if (!scratch)
367 goto out_err;
368
369 xdr_init_decode(&stream, &buf, NULL);
370 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
344 371
345 /* Get the stripe count (number of stripe index) */ 372 /* Get the stripe count (number of stripe index) */
346 cnt = be32_to_cpup(p++); 373 p = xdr_inline_decode(&stream, 4);
374 if (unlikely(!p))
375 goto out_err_free_scratch;
376
377 cnt = be32_to_cpup(p);
347 dprintk("%s stripe count %d\n", __func__, cnt); 378 dprintk("%s stripe count %d\n", __func__, cnt);
348 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { 379 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
349 printk(KERN_WARNING "%s: stripe count %d greater than " 380 printk(KERN_WARNING "%s: stripe count %d greater than "
350 "supported maximum %d\n", __func__, 381 "supported maximum %d\n", __func__,
351 cnt, NFS4_PNFS_MAX_STRIPE_CNT); 382 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
352 goto out_err; 383 goto out_err_free_scratch;
384 }
385
386 /* read stripe indices */
387 stripe_indices = kcalloc(cnt, sizeof(u8), GFP_KERNEL);
388 if (!stripe_indices)
389 goto out_err_free_scratch;
390
391 p = xdr_inline_decode(&stream, cnt << 2);
392 if (unlikely(!p))
393 goto out_err_free_stripe_indices;
394
395 indexp = &stripe_indices[0];
396 max_stripe_index = 0;
397 for (i = 0; i < cnt; i++) {
398 *indexp = be32_to_cpup(p++);
399 max_stripe_index = max(max_stripe_index, *indexp);
400 indexp++;
353 } 401 }
354 402
355 /* Check the multipath list count */ 403 /* Check the multipath list count */
356 indicesp = p; 404 p = xdr_inline_decode(&stream, 4);
357 p += XDR_QUADLEN(cnt << 2); 405 if (unlikely(!p))
358 num = be32_to_cpup(p++); 406 goto out_err_free_stripe_indices;
407
408 num = be32_to_cpup(p);
359 dprintk("%s ds_num %u\n", __func__, num); 409 dprintk("%s ds_num %u\n", __func__, num);
360 if (num > NFS4_PNFS_MAX_MULTI_CNT) { 410 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
361 printk(KERN_WARNING "%s: multipath count %d greater than " 411 printk(KERN_WARNING "%s: multipath count %d greater than "
362 "supported maximum %d\n", __func__, 412 "supported maximum %d\n", __func__,
363 num, NFS4_PNFS_MAX_MULTI_CNT); 413 num, NFS4_PNFS_MAX_MULTI_CNT);
364 goto out_err; 414 goto out_err_free_stripe_indices;
365 } 415 }
416
417 /* validate stripe indices are all < num */
418 if (max_stripe_index >= num) {
419 printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n",
420 __func__, max_stripe_index, num);
421 goto out_err_free_stripe_indices;
422 }
423
366 dsaddr = kzalloc(sizeof(*dsaddr) + 424 dsaddr = kzalloc(sizeof(*dsaddr) +
367 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), 425 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
368 GFP_KERNEL); 426 GFP_KERNEL);
369 if (!dsaddr) 427 if (!dsaddr)
370 goto out_err; 428 goto out_err_free_stripe_indices;
371
372 dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
373 if (!dsaddr->stripe_indices)
374 goto out_err_free;
375 429
376 dsaddr->stripe_count = cnt; 430 dsaddr->stripe_count = cnt;
431 dsaddr->stripe_indices = stripe_indices;
432 stripe_indices = NULL;
377 dsaddr->ds_num = num; 433 dsaddr->ds_num = num;
378 434
379 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); 435 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
380 436
381 /* Go back and read stripe indices */
382 p = indicesp;
383 indexp = &dsaddr->stripe_indices[0];
384 for (i = 0; i < dsaddr->stripe_count; i++) {
385 *indexp = be32_to_cpup(p++);
386 if (*indexp >= num)
387 goto out_err_free;
388 indexp++;
389 }
390 /* Skip already read multipath list count */
391 p++;
392
393 for (i = 0; i < dsaddr->ds_num; i++) { 437 for (i = 0; i < dsaddr->ds_num; i++) {
394 int j; 438 int j;
439 u32 mp_count;
440
441 p = xdr_inline_decode(&stream, 4);
442 if (unlikely(!p))
443 goto out_err_free_deviceid;
395 444
396 dummy = be32_to_cpup(p++); /* multipath count */ 445 mp_count = be32_to_cpup(p); /* multipath count */
397 if (dummy > 1) { 446 if (mp_count > 1) {
398 printk(KERN_WARNING 447 printk(KERN_WARNING
399 "%s: Multipath count %d not supported, " 448 "%s: Multipath count %d not supported, "
400 "skipping all greater than 1\n", __func__, 449 "skipping all greater than 1\n", __func__,
401 dummy); 450 mp_count);
402 } 451 }
403 for (j = 0; j < dummy; j++) { 452 for (j = 0; j < mp_count; j++) {
404 if (j == 0) { 453 if (j == 0) {
405 dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); 454 dsaddr->ds_list[i] = decode_and_add_ds(&stream,
455 ino);
406 if (dsaddr->ds_list[i] == NULL) 456 if (dsaddr->ds_list[i] == NULL)
407 goto out_err_free; 457 goto out_err_free_deviceid;
408 } else { 458 } else {
409 u32 len; 459 u32 len;
410 /* skip extra multipath */ 460 /* skip extra multipath */
411 len = be32_to_cpup(p++); 461
412 p += XDR_QUADLEN(len); 462 /* read len, skip */
413 len = be32_to_cpup(p++); 463 p = xdr_inline_decode(&stream, 4);
414 p += XDR_QUADLEN(len); 464 if (unlikely(!p))
415 continue; 465 goto out_err_free_deviceid;
466 len = be32_to_cpup(p);
467
468 p = xdr_inline_decode(&stream, len);
469 if (unlikely(!p))
470 goto out_err_free_deviceid;
471
472 /* read len, skip */
473 p = xdr_inline_decode(&stream, 4);
474 if (unlikely(!p))
475 goto out_err_free_deviceid;
476 len = be32_to_cpup(p);
477
478 p = xdr_inline_decode(&stream, len);
479 if (unlikely(!p))
480 goto out_err_free_deviceid;
416 } 481 }
417 } 482 }
418 } 483 }
484
485 __free_page(scratch);
419 return dsaddr; 486 return dsaddr;
420 487
421out_err_free: 488out_err_free_deviceid:
422 nfs4_fl_free_deviceid(dsaddr); 489 nfs4_fl_free_deviceid(dsaddr);
490	/* stripe_indices was part of dsaddr */
491 goto out_err_free_scratch;
492out_err_free_stripe_indices:
493 kfree(stripe_indices);
494out_err_free_scratch:
495 __free_page(scratch);
423out_err: 496out_err:
424 dprintk("%s ERROR: returning NULL\n", __func__); 497 dprintk("%s ERROR: returning NULL\n", __func__);
425 return NULL; 498 return NULL;
@@ -498,11 +571,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
498 goto out_free; 571 goto out_free;
499 } 572 }
500 573
501 /* set pdev->area */
502 pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
503 if (!pdev->area)
504 goto out_free;
505
506 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); 574 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
507 pdev->layout_type = LAYOUT_NFSV4_1_FILES; 575 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
508 pdev->pages = pages; 576 pdev->pages = pages;
@@ -521,8 +589,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
521 */ 589 */
522 dsaddr = decode_and_add_device(inode, pdev); 590 dsaddr = decode_and_add_device(inode, pdev);
523out_free: 591out_free:
524 if (pdev->area != NULL)
525 vunmap(pdev->area);
526 for (i = 0; i < max_pages; i++) 592 for (i = 0; i < max_pages; i++)
527 __free_page(pages[i]); 593 __free_page(pages[i]);
528 kfree(pages); 594 kfree(pages);
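
The hunk above rewrites the device decoding in nfs4filelayoutdev.c to pull every field through xdr_inline_decode(), which fails cleanly on a short reply instead of letting raw pointer arithmetic walk past the buffer, and it unwinds through labeled error exits so each allocation is freed exactly once. A minimal user-space sketch of that bounds-checked contract follows; the names (xdr_stream_demo, inline_decode, be32_get) are invented stand-ins for illustration, not the kernel API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>	/* ntohl() */

/* Toy stand-ins for the kernel's xdr_stream and xdr_inline_decode(). */
struct xdr_stream_demo {
	const uint8_t *p;
	const uint8_t *end;
};

/* Return a pointer to nbytes of data, or NULL if the reply is short,
 * the same contract the patch relies on before every be32_to_cpup(). */
static const uint8_t *inline_decode(struct xdr_stream_demo *xdr, size_t nbytes)
{
	const uint8_t *p = xdr->p;

	if ((size_t)(xdr->end - xdr->p) < nbytes)
		return NULL;
	xdr->p += nbytes;
	return p;
}

static uint32_t be32_get(const uint8_t *p)
{
	uint32_t v;

	memcpy(&v, p, 4);
	return ntohl(v);
}

int main(void)
{
	/* a counted array: cnt = 3, then three 32-bit stripe indices */
	uint8_t buf[] = { 0,0,0,3,  0,0,0,0,  0,0,0,1,  0,0,0,2 };
	struct xdr_stream_demo xdr = { buf, buf + sizeof(buf) };
	const uint8_t *p;
	uint32_t cnt, i;

	p = inline_decode(&xdr, 4);
	if (!p)
		return 1;
	cnt = be32_get(p);

	/* one check covers the whole array, like inline_decode(cnt << 2) */
	p = inline_decode(&xdr, (size_t)cnt * 4);
	if (!p)
		return 1;
	for (i = 0; i < cnt; i++)
		printf("index[%u] = %u\n", i, be32_get(p + i * 4));
	return 0;
}

The payoff shows in the patch itself: a single bounds check covers the whole stripe-index array before any element is read, and a lying server can no longer push the parser past the received data.
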
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1d84e7088af9..dfd1e6d7e6c3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -41,6 +41,7 @@
41#include <linux/string.h> 41#include <linux/string.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
44#include <linux/sunrpc/gss_api.h>
44#include <linux/nfs.h> 45#include <linux/nfs.h>
45#include <linux/nfs4.h> 46#include <linux/nfs4.h>
46#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
@@ -71,7 +72,9 @@ static int _nfs4_proc_open(struct nfs4_opendata *data);
71static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 72static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
72static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 73static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
73static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 74static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
74static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 75static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir,
76 const struct qstr *name, struct nfs_fh *fhandle,
77 struct nfs_fattr *fattr);
75static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 78static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
76static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 79static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
77 struct nfs_fattr *fattr, struct iattr *sattr, 80 struct nfs_fattr *fattr, struct iattr *sattr,
@@ -85,6 +88,8 @@ static int nfs4_map_errors(int err)
85 switch (err) { 88 switch (err) {
86 case -NFS4ERR_RESOURCE: 89 case -NFS4ERR_RESOURCE:
87 return -EREMOTEIO; 90 return -EREMOTEIO;
91 case -NFS4ERR_WRONGSEC:
92 return -EPERM;
88 case -NFS4ERR_BADOWNER: 93 case -NFS4ERR_BADOWNER:
89 case -NFS4ERR_BADNAME: 94 case -NFS4ERR_BADNAME:
90 return -EINVAL; 95 return -EINVAL;
@@ -657,7 +662,8 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = {
657 .rpc_call_done = nfs41_call_sync_done, 662 .rpc_call_done = nfs41_call_sync_done,
658}; 663};
659 664
660static int nfs4_call_sync_sequence(struct nfs_server *server, 665static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
666 struct nfs_server *server,
661 struct rpc_message *msg, 667 struct rpc_message *msg,
662 struct nfs4_sequence_args *args, 668 struct nfs4_sequence_args *args,
663 struct nfs4_sequence_res *res, 669 struct nfs4_sequence_res *res,
@@ -673,7 +679,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
673 .cache_reply = cache_reply, 679 .cache_reply = cache_reply,
674 }; 680 };
675 struct rpc_task_setup task_setup = { 681 struct rpc_task_setup task_setup = {
676 .rpc_client = server->client, 682 .rpc_client = clnt,
677 .rpc_message = msg, 683 .rpc_message = msg,
678 .callback_ops = &nfs41_call_sync_ops, 684 .callback_ops = &nfs41_call_sync_ops,
679 .callback_data = &data 685 .callback_data = &data
@@ -692,13 +698,14 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
692 return ret; 698 return ret;
693} 699}
694 700
695int _nfs4_call_sync_session(struct nfs_server *server, 701int _nfs4_call_sync_session(struct rpc_clnt *clnt,
702 struct nfs_server *server,
696 struct rpc_message *msg, 703 struct rpc_message *msg,
697 struct nfs4_sequence_args *args, 704 struct nfs4_sequence_args *args,
698 struct nfs4_sequence_res *res, 705 struct nfs4_sequence_res *res,
699 int cache_reply) 706 int cache_reply)
700{ 707{
701 return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); 708 return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0);
702} 709}
703 710
704#else 711#else
@@ -709,19 +716,28 @@ static int nfs4_sequence_done(struct rpc_task *task,
709} 716}
710#endif /* CONFIG_NFS_V4_1 */ 717#endif /* CONFIG_NFS_V4_1 */
711 718
712int _nfs4_call_sync(struct nfs_server *server, 719int _nfs4_call_sync(struct rpc_clnt *clnt,
720 struct nfs_server *server,
713 struct rpc_message *msg, 721 struct rpc_message *msg,
714 struct nfs4_sequence_args *args, 722 struct nfs4_sequence_args *args,
715 struct nfs4_sequence_res *res, 723 struct nfs4_sequence_res *res,
716 int cache_reply) 724 int cache_reply)
717{ 725{
718 args->sa_session = res->sr_session = NULL; 726 args->sa_session = res->sr_session = NULL;
719 return rpc_call_sync(server->client, msg, 0); 727 return rpc_call_sync(clnt, msg, 0);
720} 728}
721 729
722#define nfs4_call_sync(server, msg, args, res, cache_reply) \ 730static inline
723 (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ 731int nfs4_call_sync(struct rpc_clnt *clnt,
724 &(res)->seq_res, (cache_reply)) 732 struct nfs_server *server,
733 struct rpc_message *msg,
734 struct nfs4_sequence_args *args,
735 struct nfs4_sequence_res *res,
736 int cache_reply)
737{
738 return server->nfs_client->cl_mvops->call_sync(clnt, server, msg,
739 args, res, cache_reply);
740}
725 741
726static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 742static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
727{ 743{
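
With the rpc_clnt threaded through explicitly, nfs4_call_sync() also changes from a macro into a static inline, which gains type checking and keeps the per-minor-version indirection in one place. A toy sketch of that shape follows; the demo_* names stand in for rpc_clnt and the cl_mvops table and are invented for this example:

#include <stdio.h>

struct demo_clnt { const char *name; };

struct demo_ops {
	int (*call_sync)(struct demo_clnt *clnt, const char *msg);
};

static int demo_v40_call_sync(struct demo_clnt *clnt, const char *msg)
{
	printf("v4.0 %s over client '%s'\n", msg, clnt->name);
	return 0;
}

/* one table per minor version in the kernel; a single one here */
static const struct demo_ops demo_mvops = { .call_sync = demo_v40_call_sync };

/* the inline wrapper: callers name the client, the table picks the body */
static inline int demo_call_sync(struct demo_clnt *clnt, const char *msg)
{
	return demo_mvops.call_sync(clnt, msg);
}

int main(void)
{
	struct demo_clnt def = { "default" }, krb5 = { "krb5i" };

	demo_call_sync(&def, "GETATTR");	/* the usual server->client */
	demo_call_sync(&krb5, "SECINFO");	/* a caller-chosen client */
	return 0;
}

Passing the client as a parameter is what later lets callers issue a request over a differently-authenticated client without touching the v4.0 and v4.1 backends.
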
@@ -1831,7 +1847,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1831 } else 1847 } else
1832 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1848 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
1833 1849
1834 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 1850 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
1835 if (status == 0 && state != NULL) 1851 if (status == 0 && state != NULL)
1836 renew_lease(server, timestamp); 1852 renew_lease(server, timestamp);
1837 return status; 1853 return status;
@@ -2090,7 +2106,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
2090 }; 2106 };
2091 int status; 2107 int status;
2092 2108
2093 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2109 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2094 if (status == 0) { 2110 if (status == 0) {
2095 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); 2111 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
2096 server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| 2112 server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
@@ -2160,7 +2176,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2160 }; 2176 };
2161 2177
2162 nfs_fattr_init(info->fattr); 2178 nfs_fattr_init(info->fattr);
2163 return nfs4_call_sync(server, &msg, &args, &res, 0); 2179 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2164} 2180}
2165 2181
2166static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 2182static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -2176,15 +2192,43 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2176 return err; 2192 return err;
2177} 2193}
2178 2194
2195static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
2196 struct nfs_fsinfo *info, rpc_authflavor_t flavor)
2197{
2198 struct rpc_auth *auth;
2199 int ret;
2200
2201 auth = rpcauth_create(flavor, server->client);
2202 if (!auth) {
2203 ret = -EIO;
2204 goto out;
2205 }
2206 ret = nfs4_lookup_root(server, fhandle, info);
2207 if (ret < 0)
2208 ret = -EAGAIN;
2209out:
2210 return ret;
2211}
2212
2179/* 2213/*
2180 * get the file handle for the "/" directory on the server 2214 * get the file handle for the "/" directory on the server
2181 */ 2215 */
2182static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, 2216static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
2183 struct nfs_fsinfo *info) 2217 struct nfs_fsinfo *info)
2184{ 2218{
2185 int status; 2219 int i, len, status = 0;
2220 rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS + 2];
2221
2222 flav_array[0] = RPC_AUTH_UNIX;
2223 len = gss_mech_list_pseudoflavors(&flav_array[1]);
2224 flav_array[1+len] = RPC_AUTH_NULL;
2225 len += 2;
2186 2226
2187 status = nfs4_lookup_root(server, fhandle, info); 2227 for (i = 0; i < len; i++) {
2228 status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
2229 if (status == 0)
2230 break;
2231 }
2188 if (status == 0) 2232 if (status == 0)
2189 status = nfs4_server_capabilities(server, fhandle); 2233 status = nfs4_server_capabilities(server, fhandle);
2190 if (status == 0) 2234 if (status == 0)
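
nfs4_proc_get_root() now probes security flavors in a fixed order: AUTH_UNIX first, then whatever GSS pseudoflavors are registered, then AUTH_NULL, stopping at the first one the server accepts. A self-contained sketch of that negotiation loop; the flavor constants and demo_lookup_root() are fabricated for illustration:

#include <stdio.h>

enum {
	DEMO_AUTH_NULL = 0,
	DEMO_AUTH_UNIX = 1,
	DEMO_AUTH_GSS_KRB5 = 390003,
};

/* pretend the export only allows Kerberos */
static int demo_lookup_root(int flavor)
{
	return flavor == DEMO_AUTH_GSS_KRB5 ? 0 : -13;	/* -EACCES */
}

int main(void)
{
	int flavors[] = { DEMO_AUTH_UNIX, DEMO_AUTH_GSS_KRB5, DEMO_AUTH_NULL };
	int i, status = -1;

	for (i = 0; i < 3; i++) {
		status = demo_lookup_root(flavors[i]);
		if (status == 0) {
			printf("root lookup ok, flavor %d\n", flavors[i]);
			break;
		}
	}
	return status < 0 ? 1 : 0;
}
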
@@ -2249,7 +2293,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2249 }; 2293 };
2250 2294
2251 nfs_fattr_init(fattr); 2295 nfs_fattr_init(fattr);
2252 return nfs4_call_sync(server, &msg, &args, &res, 0); 2296 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2253} 2297}
2254 2298
2255static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2299static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
@@ -2309,9 +2353,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2309 return status; 2353 return status;
2310} 2354}
2311 2355
2312static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *dirfh, 2356static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,
2313 const struct qstr *name, struct nfs_fh *fhandle, 2357 const struct nfs_fh *dirfh, const struct qstr *name,
2314 struct nfs_fattr *fattr) 2358 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
2315{ 2359{
2316 int status; 2360 int status;
2317 struct nfs4_lookup_arg args = { 2361 struct nfs4_lookup_arg args = {
@@ -2333,7 +2377,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d
2333 nfs_fattr_init(fattr); 2377 nfs_fattr_init(fattr);
2334 2378
2335 dprintk("NFS call lookupfh %s\n", name->name); 2379 dprintk("NFS call lookupfh %s\n", name->name);
2336 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2380 status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
2337 dprintk("NFS reply lookupfh: %d\n", status); 2381 dprintk("NFS reply lookupfh: %d\n", status);
2338 return status; 2382 return status;
2339} 2383}
@@ -2345,7 +2389,7 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
2345 struct nfs4_exception exception = { }; 2389 struct nfs4_exception exception = { };
2346 int err; 2390 int err;
2347 do { 2391 do {
2348 err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); 2392 err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr);
2349 /* FIXME: !!!! */ 2393 /* FIXME: !!!! */
2350 if (err == -NFS4ERR_MOVED) { 2394 if (err == -NFS4ERR_MOVED) {
2351 err = -EREMOTE; 2395 err = -EREMOTE;
@@ -2356,27 +2400,41 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
2356 return err; 2400 return err;
2357} 2401}
2358 2402
2359static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, 2403static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2360 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2404 const struct qstr *name, struct nfs_fh *fhandle,
2405 struct nfs_fattr *fattr)
2361{ 2406{
2362 int status; 2407 int status;
2363 2408
2364 dprintk("NFS call lookup %s\n", name->name); 2409 dprintk("NFS call lookup %s\n", name->name);
2365 status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); 2410 status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr);
2366 if (status == -NFS4ERR_MOVED) 2411 if (status == -NFS4ERR_MOVED)
2367 status = nfs4_get_referral(dir, name, fattr, fhandle); 2412 status = nfs4_get_referral(dir, name, fattr, fhandle);
2368 dprintk("NFS reply lookup: %d\n", status); 2413 dprintk("NFS reply lookup: %d\n", status);
2369 return status; 2414 return status;
2370} 2415}
2371 2416
2372static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2417void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr, struct nfs_fh *fh)
2418{
2419 memset(fh, 0, sizeof(struct nfs_fh));
2420 fattr->fsid.major = 1;
2421 fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
2422 NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_FSID | NFS_ATTR_FATTR_MOUNTPOINT;
2423 fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
2424 fattr->nlink = 2;
2425}
2426
2427static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
2428 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
2373{ 2429{
2374 struct nfs4_exception exception = { }; 2430 struct nfs4_exception exception = { };
2375 int err; 2431 int err;
2376 do { 2432 do {
2377 err = nfs4_handle_exception(NFS_SERVER(dir), 2433 err = nfs4_handle_exception(NFS_SERVER(dir),
2378 _nfs4_proc_lookup(dir, name, fhandle, fattr), 2434 _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr),
2379 &exception); 2435 &exception);
2436 if (err == -EPERM)
2437 nfs_fixup_secinfo_attributes(fattr, fhandle);
2380 } while (exception.retry); 2438 } while (exception.retry);
2381 return err; 2439 return err;
2382} 2440}
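
nfs4_proc_lookup(), like the other wrappers in this file, drives the underlying call through an exception/retry loop: nfs4_handle_exception() inspects the error, recovers or backs off as needed, and sets exception.retry when the call should be reissued. A compact user-space model of that loop, with invented demo_* helpers standing in for the real recovery logic:

#include <stdio.h>

struct demo_exception {
	int retry;
	int attempts;
};

/* fail transiently twice, then succeed */
static int demo_call(const struct demo_exception *exc)
{
	return exc->attempts < 2 ? -11 /* -EAGAIN */ : 0;
}

/* decide whether the error is worth another pass, the way
 * nfs4_handle_exception() does for NFS4ERR_DELAY and friends */
static int demo_handle_exception(int err, struct demo_exception *exc)
{
	exc->retry = 0;
	if (err == -11 && exc->attempts < 5) {
		exc->attempts++;
		exc->retry = 1;
	}
	return err;
}

int main(void)
{
	struct demo_exception exception = { 0, 0 };
	int err;

	do {
		err = demo_handle_exception(demo_call(&exception), &exception);
	} while (exception.retry);

	printf("final err %d after %d retries\n", err, exception.attempts);
	return 0;
}
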
@@ -2421,7 +2479,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2421 if (res.fattr == NULL) 2479 if (res.fattr == NULL)
2422 return -ENOMEM; 2480 return -ENOMEM;
2423 2481
2424 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2482 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2425 if (!status) { 2483 if (!status) {
2426 entry->mask = 0; 2484 entry->mask = 0;
2427 if (res.access & NFS4_ACCESS_READ) 2485 if (res.access & NFS4_ACCESS_READ)
@@ -2488,7 +2546,7 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
2488 .rpc_resp = &res, 2546 .rpc_resp = &res,
2489 }; 2547 };
2490 2548
2491 return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); 2549 return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
2492} 2550}
2493 2551
2494static int nfs4_proc_readlink(struct inode *inode, struct page *page, 2552static int nfs4_proc_readlink(struct inode *inode, struct page *page,
@@ -2577,7 +2635,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
2577 if (res.dir_attr == NULL) 2635 if (res.dir_attr == NULL)
2578 goto out; 2636 goto out;
2579 2637
2580 status = nfs4_call_sync(server, &msg, &args, &res, 1); 2638 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
2581 if (status == 0) { 2639 if (status == 0) {
2582 update_changeattr(dir, &res.cinfo); 2640 update_changeattr(dir, &res.cinfo);
2583 nfs_post_op_update_inode(dir, res.dir_attr); 2641 nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2678,7 +2736,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2678 if (res.old_fattr == NULL || res.new_fattr == NULL) 2736 if (res.old_fattr == NULL || res.new_fattr == NULL)
2679 goto out; 2737 goto out;
2680 2738
2681 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2739 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
2682 if (!status) { 2740 if (!status) {
2683 update_changeattr(old_dir, &res.old_cinfo); 2741 update_changeattr(old_dir, &res.old_cinfo);
2684 nfs_post_op_update_inode(old_dir, res.old_fattr); 2742 nfs_post_op_update_inode(old_dir, res.old_fattr);
@@ -2729,7 +2787,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2729 if (res.fattr == NULL || res.dir_attr == NULL) 2787 if (res.fattr == NULL || res.dir_attr == NULL)
2730 goto out; 2788 goto out;
2731 2789
2732 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2790 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
2733 if (!status) { 2791 if (!status) {
2734 update_changeattr(dir, &res.cinfo); 2792 update_changeattr(dir, &res.cinfo);
2735 nfs_post_op_update_inode(dir, res.dir_attr); 2793 nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2792,8 +2850,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
2792 2850
2793static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) 2851static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
2794{ 2852{
2795 int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg, 2853 int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
2796 &data->arg, &data->res, 1); 2854 &data->arg.seq_args, &data->res.seq_res, 1);
2797 if (status == 0) { 2855 if (status == 0) {
2798 update_changeattr(dir, &data->res.dir_cinfo); 2856 update_changeattr(dir, &data->res.dir_cinfo);
2799 nfs_post_op_update_inode(dir, data->res.dir_fattr); 2857 nfs_post_op_update_inode(dir, data->res.dir_fattr);
@@ -2905,7 +2963,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2905 (unsigned long long)cookie); 2963 (unsigned long long)cookie);
2906 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2964 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2907 res.pgbase = args.pgbase; 2965 res.pgbase = args.pgbase;
2908 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); 2966 status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
2909 if (status >= 0) { 2967 if (status >= 0) {
2910 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2968 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2911 status += args.pgbase; 2969 status += args.pgbase;
@@ -2997,7 +3055,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
2997 }; 3055 };
2998 3056
2999 nfs_fattr_init(fsstat->fattr); 3057 nfs_fattr_init(fsstat->fattr);
3000 return nfs4_call_sync(server, &msg, &args, &res, 0); 3058 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
3001} 3059}
3002 3060
3003static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) 3061static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
@@ -3028,7 +3086,7 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
3028 .rpc_resp = &res, 3086 .rpc_resp = &res,
3029 }; 3087 };
3030 3088
3031 return nfs4_call_sync(server, &msg, &args, &res, 0); 3089 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
3032} 3090}
3033 3091
3034static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) 3092static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
@@ -3073,7 +3131,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
3073 } 3131 }
3074 3132
3075 nfs_fattr_init(pathconf->fattr); 3133 nfs_fattr_init(pathconf->fattr);
3076 return nfs4_call_sync(server, &msg, &args, &res, 0); 3134 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
3077} 3135}
3078 3136
3079static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, 3137static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -3195,12 +3253,9 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
3195 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; 3253 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
3196} 3254}
3197 3255
3198static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) 3256static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3199{ 3257{
3200 struct inode *inode = data->inode; 3258 struct inode *inode = data->inode;
3201
3202 if (!nfs4_sequence_done(task, &data->res.seq_res))
3203 return -EAGAIN;
3204 3259
3205 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 3260 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
3206 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3261 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
@@ -3210,11 +3265,24 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
3210 return 0; 3265 return 0;
3211} 3266}
3212 3267
3268static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
3269{
3270 if (!nfs4_sequence_done(task, &data->res.seq_res))
3271 return -EAGAIN;
3272 return data->write_done_cb(task, data);
3273}
3274
3213static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) 3275static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
3214{ 3276{
3215 struct nfs_server *server = NFS_SERVER(data->inode); 3277 struct nfs_server *server = NFS_SERVER(data->inode);
3216 3278
3217 data->args.bitmask = server->cache_consistency_bitmask; 3279 if (data->lseg) {
3280 data->args.bitmask = NULL;
3281 data->res.fattr = NULL;
3282 } else
3283 data->args.bitmask = server->cache_consistency_bitmask;
3284 if (!data->write_done_cb)
3285 data->write_done_cb = nfs4_commit_done_cb;
3218 data->res.server = server; 3286 data->res.server = server;
3219 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 3287 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
3220} 3288}
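
The commit-done path is split so that session sequence accounting runs exactly once while the completion logic becomes a replaceable write_done_cb, which a pNFS layout driver can point at a data-server variant. A small sketch of that callback split, using made-up demo_* types:

#include <stdio.h>
#include <stddef.h>

struct demo_write_data {
	int (*write_done_cb)(struct demo_write_data *data);
	int through_mds;
};

static int demo_mds_commit_done_cb(struct demo_write_data *data)
{
	printf("MDS completion path (through_mds=%d)\n", data->through_mds);
	return 0;
}

/* setup keeps any callback a layout driver already installed */
static void demo_commit_setup(struct demo_write_data *data)
{
	if (!data->write_done_cb)
		data->write_done_cb = demo_mds_commit_done_cb;
}

/* done runs the version-independent bookkeeping once, then defers */
static int demo_commit_done(struct demo_write_data *data)
{
	/* nfs4_sequence_done() would run here for both paths */
	return data->write_done_cb(data);
}

int main(void)
{
	struct demo_write_data data = { NULL, 1 };

	demo_commit_setup(&data);
	return demo_commit_done(&data);
}
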
@@ -3452,7 +3520,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3452 resp_buf = buf; 3520 resp_buf = buf;
3453 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); 3521 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
3454 } 3522 }
3455 ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); 3523 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
3456 if (ret) 3524 if (ret)
3457 goto out_free; 3525 goto out_free;
3458 if (res.acl_len > args.acl_len) 3526 if (res.acl_len > args.acl_len)
@@ -3527,7 +3595,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3527 if (i < 0) 3595 if (i < 0)
3528 return i; 3596 return i;
3529 nfs_inode_return_delegation(inode); 3597 nfs_inode_return_delegation(inode);
3530 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3598 ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3531 3599
3532 /* 3600 /*
3533 * Free each page after tx, so the only ref left is 3601 * Free each page after tx, so the only ref left is
@@ -3890,7 +3958,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3890 lsp = request->fl_u.nfs4_fl.owner; 3958 lsp = request->fl_u.nfs4_fl.owner;
3891 arg.lock_owner.id = lsp->ls_id.id; 3959 arg.lock_owner.id = lsp->ls_id.id;
3892 arg.lock_owner.s_dev = server->s_dev; 3960 arg.lock_owner.s_dev = server->s_dev;
3893 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 3961 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3894 switch (status) { 3962 switch (status) {
3895 case 0: 3963 case 0:
3896 request->fl_type = F_UNLCK; 3964 request->fl_type = F_UNLCK;
@@ -4618,12 +4686,46 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4618 nfs_fattr_init(&fs_locations->fattr); 4686 nfs_fattr_init(&fs_locations->fattr);
4619 fs_locations->server = server; 4687 fs_locations->server = server;
4620 fs_locations->nlocations = 0; 4688 fs_locations->nlocations = 0;
4621 status = nfs4_call_sync(server, &msg, &args, &res, 0); 4689 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
4622 nfs_fixup_referral_attributes(&fs_locations->fattr); 4690 nfs_fixup_referral_attributes(&fs_locations->fattr);
4623 dprintk("%s: returned status = %d\n", __func__, status); 4691 dprintk("%s: returned status = %d\n", __func__, status);
4624 return status; 4692 return status;
4625} 4693}
4626 4694
4695static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
4696{
4697 int status;
4698 struct nfs4_secinfo_arg args = {
4699 .dir_fh = NFS_FH(dir),
4700 .name = name,
4701 };
4702 struct nfs4_secinfo_res res = {
4703 .flavors = flavors,
4704 };
4705 struct rpc_message msg = {
4706 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO],
4707 .rpc_argp = &args,
4708 .rpc_resp = &res,
4709 };
4710
4711 dprintk("NFS call secinfo %s\n", name->name);
4712 status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
4713 dprintk("NFS reply secinfo: %d\n", status);
4714 return status;
4715}
4716
4717int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
4718{
4719 struct nfs4_exception exception = { };
4720 int err;
4721 do {
4722 err = nfs4_handle_exception(NFS_SERVER(dir),
4723 _nfs4_proc_secinfo(dir, name, flavors),
4724 &exception);
4725 } while (exception.retry);
4726 return err;
4727}
4728
4627#ifdef CONFIG_NFS_V4_1 4729#ifdef CONFIG_NFS_V4_1
4628/* 4730/*
4629 * Check the exchange flags returned by the server for invalid flags, having 4731 * Check the exchange flags returned by the server for invalid flags, having
@@ -5516,8 +5618,6 @@ static void nfs4_layoutget_release(void *calldata)
5516 struct nfs4_layoutget *lgp = calldata; 5618 struct nfs4_layoutget *lgp = calldata;
5517 5619
5518 dprintk("--> %s\n", __func__); 5620 dprintk("--> %s\n", __func__);
5519 if (lgp->res.layout.buf != NULL)
5520 free_page((unsigned long) lgp->res.layout.buf);
5521 put_nfs_open_context(lgp->args.ctx); 5621 put_nfs_open_context(lgp->args.ctx);
5522 kfree(calldata); 5622 kfree(calldata);
5523 dprintk("<-- %s\n", __func__); 5623 dprintk("<-- %s\n", __func__);
@@ -5549,12 +5649,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5549 5649
5550 dprintk("--> %s\n", __func__); 5650 dprintk("--> %s\n", __func__);
5551 5651
5552 lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); 5652 lgp->res.layoutp = &lgp->args.layout;
5553 if (lgp->res.layout.buf == NULL) {
5554 nfs4_layoutget_release(lgp);
5555 return -ENOMEM;
5556 }
5557
5558 lgp->res.seq_res.sr_slot = NULL; 5653 lgp->res.seq_res.sr_slot = NULL;
5559 task = rpc_run_task(&task_setup_data); 5654 task = rpc_run_task(&task_setup_data);
5560 if (IS_ERR(task)) 5655 if (IS_ERR(task))
@@ -5586,7 +5681,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5586 int status; 5681 int status;
5587 5682
5588 dprintk("--> %s\n", __func__); 5683 dprintk("--> %s\n", __func__);
5589 status = nfs4_call_sync(server, &msg, &args, &res, 0); 5684 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
5590 dprintk("<-- %s status=%d\n", __func__, status); 5685 dprintk("<-- %s status=%d\n", __func__, status);
5591 5686
5592 return status; 5687 return status;
@@ -5606,6 +5701,100 @@ int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5606} 5701}
5607EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); 5702EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
5608 5703
5704static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
5705{
5706 struct nfs4_layoutcommit_data *data = calldata;
5707 struct nfs_server *server = NFS_SERVER(data->args.inode);
5708
5709 if (nfs4_setup_sequence(server, &data->args.seq_args,
5710 &data->res.seq_res, 1, task))
5711 return;
5712 rpc_call_start(task);
5713}
5714
5715static void
5716nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
5717{
5718 struct nfs4_layoutcommit_data *data = calldata;
5719 struct nfs_server *server = NFS_SERVER(data->args.inode);
5720
5721 if (!nfs4_sequence_done(task, &data->res.seq_res))
5722 return;
5723
5724 switch (task->tk_status) { /* Just ignore these failures */
5725 case NFS4ERR_DELEG_REVOKED: /* layout was recalled */
5726 case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */
5727 case NFS4ERR_BADLAYOUT: /* no layout */
5729	case NFS4ERR_GRACE:	/* loca_reclaim always false */
5729 task->tk_status = 0;
5730 }
5731
5732 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5733 nfs_restart_rpc(task, server->nfs_client);
5734 return;
5735 }
5736
5737 if (task->tk_status == 0)
5738 nfs_post_op_update_inode_force_wcc(data->args.inode,
5739 data->res.fattr);
5740}
5741
5742static void nfs4_layoutcommit_release(void *calldata)
5743{
5744 struct nfs4_layoutcommit_data *data = calldata;
5745
5746 /* Matched by references in pnfs_set_layoutcommit */
5747 put_lseg(data->lseg);
5748 put_rpccred(data->cred);
5749 kfree(data);
5750}
5751
5752static const struct rpc_call_ops nfs4_layoutcommit_ops = {
5753 .rpc_call_prepare = nfs4_layoutcommit_prepare,
5754 .rpc_call_done = nfs4_layoutcommit_done,
5755 .rpc_release = nfs4_layoutcommit_release,
5756};
5757
5758int
5759nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
5760{
5761 struct rpc_message msg = {
5762 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
5763 .rpc_argp = &data->args,
5764 .rpc_resp = &data->res,
5765 .rpc_cred = data->cred,
5766 };
5767 struct rpc_task_setup task_setup_data = {
5768 .task = &data->task,
5769 .rpc_client = NFS_CLIENT(data->args.inode),
5770 .rpc_message = &msg,
5771 .callback_ops = &nfs4_layoutcommit_ops,
5772 .callback_data = data,
5773 .flags = RPC_TASK_ASYNC,
5774 };
5775 struct rpc_task *task;
5776 int status = 0;
5777
5778 dprintk("NFS: %4d initiating layoutcommit call. sync %d "
5779 "lbw: %llu inode %lu\n",
5780 data->task.tk_pid, sync,
5781 data->args.lastbytewritten,
5782 data->args.inode->i_ino);
5783
5784 task = rpc_run_task(&task_setup_data);
5785 if (IS_ERR(task))
5786 return PTR_ERR(task);
5787 if (sync == false)
5788 goto out;
5789 status = nfs4_wait_for_completion_rpc_task(task);
5790 if (status != 0)
5791 goto out;
5792 status = task->tk_status;
5793out:
5794 dprintk("%s: status %d\n", __func__, status);
5795 rpc_put_task(task);
5796 return status;
5797}
5609#endif /* CONFIG_NFS_V4_1 */ 5798#endif /* CONFIG_NFS_V4_1 */
5610 5799
5611struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 5800struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5741,6 +5930,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5741 .close_context = nfs4_close_context, 5930 .close_context = nfs4_close_context,
5742 .open_context = nfs4_atomic_open, 5931 .open_context = nfs4_atomic_open,
5743 .init_client = nfs4_init_client, 5932 .init_client = nfs4_init_client,
5933 .secinfo = nfs4_proc_secinfo,
5744}; 5934};
5745 5935
5746static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { 5936static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
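
nfs4_proc_layoutcommit() above always launches the RPC as an async task and only blocks for the result when the caller passes sync. A user-space approximation of that fire-and-optionally-wait shape, using POSIX threads as stand-ins for rpc tasks (all demo_* names are invented; build with -pthread):

#include <stdio.h>
#include <stdbool.h>
#include <pthread.h>

struct demo_task {
	pthread_t thread;
	int status;
};

static void *demo_task_run(void *arg)
{
	struct demo_task *task = arg;

	task->status = 0;	/* pretend the round trip succeeded */
	return NULL;
}

static int demo_layoutcommit(struct demo_task *task, bool sync)
{
	if (pthread_create(&task->thread, NULL, demo_task_run, task))
		return -1;
	if (!sync) {
		pthread_detach(task->thread);
		return 0;	/* caller does not wait */
	}
	pthread_join(task->thread, NULL);
	printf("sync layoutcommit: status %d\n", task->status);
	return task->status;
}

int main(void)
{
	static struct demo_task t1, t2;

	demo_layoutcommit(&t1, true);	/* block for the result */
	demo_layoutcommit(&t2, false);	/* fire and forget */
	pthread_exit(NULL);	/* let the detached task finish */
}
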
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0cf560f77884..dddfb5795d7b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -46,6 +46,7 @@
46#include <linux/kdev_t.h> 46#include <linux/kdev_t.h>
47#include <linux/sunrpc/clnt.h> 47#include <linux/sunrpc/clnt.h>
48#include <linux/sunrpc/msg_prot.h> 48#include <linux/sunrpc/msg_prot.h>
49#include <linux/sunrpc/gss_api.h>
49#include <linux/nfs.h> 50#include <linux/nfs.h>
50#include <linux/nfs4.h> 51#include <linux/nfs4.h>
51#include <linux/nfs_fs.h> 52#include <linux/nfs_fs.h>
@@ -112,7 +113,7 @@ static int nfs4_stat_to_errno(int);
112#define encode_restorefh_maxsz (op_encode_hdr_maxsz) 113#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
113#define decode_restorefh_maxsz (op_decode_hdr_maxsz) 114#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
114#define encode_fsinfo_maxsz (encode_getattr_maxsz) 115#define encode_fsinfo_maxsz (encode_getattr_maxsz)
115#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) 116#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15)
116#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) 117#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
117#define decode_renew_maxsz (op_decode_hdr_maxsz) 118#define decode_renew_maxsz (op_decode_hdr_maxsz)
118#define encode_setclientid_maxsz \ 119#define encode_setclientid_maxsz \
@@ -253,6 +254,8 @@ static int nfs4_stat_to_errno(int);
253 (encode_getattr_maxsz) 254 (encode_getattr_maxsz)
254#define decode_fs_locations_maxsz \ 255#define decode_fs_locations_maxsz \
255 (0) 256 (0)
257#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
258#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)))
256 259
257#if defined(CONFIG_NFS_V4_1) 260#if defined(CONFIG_NFS_V4_1)
258#define NFS4_MAX_MACHINE_NAME_LEN (64) 261#define NFS4_MAX_MACHINE_NAME_LEN (64)
@@ -324,6 +327,18 @@ static int nfs4_stat_to_errno(int);
324#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ 327#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
325 decode_stateid_maxsz + \ 328 decode_stateid_maxsz + \
326 XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) 329 XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
330#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \
331 2 /* offset */ + \
332 2 /* length */ + \
333 1 /* reclaim */ + \
334 encode_stateid_maxsz + \
335 1 /* new offset (true) */ + \
336 2 /* last byte written */ + \
337 1 /* nt_timechanged (false) */ + \
338 1 /* layoutupdate4 layout type */ + \
339 1 /* NULL filelayout layoutupdate4 payload */)
340#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
341
327#else /* CONFIG_NFS_V4_1 */ 342#else /* CONFIG_NFS_V4_1 */
328#define encode_sequence_maxsz 0 343#define encode_sequence_maxsz 0
329#define decode_sequence_maxsz 0 344#define decode_sequence_maxsz 0
@@ -676,6 +691,14 @@ static int nfs4_stat_to_errno(int);
676 decode_putfh_maxsz + \ 691 decode_putfh_maxsz + \
677 decode_lookup_maxsz + \ 692 decode_lookup_maxsz + \
678 decode_fs_locations_maxsz) 693 decode_fs_locations_maxsz)
694#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \
695 encode_sequence_maxsz + \
696 encode_putfh_maxsz + \
697 encode_secinfo_maxsz)
698#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \
699 decode_sequence_maxsz + \
700 decode_putfh_maxsz + \
701 decode_secinfo_maxsz)
679#if defined(CONFIG_NFS_V4_1) 702#if defined(CONFIG_NFS_V4_1)
680#define NFS4_enc_exchange_id_sz \ 703#define NFS4_enc_exchange_id_sz \
681 (compound_encode_hdr_maxsz + \ 704 (compound_encode_hdr_maxsz + \
@@ -727,6 +750,17 @@ static int nfs4_stat_to_errno(int);
727 decode_sequence_maxsz + \ 750 decode_sequence_maxsz + \
728 decode_putfh_maxsz + \ 751 decode_putfh_maxsz + \
729 decode_layoutget_maxsz) 752 decode_layoutget_maxsz)
753#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
754 encode_sequence_maxsz +\
755 encode_putfh_maxsz + \
756 encode_layoutcommit_maxsz + \
757 encode_getattr_maxsz)
758#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
759 decode_sequence_maxsz + \
760 decode_putfh_maxsz + \
761 decode_layoutcommit_maxsz + \
762 decode_getattr_maxsz)
763
730 764
731const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 765const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
732 compound_encode_hdr_maxsz + 766 compound_encode_hdr_maxsz +
@@ -1620,6 +1654,18 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
1620 hdr->replen += decode_delegreturn_maxsz; 1654 hdr->replen += decode_delegreturn_maxsz;
1621} 1655}
1622 1656
1657static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1658{
1659 int len = name->len;
1660 __be32 *p;
1661
1662 p = reserve_space(xdr, 8 + len);
1663 *p++ = cpu_to_be32(OP_SECINFO);
1664 xdr_encode_opaque(p, name->name, len);
1665 hdr->nops++;
1666 hdr->replen += decode_secinfo_maxsz;
1667}
1668
1623#if defined(CONFIG_NFS_V4_1) 1669#if defined(CONFIG_NFS_V4_1)
1624/* NFSv4.1 operations */ 1670/* NFSv4.1 operations */
1625static void encode_exchange_id(struct xdr_stream *xdr, 1671static void encode_exchange_id(struct xdr_stream *xdr,
@@ -1816,6 +1862,34 @@ encode_layoutget(struct xdr_stream *xdr,
1816 hdr->nops++; 1862 hdr->nops++;
1817 hdr->replen += decode_layoutget_maxsz; 1863 hdr->replen += decode_layoutget_maxsz;
1818} 1864}
1865
1866static int
1867encode_layoutcommit(struct xdr_stream *xdr,
1868 const struct nfs4_layoutcommit_args *args,
1869 struct compound_hdr *hdr)
1870{
1871 __be32 *p;
1872
1873 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
1874 NFS_SERVER(args->inode)->pnfs_curr_ld->id);
1875
1876 p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
1877 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1878 /* Only whole file layouts */
1879 p = xdr_encode_hyper(p, 0); /* offset */
1880 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */
1881 *p++ = cpu_to_be32(0); /* reclaim */
1882 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
1883 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
1884 p = xdr_encode_hyper(p, args->lastbytewritten);
1885 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
1886 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
1887 *p++ = cpu_to_be32(0); /* no file layout payload */
1888
1889 hdr->nops++;
1890 hdr->replen += decode_layoutcommit_maxsz;
1891 return 0;
1892}
1819#endif /* CONFIG_NFS_V4_1 */ 1893#endif /* CONFIG_NFS_V4_1 */
1820 1894
1821/* 1895/*
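
encode_layoutcommit() above reserves its entire fixed-size body up front (48 bytes plus the stateid) and then fills big-endian words in place. A freestanding sketch of that reserve-then-fill XDR pattern; reserve_space_demo(), encode_u32() and encode_hyper() are simplified stand-ins for the kernel helpers, and only the first few fields are encoded:

#include <stdint.h>
#include <stdio.h>

static uint8_t demo_buf[64];
static size_t demo_used;

/* claim nbytes up front; fail if the send buffer cannot hold them */
static uint8_t *reserve_space_demo(size_t nbytes)
{
	uint8_t *p = demo_buf + demo_used;

	if (demo_used + nbytes > sizeof(demo_buf))
		return NULL;
	demo_used += nbytes;
	return p;
}

static uint8_t *encode_u32(uint8_t *p, uint32_t v)
{
	int i;

	for (i = 3; i >= 0; i--)
		*p++ = (uint8_t)(v >> (8 * i));
	return p;
}

static uint8_t *encode_hyper(uint8_t *p, uint64_t v)
{
	p = encode_u32(p, (uint32_t)(v >> 32));
	return encode_u32(p, (uint32_t)v);
}

int main(void)
{
	/* 4 (opcode) + 8 (offset) + 8 (length) + 4 (reclaim) bytes */
	uint8_t *p = reserve_space_demo(24);

	if (!p)
		return 1;
	p = encode_u32(p, 49);			/* OP_LAYOUTCOMMIT */
	p = encode_hyper(p, 0);			/* offset */
	p = encode_hyper(p, UINT64_MAX);	/* length: whole file */
	p = encode_u32(p, 0);			/* reclaim = FALSE */
	printf("encoded %zu bytes\n", demo_used);
	return 0;
}
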
@@ -2294,7 +2368,8 @@ static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
2294 encode_sequence(xdr, &args->seq_args, &hdr); 2368 encode_sequence(xdr, &args->seq_args, &hdr);
2295 encode_putfh(xdr, args->fh, &hdr); 2369 encode_putfh(xdr, args->fh, &hdr);
2296 encode_commit(xdr, args, &hdr); 2370 encode_commit(xdr, args, &hdr);
2297 encode_getfattr(xdr, args->bitmask, &hdr); 2371 if (args->bitmask)
2372 encode_getfattr(xdr, args->bitmask, &hdr);
2298 encode_nops(&hdr); 2373 encode_nops(&hdr);
2299} 2374}
2300 2375
@@ -2465,6 +2540,24 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
2465 encode_nops(&hdr); 2540 encode_nops(&hdr);
2466} 2541}
2467 2542
2543/*
2544 * Encode SECINFO request
2545 */
2546static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
2547 struct xdr_stream *xdr,
2548 struct nfs4_secinfo_arg *args)
2549{
2550 struct compound_hdr hdr = {
2551 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2552 };
2553
2554 encode_compound_hdr(xdr, req, &hdr);
2555 encode_sequence(xdr, &args->seq_args, &hdr);
2556 encode_putfh(xdr, args->dir_fh, &hdr);
2557 encode_secinfo(xdr, args->name, &hdr);
2558 encode_nops(&hdr);
2559}
2560
2468#if defined(CONFIG_NFS_V4_1) 2561#if defined(CONFIG_NFS_V4_1)
2469/* 2562/*
2470 * EXCHANGE_ID request 2563 * EXCHANGE_ID request
@@ -2604,8 +2697,32 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
2604 encode_sequence(xdr, &args->seq_args, &hdr); 2697 encode_sequence(xdr, &args->seq_args, &hdr);
2605 encode_putfh(xdr, NFS_FH(args->inode), &hdr); 2698 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2606 encode_layoutget(xdr, args, &hdr); 2699 encode_layoutget(xdr, args, &hdr);
2700
2701 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
2702 args->layout.pages, 0, args->layout.pglen);
2703
2607 encode_nops(&hdr); 2704 encode_nops(&hdr);
2608} 2705}
2706
2707/*
2708 * Encode LAYOUTCOMMIT request
2709 */
2710static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
2711 struct xdr_stream *xdr,
2712 struct nfs4_layoutcommit_args *args)
2713{
2714 struct compound_hdr hdr = {
2715 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2716 };
2717
2718 encode_compound_hdr(xdr, req, &hdr);
2719 encode_sequence(xdr, &args->seq_args, &hdr);
2720 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2721 encode_layoutcommit(xdr, args, &hdr);
2722 encode_getfattr(xdr, args->bitmask, &hdr);
2723 encode_nops(&hdr);
2724 return 0;
2725}
2609#endif /* CONFIG_NFS_V4_1 */ 2726#endif /* CONFIG_NFS_V4_1 */
2610 2727
2611static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 2728static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2925,6 +3042,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
2925 if (unlikely(!p)) 3042 if (unlikely(!p))
2926 goto out_overflow; 3043 goto out_overflow;
2927 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; 3044 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
3045 return -be32_to_cpup(p);
2928 } 3046 }
2929 return 0; 3047 return 0;
2930out_overflow: 3048out_overflow:
@@ -3912,6 +4030,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3912 fattr->valid |= status; 4030 fattr->valid |= status;
3913 4031
3914 status = decode_attr_error(xdr, bitmap); 4032 status = decode_attr_error(xdr, bitmap);
4033 if (status == -NFS4ERR_WRONGSEC) {
4034 nfs_fixup_secinfo_attributes(fattr, fh);
4035 status = 0;
4036 }
3915 if (status < 0) 4037 if (status < 0)
3916 goto xdr_error; 4038 goto xdr_error;
3917 4039
@@ -4680,6 +4802,73 @@ static int decode_delegreturn(struct xdr_stream *xdr)
4680 return decode_op_hdr(xdr, OP_DELEGRETURN); 4802 return decode_op_hdr(xdr, OP_DELEGRETURN);
4681} 4803}
4682 4804
4805static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor)
4806{
4807 __be32 *p;
4808
4809 p = xdr_inline_decode(xdr, 4);
4810 if (unlikely(!p))
4811 goto out_overflow;
4812 flavor->gss.sec_oid4.len = be32_to_cpup(p);
4813 if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN)
4814 goto out_err;
4815
4816 p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len);
4817 if (unlikely(!p))
4818 goto out_overflow;
4819 memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len);
4820
4821 p = xdr_inline_decode(xdr, 8);
4822 if (unlikely(!p))
4823 goto out_overflow;
4824 flavor->gss.qop4 = be32_to_cpup(p++);
4825 flavor->gss.service = be32_to_cpup(p);
4826
4827 return 0;
4828
4829out_overflow:
4830 print_overflow_msg(__func__, xdr);
4831 return -EIO;
4832out_err:
4833 return -EINVAL;
4834}
4835
4836static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
4837{
4838 struct nfs4_secinfo_flavor *sec_flavor;
4839 int status;
4840 __be32 *p;
4841 int i;
4842
4843 status = decode_op_hdr(xdr, OP_SECINFO);
4844 p = xdr_inline_decode(xdr, 4);
4845 if (unlikely(!p))
4846 goto out_overflow;
4847 res->flavors->num_flavors = be32_to_cpup(p);
4848
4849 for (i = 0; i < res->flavors->num_flavors; i++) {
4850 sec_flavor = &res->flavors->flavors[i];
4851 if ((char *)&sec_flavor[1] - (char *)res > PAGE_SIZE)
4852 break;
4853
4854 p = xdr_inline_decode(xdr, 4);
4855 if (unlikely(!p))
4856 goto out_overflow;
4857 sec_flavor->flavor = be32_to_cpup(p);
4858
4859 if (sec_flavor->flavor == RPC_AUTH_GSS) {
4860 if (decode_secinfo_gss(xdr, sec_flavor))
4861 break;
4862 }
4863 }
4864
4865 return 0;
4866
4867out_overflow:
4868 print_overflow_msg(__func__, xdr);
4869 return -EIO;
4870}
4871
4683#if defined(CONFIG_NFS_V4_1) 4872#if defined(CONFIG_NFS_V4_1)
4684static int decode_exchange_id(struct xdr_stream *xdr, 4873static int decode_exchange_id(struct xdr_stream *xdr,
4685 struct nfs41_exchange_id_res *res) 4874 struct nfs41_exchange_id_res *res)
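
decode_secinfo() above trusts the server for num_flavors but caps how many entries it actually stores: before each flavor it checks that the next slot still fits in the single page backing the result. That guard in isolation, as a runnable sketch with invented demo_* types and a demo page size:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define DEMO_PAGE_SIZE 4096

struct demo_flavor { uint32_t flavor; uint8_t oid[32]; };

struct demo_flavors {
	uint32_t num_flavors;
	struct demo_flavor flavors[];
};

int main(void)
{
	struct demo_flavors *res = malloc(DEMO_PAGE_SIZE);
	uint32_t i, advertised = 500;	/* a server can claim anything */

	if (!res)
		return 1;
	for (i = 0; i < advertised; i++) {
		struct demo_flavor *f = &res->flavors[i];

		/* same pointer-arithmetic check as the patch */
		if ((char *)&f[1] - (char *)res > DEMO_PAGE_SIZE)
			break;
		f->flavor = i;	/* stand-in for the decoded value */
	}
	res->num_flavors = i;
	printf("kept %u of %u advertised flavors\n", i, advertised);
	free(res);
	return 0;
}
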
@@ -4950,6 +5139,9 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
4950 __be32 *p; 5139 __be32 *p;
4951 int status; 5140 int status;
4952 u32 layout_count; 5141 u32 layout_count;
5142 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
5143 struct kvec *iov = rcvbuf->head;
5144 u32 hdrlen, recvd;
4953 5145
4954 status = decode_op_hdr(xdr, OP_LAYOUTGET); 5146 status = decode_op_hdr(xdr, OP_LAYOUTGET);
4955 if (status) 5147 if (status)
@@ -4966,17 +5158,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
4966 return -EINVAL; 5158 return -EINVAL;
4967 } 5159 }
4968 5160
4969 p = xdr_inline_decode(xdr, 24); 5161 p = xdr_inline_decode(xdr, 28);
4970 if (unlikely(!p)) 5162 if (unlikely(!p))
4971 goto out_overflow; 5163 goto out_overflow;
4972 p = xdr_decode_hyper(p, &res->range.offset); 5164 p = xdr_decode_hyper(p, &res->range.offset);
4973 p = xdr_decode_hyper(p, &res->range.length); 5165 p = xdr_decode_hyper(p, &res->range.length);
4974 res->range.iomode = be32_to_cpup(p++); 5166 res->range.iomode = be32_to_cpup(p++);
4975 res->type = be32_to_cpup(p++); 5167 res->type = be32_to_cpup(p++);
4976 5168 res->layoutp->len = be32_to_cpup(p);
4977 status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
4978 if (unlikely(status))
4979 return status;
4980 5169
4981 dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", 5170 dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
4982 __func__, 5171 __func__,
@@ -4984,12 +5173,18 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
4984 (unsigned long)res->range.length, 5173 (unsigned long)res->range.length,
4985 res->range.iomode, 5174 res->range.iomode,
4986 res->type, 5175 res->type,
4987 res->layout.len); 5176 res->layoutp->len);
4988 5177
4989 /* nfs4_proc_layoutget allocated a single page */ 5178 hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
4990 if (res->layout.len > PAGE_SIZE) 5179 recvd = req->rq_rcv_buf.len - hdrlen;
4991 return -ENOMEM; 5180 if (res->layoutp->len > recvd) {
4992 memcpy(res->layout.buf, p, res->layout.len); 5181 dprintk("NFS: server cheating in layoutget reply: "
5182 "layout len %u > recvd %u\n",
5183 res->layoutp->len, recvd);
5184 return -EINVAL;
5185 }
5186
5187 xdr_read_pages(xdr, res->layoutp->len);
4993 5188
4994 if (layout_count > 1) { 5189 if (layout_count > 1) {
4995 /* We only handle a length one array at the moment. Any 5190 /* We only handle a length one array at the moment. Any
@@ -5006,6 +5201,35 @@ out_overflow:
5006 print_overflow_msg(__func__, xdr); 5201 print_overflow_msg(__func__, xdr);
5007 return -EIO; 5202 return -EIO;
5008} 5203}
5204
5205static int decode_layoutcommit(struct xdr_stream *xdr,
5206 struct rpc_rqst *req,
5207 struct nfs4_layoutcommit_res *res)
5208{
5209 __be32 *p;
5210 __u32 sizechanged;
5211 int status;
5212
5213 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
5214 if (status)
5215 return status;
5216
5217 p = xdr_inline_decode(xdr, 4);
5218 if (unlikely(!p))
5219 goto out_overflow;
5220 sizechanged = be32_to_cpup(p);
5221
5222 if (sizechanged) {
5223 /* throw away new size */
5224 p = xdr_inline_decode(xdr, 8);
5225 if (unlikely(!p))
5226 goto out_overflow;
5227 }
5228 return 0;
5229out_overflow:
5230 print_overflow_msg(__func__, xdr);
5231 return -EIO;
5232}
5009#endif /* CONFIG_NFS_V4_1 */ 5233#endif /* CONFIG_NFS_V4_1 */
5010 5234
5011/* 5235/*
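
decode_layoutget() no longer copies the opaque layout into a private page; it validates the advertised length against the bytes actually received and then lets xdr_read_pages() expose the preregistered reply pages. The length check on its own, as a runnable sketch (the function name and the numbers in main() are illustrative only):

#include <stdint.h>
#include <stdio.h>

/* an opaque length claimed by the peer must fit in what was received */
static int check_opaque_len(uint32_t claimed, uint32_t reply_len,
			    uint32_t hdrlen)
{
	uint32_t recvd = reply_len - hdrlen;

	if (claimed > recvd) {
		fprintf(stderr, "layout len %u > recvd %u\n", claimed, recvd);
		return -22;	/* -EINVAL, as in the patch */
	}
	return 0;
}

int main(void)
{
	/* 8 KiB reply, 200 header bytes consumed, server claims 16 KiB */
	if (check_opaque_len(16384, 8192, 200))
		return 1;
	return 0;
}
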
@@ -5723,8 +5947,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5723 status = decode_commit(xdr, res); 5947 status = decode_commit(xdr, res);
5724 if (status) 5948 if (status)
5725 goto out; 5949 goto out;
5726 decode_getfattr(xdr, res->fattr, res->server, 5950 if (res->fattr)
5727 !RPC_IS_ASYNC(rqstp->rq_task)); 5951 decode_getfattr(xdr, res->fattr, res->server,
5952 !RPC_IS_ASYNC(rqstp->rq_task));
5728out: 5953out:
5729 return status; 5954 return status;
5730} 5955}
@@ -5919,6 +6144,32 @@ out:
5919 return status; 6144 return status;
5920} 6145}
5921 6146
6147/*
6148 * Decode SECINFO response
6149 */
6150static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
6151 struct xdr_stream *xdr,
6152 struct nfs4_secinfo_res *res)
6153{
6154 struct compound_hdr hdr;
6155 int status;
6156
6157 status = decode_compound_hdr(xdr, &hdr);
6158 if (status)
6159 goto out;
6160 status = decode_sequence(xdr, &res->seq_res, rqstp);
6161 if (status)
6162 goto out;
6163 status = decode_putfh(xdr);
6164 if (status)
6165 goto out;
6166 status = decode_secinfo(xdr, res);
6167 if (status)
6168 goto out;
6169out:
6170 return status;
6171}
6172
5922#if defined(CONFIG_NFS_V4_1) 6173#if defined(CONFIG_NFS_V4_1)
5923/* 6174/*
5924 * Decode EXCHANGE_ID response 6175 * Decode EXCHANGE_ID response
@@ -6066,6 +6317,34 @@ static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
6066out: 6317out:
6067 return status; 6318 return status;
6068} 6319}
6320
6321/*
6322 * Decode LAYOUTCOMMIT response
6323 */
6324static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
6325 struct xdr_stream *xdr,
6326 struct nfs4_layoutcommit_res *res)
6327{
6328 struct compound_hdr hdr;
6329 int status;
6330
6331 status = decode_compound_hdr(xdr, &hdr);
6332 if (status)
6333 goto out;
6334 status = decode_sequence(xdr, &res->seq_res, rqstp);
6335 if (status)
6336 goto out;
6337 status = decode_putfh(xdr);
6338 if (status)
6339 goto out;
6340 status = decode_layoutcommit(xdr, rqstp, res);
6341 if (status)
6342 goto out;
6343 decode_getfattr(xdr, res->fattr, res->server,
6344 !RPC_IS_ASYNC(rqstp->rq_task));
6345out:
6346 return status;
6347}
6069#endif /* CONFIG_NFS_V4_1 */ 6348#endif /* CONFIG_NFS_V4_1 */
6070 6349
6071/** 6350/**
@@ -6180,10 +6459,6 @@ static struct {
6180 { NFS4ERR_SYMLINK, -ELOOP }, 6459 { NFS4ERR_SYMLINK, -ELOOP },
6181 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, 6460 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
6182 { NFS4ERR_DEADLOCK, -EDEADLK }, 6461 { NFS4ERR_DEADLOCK, -EDEADLK },
6183 { NFS4ERR_WRONGSEC, -EPERM }, /* FIXME: this needs
6184 * to be handled by a
6185 * middle-layer.
6186 */
6187 { -1, -EIO } 6462 { -1, -EIO }
6188}; 6463};
6189 6464
@@ -6258,6 +6533,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6258 PROC(SETACL, enc_setacl, dec_setacl), 6533 PROC(SETACL, enc_setacl, dec_setacl),
6259 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6534 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
6260 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), 6535 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
6536 PROC(SECINFO, enc_secinfo, dec_secinfo),
6261#if defined(CONFIG_NFS_V4_1) 6537#if defined(CONFIG_NFS_V4_1)
6262 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6538 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
6263 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6539 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
@@ -6267,6 +6543,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6267 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6543 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6268 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6544 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6269 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6545 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6546 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
6270#endif /* CONFIG_NFS_V4_1 */ 6547#endif /* CONFIG_NFS_V4_1 */
6271}; 6548};
6272 6549
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 23e794410669..87a593c2b055 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -223,6 +223,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
223 desc->pg_count = 0; 223 desc->pg_count = 0;
224 desc->pg_bsize = bsize; 224 desc->pg_bsize = bsize;
225 desc->pg_base = 0; 225 desc->pg_base = 0;
226 desc->pg_moreio = 0;
226 desc->pg_inode = inode; 227 desc->pg_inode = inode;
227 desc->pg_doio = doio; 228 desc->pg_doio = doio;
228 desc->pg_ioflags = io_flags; 229 desc->pg_ioflags = io_flags;
@@ -335,9 +336,11 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
335 struct nfs_page *req) 336 struct nfs_page *req)
336{ 337{
337 while (!nfs_pageio_do_add_request(desc, req)) { 338 while (!nfs_pageio_do_add_request(desc, req)) {
339 desc->pg_moreio = 1;
338 nfs_pageio_doio(desc); 340 nfs_pageio_doio(desc);
339 if (desc->pg_error < 0) 341 if (desc->pg_error < 0)
340 return 0; 342 return 0;
343 desc->pg_moreio = 0;
341 } 344 }
342 return 1; 345 return 1;
343} 346}
@@ -395,6 +398,7 @@ int nfs_scan_list(struct nfs_inode *nfsi,
395 pgoff_t idx_end; 398 pgoff_t idx_end;
396 int found, i; 399 int found, i;
397 int res; 400 int res;
401 struct list_head *list;
398 402
399 res = 0; 403 res = 0;
400 if (npages == 0) 404 if (npages == 0)
@@ -415,10 +419,10 @@ int nfs_scan_list(struct nfs_inode *nfsi,
415 idx_start = req->wb_index + 1; 419 idx_start = req->wb_index + 1;
416 if (nfs_set_page_tag_locked(req)) { 420 if (nfs_set_page_tag_locked(req)) {
417 kref_get(&req->wb_kref); 421 kref_get(&req->wb_kref);
418 nfs_list_remove_request(req);
419 radix_tree_tag_clear(&nfsi->nfs_page_tree, 422 radix_tree_tag_clear(&nfsi->nfs_page_tree,
420 req->wb_index, tag); 423 req->wb_index, tag);
421 nfs_list_add_request(req, dst); 424 list = pnfs_choose_commit_list(req, dst);
425 nfs_list_add_request(req, list);
422 res++; 426 res++;
423 if (res == INT_MAX) 427 if (res == INT_MAX)
424 goto out; 428 goto out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f38813a0a295..d9ab97269ce6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -259,6 +259,7 @@ put_lseg(struct pnfs_layout_segment *lseg)
 		pnfs_free_lseg_list(&free_me);
 	}
 }
+EXPORT_SYMBOL_GPL(put_lseg);
 
 static bool
 should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
@@ -471,6 +472,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs4_layoutget *lgp;
 	struct pnfs_layout_segment *lseg = NULL;
+	struct page **pages = NULL;
+	int i;
+	u32 max_resp_sz, max_pages;
 
 	dprintk("--> %s\n", __func__);
 
@@ -478,6 +482,21 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
 	if (lgp == NULL)
 		return NULL;
+
+	/* allocate pages for xdr post processing */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	max_pages = max_resp_sz >> PAGE_SHIFT;
+
+	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto out_err_free;
+
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_err_free;
+	}
+
 	lgp->args.minlength = NFS4_MAX_UINT64;
 	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
 	lgp->args.range.iomode = iomode;
@@ -486,6 +505,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	lgp->args.type = server->pnfs_curr_ld->id;
 	lgp->args.inode = ino;
 	lgp->args.ctx = get_nfs_open_context(ctx);
+	lgp->args.layout.pages = pages;
+	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
 	lgp->lsegpp = &lseg;
 
 	/* Synchronously retrieve layout information from server and
@@ -496,7 +517,26 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 		/* remember that LAYOUTGET failed and suspend trying */
 		set_bit(lo_fail_bit(iomode), &lo->plh_flags);
 	}
+
+	/* free xdr pages */
+	for (i = 0; i < max_pages; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+
 	return lseg;
+
+out_err_free:
+	/* free any allocated xdr pages, lgp as it's not used */
+	if (pages) {
+		for (i = 0; i < max_pages; i++) {
+			if (!pages[i])
+				break;
+			__free_page(pages[i]);
+		}
+		kfree(pages);
+	}
+	kfree(lgp);
+	return NULL;
 }
 
 bool pnfs_roc(struct inode *ino)
@@ -945,3 +985,105 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
 	return trypnfs;
 }
+
+/*
+ * Currently there is only one (whole file) write lseg.
+ */
+static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode)
+{
+	struct pnfs_layout_segment *lseg, *rv = NULL;
+
+	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
+		if (lseg->pls_range.iomode == IOMODE_RW)
+			rv = lseg;
+	return rv;
+}
+
+void
+pnfs_set_layoutcommit(struct nfs_write_data *wdata)
+{
+	struct nfs_inode *nfsi = NFS_I(wdata->inode);
+	loff_t end_pos = wdata->args.offset + wdata->res.count;
+
+	spin_lock(&nfsi->vfs_inode.i_lock);
+	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+		/* references matched in nfs4_layoutcommit_release */
+		get_lseg(wdata->lseg);
+		wdata->lseg->pls_lc_cred =
+			get_rpccred(wdata->args.context->state->owner->so_cred);
+		mark_inode_dirty_sync(wdata->inode);
+		dprintk("%s: Set layoutcommit for inode %lu ",
+			__func__, wdata->inode->i_ino);
+	}
+	if (end_pos > wdata->lseg->pls_end_pos)
+		wdata->lseg->pls_end_pos = end_pos;
+	spin_unlock(&nfsi->vfs_inode.i_lock);
+}
+EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+
+/*
+ * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
+ * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
+ * data to disk to allow the server to recover the data if it crashes.
+ * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
+ * is off, and a COMMIT is sent to a data server, or
+ * if WRITEs to a data server return NFS_DATA_SYNC.
+ */
+int
+pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+{
+	struct nfs4_layoutcommit_data *data;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct pnfs_layout_segment *lseg;
+	struct rpc_cred *cred;
+	loff_t end_pos;
+	int status = 0;
+
+	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
+
+	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+		return 0;
+
+	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
+	data = kzalloc(sizeof(*data), GFP_NOFS);
+	if (!data) {
+		mark_inode_dirty_sync(inode);
+		status = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock(&inode->i_lock);
+	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+		spin_unlock(&inode->i_lock);
+		kfree(data);
+		goto out;
+	}
+	/*
+	 * Currently only one (whole file) write lseg which is referenced
+	 * in pnfs_set_layoutcommit and will be found.
+	 */
+	lseg = pnfs_list_write_lseg(inode);
+
+	end_pos = lseg->pls_end_pos;
+	cred = lseg->pls_lc_cred;
+	lseg->pls_end_pos = 0;
+	lseg->pls_lc_cred = NULL;
+
+	memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
+		sizeof(nfsi->layout->plh_stateid.data));
+	spin_unlock(&inode->i_lock);
+
+	data->args.inode = inode;
+	data->lseg = lseg;
+	data->cred = cred;
+	nfs_fattr_init(&data->fattr);
+	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
+	data->res.fattr = &data->fattr;
+	data->args.lastbytewritten = end_pos - 1;
+	data->res.server = NFS_SERVER(inode);
+
+	status = nfs4_proc_layoutcommit(data, sync);
+out:
+	dprintk("<-- %s status %d\n", __func__, status);
+	return status;
+}
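
send_layoutget() now pre-allocates a page array sized from the session's maximum reply so the LAYOUTGET XDR response can be post-processed, and unwinds cleanly if any single page allocation fails. A userspace sketch of that allocate-then-unwind pattern, assuming plain malloc/free in place of alloc_page/__free_page:

    #include <stdlib.h>
    #include <stdio.h>

    /* Allocate n fixed-size buffers; on partial failure free what was taken. */
    static void **alloc_buf_array(size_t n, size_t bufsz)
    {
        void **bufs = calloc(n, sizeof(*bufs));
        size_t i;

        if (!bufs)
            return NULL;
        for (i = 0; i < n; i++) {
            bufs[i] = malloc(bufsz);
            if (!bufs[i])
                goto err;
        }
        return bufs;
    err:
        while (i--)                 /* free only the buffers allocated so far */
            free(bufs[i]);
        free(bufs);
        return NULL;
    }

    int main(void)
    {
        void **bufs = alloc_buf_array(8, 4096);

        if (bufs) {
            puts("allocated 8 reply buffers");
            for (size_t i = 0; i < 8; i++)
                free(bufs[i]);
            free(bufs);
        }
        return 0;
    }
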
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 6380b9405bcd..bc4827202e7a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -43,6 +43,8 @@ struct pnfs_layout_segment {
 	atomic_t pls_refcount;
 	unsigned long pls_flags;
 	struct pnfs_layout_hdr *pls_layout;
+	struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
+	loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
 };
 
 enum pnfs_try_status {
@@ -74,6 +76,13 @@ struct pnfs_layoutdriver_type {
 	/* test for nfs page cache coalescing */
 	int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
 
+	/* Returns true if layoutdriver wants to divert this request to
+	 * driver's commit routine.
+	 */
+	bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg);
+	struct list_head * (*choose_commit_list) (struct nfs_page *req);
+	int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how);
+
 	/*
 	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
 	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
@@ -100,7 +109,6 @@ struct pnfs_device {
 	unsigned int layout_type;
 	unsigned int mincount;
 	struct page **pages;
-	void *area;
 	unsigned int pgbase;
 	unsigned int pglen;
 };
@@ -145,7 +153,8 @@ bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
-
+void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
+int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 
 static inline int lo_fail_bit(u32 iomode)
 {
@@ -169,6 +178,51 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 	return nfss->pnfs_curr_ld != NULL;
 }
 
+static inline void
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+{
+	if (lseg) {
+		struct pnfs_layoutdriver_type *ld;
+
+		ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld;
+		if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) {
+			set_bit(PG_PNFS_COMMIT, &req->wb_flags);
+			req->wb_commit_lseg = get_lseg(lseg);
+		}
+	}
+}
+
+static inline int
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
+{
+	if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags))
+		return PNFS_NOT_ATTEMPTED;
+	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how);
+}
+
+static inline struct list_head *
+pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
+{
+	struct list_head *rv;
+
+	if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) {
+		struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode;
+
+		set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
+		rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req);
+		/* matched by ref taken when PG_PNFS_COMMIT is set */
+		put_lseg(req->wb_commit_lseg);
+	} else
+		rv = mds;
+	return rv;
+}
+
+static inline void pnfs_clear_request_commit(struct nfs_page *req)
+{
+	if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags))
+		put_lseg(req->wb_commit_lseg);
+}
+
 #else /* CONFIG_NFS_V4_1 */
 
 static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -252,6 +306,31 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
 	pgio->pg_test = NULL;
 }
 
+static inline void
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+{
+}
+
+static inline int
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
+{
+	return PNFS_NOT_ATTEMPTED;
+}
+
+static inline struct list_head *
+pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
+{
+	return mds;
+}
+
+static inline void pnfs_clear_request_commit(struct nfs_page *req)
+{
+}
+
+static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+{
+	return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
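
The three new pnfs_layoutdriver_type hooks are optional: pnfs_commit_list() falls back to the regular MDS commit path when no driver claims the pages. A userspace sketch of that optional-hook dispatch pattern (hypothetical names, simplified types):

    #include <stdio.h>
    #include <stddef.h>

    #define NOT_ATTEMPTED (-1)          /* analogue of PNFS_NOT_ATTEMPTED */

    struct driver_ops {                 /* hypothetical, simplified */
        int (*commit_pagelist)(int how);
    };

    static int dispatch_commit(const struct driver_ops *ops, int how)
    {
        if (ops && ops->commit_pagelist)
            return ops->commit_pagelist(how);
        return NOT_ATTEMPTED;
    }

    static int my_commit(int how)
    {
        printf("driver commit, how=%d\n", how);
        return 0;
    }

    int main(void)
    {
        struct driver_ops files_ops = { .commit_pagelist = my_commit };

        if (dispatch_commit(&files_ops, 0) == NOT_ATTEMPTED)
            puts("fall back to MDS commit");
        if (dispatch_commit(NULL, 0) == NOT_ATTEMPTED)
            puts("fall back to MDS commit");
        return 0;
    }
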
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b8ec170f2a0f..ac40b8535d7e 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -177,7 +177,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 }
 
 static int
-nfs_proc_lookup(struct inode *dir, struct qstr *name,
+nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
 		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs_diropargs arg = {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 47a3ad63e0d5..85d75254328e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -59,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
 	}
 	return p;
 }
+EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
 
 void nfs_commit_free(struct nfs_write_data *p)
 {
@@ -66,6 +67,7 @@ void nfs_commit_free(struct nfs_write_data *p)
 	kfree(p->pagevec);
 	mempool_free(p, nfs_commit_mempool);
 }
+EXPORT_SYMBOL_GPL(nfs_commit_free);
 
 struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 {
@@ -179,8 +181,8 @@ static int wb_priority(struct writeback_control *wbc)
 	if (wbc->for_reclaim)
 		return FLUSH_HIGHPRI | FLUSH_STABLE;
 	if (wbc->for_kupdate || wbc->for_background)
-		return FLUSH_LOWPRI;
-	return 0;
+		return FLUSH_LOWPRI | FLUSH_COND_STABLE;
+	return FLUSH_COND_STABLE;
 }
 
 /*
@@ -441,7 +443,7 @@ nfs_mark_request_dirty(struct nfs_page *req)
  * Add a request to the inode's commit list.
  */
 static void
-nfs_mark_request_commit(struct nfs_page *req)
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -453,6 +455,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 			NFS_PAGE_TAG_COMMIT);
 	nfsi->ncommit++;
 	spin_unlock(&inode->i_lock);
+	pnfs_mark_request_commit(req, lseg);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -474,14 +477,18 @@ nfs_clear_request_commit(struct nfs_page *req)
 static inline
 int nfs_write_need_commit(struct nfs_write_data *data)
 {
-	return data->verf.committed != NFS_FILE_SYNC;
+	if (data->verf.committed == NFS_DATA_SYNC)
+		return data->lseg == NULL;
+	else
+		return data->verf.committed != NFS_FILE_SYNC;
 }
 
 static inline
-int nfs_reschedule_unstable_write(struct nfs_page *req)
+int nfs_reschedule_unstable_write(struct nfs_page *req,
+				  struct nfs_write_data *data)
 {
 	if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
-		nfs_mark_request_commit(req);
+		nfs_mark_request_commit(req, data->lseg);
 		return 1;
 	}
 	if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
@@ -492,7 +499,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
 }
 #else
 static inline void
-nfs_mark_request_commit(struct nfs_page *req)
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
 {
 }
 
@@ -509,7 +516,8 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 }
 
 static inline
-int nfs_reschedule_unstable_write(struct nfs_page *req)
+int nfs_reschedule_unstable_write(struct nfs_page *req,
+				  struct nfs_write_data *data)
 {
 	return 0;
 }
@@ -612,9 +620,11 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 	}
 
 	if (nfs_clear_request_commit(req) &&
 	    radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
-				 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL)
+				 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) {
 		NFS_I(inode)->ncommit--;
+		pnfs_clear_request_commit(req);
+	}
 
 	/* Okay, the request matches. Update the region */
 	if (offset < req->wb_offset) {
@@ -762,11 +772,12 @@ int nfs_updatepage(struct file *file, struct page *page,
 	return status;
 }
 
-static void nfs_writepage_release(struct nfs_page *req)
+static void nfs_writepage_release(struct nfs_page *req,
+				  struct nfs_write_data *data)
 {
 	struct page *page = req->wb_page;
 
-	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
+	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
 		nfs_inode_remove_request(req);
 	nfs_clear_page_tag_locked(req);
 	nfs_end_page_writeback(page);
@@ -863,7 +874,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	data->args.context = get_nfs_open_context(req->wb_context);
 	data->args.lock_context = req->wb_lock_context;
 	data->args.stable = NFS_UNSTABLE;
-	if (how & FLUSH_STABLE) {
+	if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
 		data->args.stable = NFS_DATA_SYNC;
 		if (!nfs_need_commit(NFS_I(inode)))
 			data->args.stable = NFS_FILE_SYNC;
@@ -912,6 +923,12 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
 
 		nfs_list_remove_request(req);
 
+		if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
+		    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit ||
+		     desc->pg_count > wsize))
+			desc->pg_ioflags &= ~FLUSH_COND_STABLE;
+
+
 		nbytes = desc->pg_count;
 		do {
 			size_t len = min(nbytes, wsize);
@@ -1002,6 +1019,10 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
 	if ((!lseg) && list_is_singular(&data->pages))
 		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
 
+	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
+	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
+		desc->pg_ioflags &= ~FLUSH_COND_STABLE;
+
 	/* Set up the argument struct */
 	ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
 out:
@@ -1074,7 +1095,7 @@ static void nfs_writeback_release_partial(void *calldata)
 
 out:
 	if (atomic_dec_and_test(&req->wb_complete))
-		nfs_writepage_release(req);
+		nfs_writepage_release(req, data);
 	nfs_writedata_release(calldata);
 }
 
@@ -1141,7 +1162,7 @@ static void nfs_writeback_release_full(void *calldata)
 
 	if (nfs_write_need_commit(data)) {
 		memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
-		nfs_mark_request_commit(req);
+		nfs_mark_request_commit(req, data->lseg);
 		dprintk(" marked for commit\n");
 		goto next;
 	}
@@ -1251,57 +1272,82 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
 {
+	int ret;
+
 	if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
 		return 1;
-	if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags,
-				NFS_INO_COMMIT, nfs_wait_bit_killable,
-				TASK_KILLABLE))
-		return 1;
-	return 0;
+	if (!may_wait)
+		return 0;
+	ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
+				NFS_INO_COMMIT,
+				nfs_wait_bit_killable,
+				TASK_KILLABLE);
+	return (ret < 0) ? ret : 1;
 }
 
-static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+void nfs_commit_clear_lock(struct nfs_inode *nfsi)
 {
 	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
 }
+EXPORT_SYMBOL_GPL(nfs_commit_clear_lock);
 
-
-static void nfs_commitdata_release(void *data)
+void nfs_commitdata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
+	put_lseg(wdata->lseg);
 	put_nfs_open_context(wdata->args.context);
 	nfs_commit_free(wdata);
 }
+EXPORT_SYMBOL_GPL(nfs_commitdata_release);
 
-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static int nfs_commit_rpcsetup(struct list_head *head,
-		struct nfs_write_data *data,
-		int how)
+int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt,
+			const struct rpc_call_ops *call_ops,
+			int how)
 {
-	struct nfs_page *first = nfs_list_entry(head->next);
-	struct inode *inode = first->wb_context->path.dentry->d_inode;
-	int priority = flush_task_priority(how);
 	struct rpc_task *task;
+	int priority = flush_task_priority(how);
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
-		.rpc_cred = first->wb_context->cred,
+		.rpc_cred = data->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.task = &data->task,
-		.rpc_client = NFS_CLIENT(inode),
+		.rpc_client = clnt,
 		.rpc_message = &msg,
-		.callback_ops = &nfs_commit_ops,
+		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
+	/* Set up the initial task struct. */
+	NFS_PROTO(data->inode)->commit_setup(data, &msg);
+
+	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	if (how & FLUSH_SYNC)
+		rpc_wait_for_completion_task(task);
+	rpc_put_task(task);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_commit);
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+void nfs_init_commit(struct nfs_write_data *data,
+		     struct list_head *head,
+		     struct pnfs_layout_segment *lseg)
+{
+	struct nfs_page *first = nfs_list_entry(head->next);
+	struct inode *inode = first->wb_context->path.dentry->d_inode;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -1309,7 +1355,9 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 	list_splice_init(head, &data->pages);
 
 	data->inode = inode;
-	data->cred = msg.rpc_cred;
+	data->cred = first->wb_context->cred;
+	data->lseg = lseg; /* reference transferred */
+	data->mds_ops = &nfs_commit_ops;
 
 	data->args.fh = NFS_FH(data->inode);
 	/* Note: we always request a commit of the entire inode */
@@ -1320,20 +1368,25 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
 	nfs_fattr_init(&data->fattr);
+}
+EXPORT_SYMBOL_GPL(nfs_init_commit);
 
-	/* Set up the initial task struct. */
-	NFS_PROTO(inode)->commit_setup(data, &msg);
-
-	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+void nfs_retry_commit(struct list_head *page_list,
+		      struct pnfs_layout_segment *lseg)
+{
+	struct nfs_page *req;
 
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
-	if (how & FLUSH_SYNC)
-		rpc_wait_for_completion_task(task);
-	rpc_put_task(task);
-	return 0;
+	while (!list_empty(page_list)) {
+		req = nfs_list_entry(page_list->next);
+		nfs_list_remove_request(req);
+		nfs_mark_request_commit(req, lseg);
+		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+			     BDI_RECLAIMABLE);
+		nfs_clear_page_tag_locked(req);
+	}
 }
+EXPORT_SYMBOL_GPL(nfs_retry_commit);
 
 /*
  * Commit dirty pages
@@ -1342,7 +1395,6 @@ static int
 nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
 	struct nfs_write_data *data;
-	struct nfs_page *req;
 
 	data = nfs_commitdata_alloc();
 
@@ -1350,17 +1402,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 		goto out_bad;
 
 	/* Set up the argument struct */
-	return nfs_commit_rpcsetup(head, data, how);
+	nfs_init_commit(data, head, NULL);
+	return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how);
  out_bad:
-	while (!list_empty(head)) {
-		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_mark_request_commit(req);
-		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-				BDI_RECLAIMABLE);
-		nfs_clear_page_tag_locked(req);
-	}
+	nfs_retry_commit(head, NULL);
 	nfs_commit_clear_lock(NFS_I(inode));
 	return -ENOMEM;
 }
@@ -1380,10 +1425,9 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 		return;
 }
 
-static void nfs_commit_release(void *calldata)
+void nfs_commit_release_pages(struct nfs_write_data *data)
 {
-	struct nfs_write_data *data = calldata;
-	struct nfs_page *req;
+	struct nfs_page *req;
 	int status = data->task.tk_status;
 
 	while (!list_empty(&data->pages)) {
@@ -1417,6 +1461,14 @@ static void nfs_commit_release(void *calldata)
 	next:
 		nfs_clear_page_tag_locked(req);
 	}
+}
+EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
+
+static void nfs_commit_release(void *calldata)
+{
+	struct nfs_write_data *data = calldata;
+
+	nfs_commit_release_pages(data);
 	nfs_commit_clear_lock(NFS_I(data->inode));
 	nfs_commitdata_release(calldata);
 }
@@ -1433,23 +1485,30 @@ int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
 	int may_wait = how & FLUSH_SYNC;
-	int res = 0;
+	int res;
 
-	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
+	res = nfs_commit_set_lock(NFS_I(inode), may_wait);
+	if (res <= 0)
 		goto out_mark_dirty;
 	spin_lock(&inode->i_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&inode->i_lock);
 	if (res) {
-		int error = nfs_commit_list(inode, &head, how);
+		int error;
+
+		error = pnfs_commit_list(inode, &head, how);
+		if (error == PNFS_NOT_ATTEMPTED)
+			error = nfs_commit_list(inode, &head, how);
 		if (error < 0)
 			return error;
-		if (may_wait)
-			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
-					nfs_wait_bit_killable,
-					TASK_KILLABLE);
-		else
+		if (!may_wait)
 			goto out_mark_dirty;
+		error = wait_on_bit(&NFS_I(inode)->flags,
+				NFS_INO_COMMIT,
+				nfs_wait_bit_killable,
+				TASK_KILLABLE);
+		if (error < 0)
+			return error;
 	} else
 		nfs_commit_clear_lock(NFS_I(inode));
 	return res;
@@ -1503,7 +1562,22 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
 
 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return nfs_commit_unstable_pages(inode, wbc);
+	int ret;
+
+	ret = nfs_commit_unstable_pages(inode, wbc);
+	if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
+		int status;
+		bool sync = true;
+
+		if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
+		    wbc->for_background)
+			sync = false;
+
+		status = pnfs_layoutcommit_inode(inode, sync);
+		if (status < 0)
+			return status;
+	}
+	return ret;
 }
 
 /*
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 84c27d69d421..ec0f277be7f5 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -117,7 +117,6 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
 	 * invoked in contexts where a memory allocation failure is
 	 * fatal. Fortunately this fake ACL is small enough to
 	 * construct on the stack. */
-	memset(acl2, 0, sizeof(acl2));
 	posix_acl_init(acl2, 4);
 
 	/* Insert entries in canonical order: other orders seem
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4c29fcf557d1..07ea8d3e6ea2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -22,13 +22,14 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
+#include "../internal.h"
+
 /*
  * Recalculate the mask of events relevant to a given inode locked.
  */
@@ -237,15 +238,14 @@ out:
  * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
  * @list: list of inodes being unmounted (sb->s_inodes)
  *
- * Called with inode_lock held, protecting the unmounting super block's list
- * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
- * We temporarily drop inode_lock, however, and CAN block.
+ * Called during unmount with no locks held, so needs to be safe against
+ * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
  */
 void fsnotify_unmount_inodes(struct list_head *list)
 {
 	struct inode *inode, *next_i, *need_iput = NULL;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
 		struct inode *need_iput_tmp;
 
@@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		 * I_WILL_FREE, or I_NEW which is fine because by that point
 		 * the inode cannot have any associated watches.
 		 */
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 
 		/*
 		 * If i_count is zero, the inode cannot have any watches and
@@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		 * evict all inodes with zero i_count from icache which is
 		 * unnecessarily violent and may in fact be illegal to do.
 		 */
-		if (!atomic_read(&inode->i_count))
+		if (!atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 
 		need_iput_tmp = need_iput;
 		need_iput = NULL;
@@ -274,22 +279,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
 			__iget(inode);
 		else
 			need_iput_tmp = NULL;
+		spin_unlock(&inode->i_lock);
 
 		/* In case the dropping of a reference would nuke next_i. */
 		if ((&next_i->i_sb_list != list) &&
-		    atomic_read(&next_i->i_count) &&
-		    !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
-			__iget(next_i);
-			need_iput = next_i;
+		    atomic_read(&next_i->i_count)) {
+			spin_lock(&next_i->i_lock);
+			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+				__iget(next_i);
+				need_iput = next_i;
+			}
+			spin_unlock(&next_i->i_lock);
 		}
 
 		/*
-		 * We can safely drop inode_lock here because we hold
+		 * We can safely drop inode_sb_list_lock here because we hold
 		 * references on both inode and next_i. Also no new inodes
-		 * will be added since the umount has begun. Finally,
-		 * iprune_mutex keeps shrink_icache_memory() away.
+		 * will be added since the umount has begun.
 		 */
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_sb_list_lock);
 
 		if (need_iput_tmp)
 			iput(need_iput_tmp);
@@ -301,7 +309,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 
 		iput(inode);
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 }
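
The conversion from the global inode_lock to inode_sb_list_lock plus a per-inode i_lock follows one pattern throughout: check the inode's state under its own lock, pin it with a reference, then drop both locks before doing anything that can block. A simplified userspace analogue using pthreads (illustrative types, not the VFS API):

    #include <pthread.h>
    #include <stdio.h>

    struct item {
        pthread_mutex_t lock;   /* analogue of inode->i_lock */
        int freeing;            /* analogue of I_FREEING|I_WILL_FREE|I_NEW */
        int refcount;
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static void visit(struct item *it)
    {
        pthread_mutex_lock(&list_lock);
        pthread_mutex_lock(&it->lock);
        if (it->freeing) {                 /* skip items being torn down */
            pthread_mutex_unlock(&it->lock);
            pthread_mutex_unlock(&list_lock);
            return;
        }
        it->refcount++;                    /* analogue of __iget() */
        pthread_mutex_unlock(&it->lock);
        pthread_mutex_unlock(&list_lock);  /* safe: the item is pinned */

        printf("blocking work, ref=%d\n", it->refcount);

        it->refcount--;                    /* analogue of iput() */
    }

    int main(void)
    {
        struct item it = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
        visit(&it);
        return 0;
    }
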
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 325185e514bb..50c00856f730 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/srcu.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 85eebff6d0d7..e86577d6c5c3 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -23,7 +23,6 @@
 #include <linux/mount.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index a627ed82c0a3..0b56c6b7ec01 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -54,7 +54,7 @@
  *
  * Return 1 if the attributes match and 0 if not.
  *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
  * allowed to sleep.
  */
 int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
  *
  * Return 0 on success and -errno on error.
  *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
  * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
  */
 static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7c708a418acc..2e7addfd9803 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -182,7 +182,8 @@ static void m_stop(struct seq_file *m, void *v)
 	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
 
-	vma_stop(priv, vma);
+	if (!IS_ERR(vma))
+		vma_stop(priv, vma);
 	if (priv->task)
 		put_task_struct(priv->task);
 }
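
The m_stop() fix guards against m_start() having returned an error pointer rather than a real VMA. A userspace sketch of the ERR_PTR/IS_ERR convention behind it, where small negative errno values are encoded in the top of the pointer range:

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long err) { return (void *)err; }
    static inline int IS_ERR(const void *p)
    {
        return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
    }

    int main(void)
    {
        void *v = ERR_PTR(-12);            /* e.g. -ENOMEM from a failed start */

        if (!IS_ERR(v))
            puts("stop: release the object");
        else
            printf("stop: skip release, error %ld\n", (long)(intptr_t)v);
        return 0;
    }
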
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a2a622e079f0..fcc8ae75d874 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -76,7 +76,7 @@
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/quotaops.h>
-#include <linux/writeback.h> /* for inode_lock, oddly enough.. */
+#include "../internal.h" /* ugh */
 
 #include <asm/uaccess.h>
 
@@ -900,33 +900,38 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	int reserved = 0;
 #endif
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    !atomic_read(&inode->i_writecount) ||
+		    !dqinit_needed(inode, type)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 #ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
 #endif
-		if (!atomic_read(&inode->i_writecount))
-			continue;
-		if (!dqinit_needed(inode, type))
-			continue;
-
 		__iget(inode);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
 
 		iput(old_inode);
 		__dquot_initialize(inode, type);
-		/* We hold a reference to 'inode' so it couldn't have been
-		 * removed from s_inodes list while we dropped the inode_lock.
-		 * We cannot iput the inode now as we can be holding the last
-		 * reference and we cannot iput it under inode_lock. So we
-		 * keep the reference and iput it later. */
+
+		/*
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the
+		 * inode_sb_list_lock. We cannot iput the inode now as we can be
+		 * holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * later.
+		 */
 		old_inode = inode;
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(old_inode);
 
 #ifdef CONFIG_QUOTA_DEBUG
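
The comment above describes the deferred-iput trick: the pinned previous inode is released only after the list lock has been dropped, because the final iput() can block. A userspace sketch of that walk, with hypothetical types and a plain mutex standing in for the spinlock:

    #include <pthread.h>
    #include <stdio.h>

    struct obj { int refcount; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    /* The final put may block, so it must never run under list_lock. */
    static void put_obj(struct obj *o)
    {
        if (o && --o->refcount == 0)
            printf("released object (may block)\n");
    }

    static void walk(struct obj *objs, int n)
    {
        struct obj *old = NULL;
        int i;

        pthread_mutex_lock(&list_lock);
        for (i = 0; i < n; i++) {
            struct obj *o = &objs[i];

            o->refcount++;              /* pin, analogue of __iget() */
            pthread_mutex_unlock(&list_lock);

            put_obj(old);               /* previous pin dropped outside lock */
            /* ... blocking per-object work would go here ... */
            old = o;                    /* defer this object's put */

            pthread_mutex_lock(&list_lock);
        }
        pthread_mutex_unlock(&list_lock);
        put_obj(old);                   /* last deferred put */
    }

    int main(void)
    {
        struct obj objs[3] = { {0}, {0}, {0} };
        walk(objs, 3);
        return 0;
    }
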
@@ -1007,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 	struct inode *inode;
 	int reserved = 0;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		/*
 		 * We have to scan also I_NEW inodes because they can already
@@ -1021,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 			remove_inode_dquot_ref(inode, type, tofree_head);
 		}
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 #ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened after quota