author		Linus Torvalds <torvalds@linux-foundation.org>	2011-03-24 22:01:30 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-24 22:01:30 -0400
commit		d39dd11c3e6a7af5c20bfac40594db36cf270f42 (patch)
tree		6384e07fa2f347b286cde9754c4507b5a738ab47
parent		30f5b28e7f937608e0407edaa459cc8161de81d9 (diff)
parent		0b2d0724e26a335cd326eb7ad552c109116a8795 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6:
  fs: simplify iget & friends
  fs: pull inode->i_lock up out of writeback_single_inode
  fs: rename inode_lock to inode_hash_lock
  fs: move i_wb_list out from under inode_lock
  fs: move i_sb_list out from under inode_lock
  fs: remove inode_lock from iput_final and prune_icache
  fs: Lock the inode LRU list separately
  fs: factor inode disposal
  fs: protect inode->i_state with inode->i_lock
  autofs4: Do not potentially dereference NULL pointer returned by fget() in autofs_dev_ioctl_setpipefd()
  autofs4 - remove autofs4_lock
  autofs4 - fix d_manage() return on rcu-walk
  autofs4 - fix autofs4_expire_indirect() traversal
  autofs4 - fix dentry leak in autofs4_expire_direct()
  autofs4 - reinstate last used update on access
  vfs - check non-mountpoint dentry might block in __follow_mount_rcu()
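The unifying change on the VFS side of this pull is the breakup of the global
inode_lock into a per-inode lock plus dedicated per-list locks. A minimal
before/after sketch of the pattern (illustrative only, not code from the patch):

	/* Before: one global spinlock serialised every i_state update. */
	static void set_dirty_old(struct inode *inode)
	{
		spin_lock(&inode_lock);		/* global, heavily contended */
		inode->i_state |= I_DIRTY_SYNC;
		spin_unlock(&inode_lock);
	}

	/* After: i_state is guarded by the per-inode lock; the hash, sb,
	 * writeback and LRU lists each get a dedicated lock of their own. */
	static void set_dirty_new(struct inode *inode)
	{
		spin_lock(&inode->i_lock);
		inode->i_state |= I_DIRTY_SYNC;
		spin_unlock(&inode->i_lock);
	}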
-rw-r--r--  Documentation/filesystems/Locking   |   2
-rw-r--r--  Documentation/filesystems/porting   |  16
-rw-r--r--  Documentation/filesystems/vfs.txt   |   2
-rw-r--r--  fs/autofs4/autofs_i.h               |   2
-rw-r--r--  fs/autofs4/dev-ioctl.c              |   4
-rw-r--r--  fs/autofs4/expire.c                 |  84
-rw-r--r--  fs/autofs4/root.c                   |  62
-rw-r--r--  fs/autofs4/waitq.c                  |   6
-rw-r--r--  fs/block_dev.c                      |   6
-rw-r--r--  fs/buffer.c                         |   2
-rw-r--r--  fs/drop_caches.c                    |  18
-rw-r--r--  fs/fs-writeback.c                   | 141
-rw-r--r--  fs/inode.c                          | 656
-rw-r--r--  fs/internal.h                       |   7
-rw-r--r--  fs/logfs/inode.c                    |   2
-rw-r--r--  fs/namei.c                          |  23
-rw-r--r--  fs/notify/inode_mark.c              |  42
-rw-r--r--  fs/notify/mark.c                    |   1
-rw-r--r--  fs/notify/vfsmount_mark.c           |   1
-rw-r--r--  fs/ntfs/inode.c                     |   4
-rw-r--r--  fs/quota/dquot.c                    |  41
-rw-r--r--  include/linux/fs.h                  |   2
-rw-r--r--  include/linux/quotaops.h            |   2
-rw-r--r--  include/linux/writeback.h           |   2
-rw-r--r--  mm/backing-dev.c                    |   8
-rw-r--r--  mm/filemap.c                        |  10
-rw-r--r--  mm/rmap.c                           |   5
27 files changed, 615 insertions(+), 536 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 2e994efe12cb..61b31acb9176 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -128,7 +128,7 @@ alloc_inode:
 destroy_inode:
 dirty_inode: (must not sleep)
 write_inode:
-drop_inode: !!!inode_lock!!!
+drop_inode: !!!inode->i_lock!!!
 evict_inode:
 put_super: write
 write_super: read
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 0c986c9e8519..6e29954851a2 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -298,11 +298,14 @@ be used instead. It gets called whenever the inode is evicted, whether it has
 remaining links or not. Caller does *not* evict the pagecache or inode-associated
 metadata buffers; getting rid of those is responsibility of method, as it had
 been for ->delete_inode().
-	->drop_inode() returns int now; it's called on final iput() with inode_lock
-held and it returns true if filesystems wants the inode to be dropped. As before,
-generic_drop_inode() is still the default and it's been updated appropriately.
-generic_delete_inode() is also alive and it consists simply of return 1. Note that
-all actual eviction work is done by caller after ->drop_inode() returns.
+
+	->drop_inode() returns int now; it's called on final iput() with
+inode->i_lock held and it returns true if filesystems wants the inode to be
+dropped. As before, generic_drop_inode() is still the default and it's been
+updated appropriately. generic_delete_inode() is also alive and it consists
+simply of return 1. Note that all actual eviction work is done by caller after
+->drop_inode() returns.
+
 	clear_inode() is gone; use end_writeback() instead. As before, it must
 be called exactly once on each call of ->evict_inode() (as it used to be for
 each call of ->delete_inode()). Unlike before, if you are using inode-associated
@@ -397,6 +400,9 @@ a file off.
 
 --
 [mandatory]
+
+--
+[mandatory]
 	->get_sb() is gone. Switch to use of ->mount(). Typically it's just
 a matter of switching from calling get_sb_... to mount_... and changing the
 function type. If you were doing it manually, just switch from setting ->mnt_root
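The porting note above amounts to a simple contract for filesystem authors:
->drop_inode() now runs under inode->i_lock, must not sleep, and only answers
whether the inode should be dropped. A sketch of a conforming method
(hypothetical example, equivalent in effect to generic_delete_inode()):

	static int examplefs_drop_inode(struct inode *inode)
	{
		/* Called on final iput() with inode->i_lock held; the
		 * caller performs all of the actual eviction work. */
		return 1;	/* never cache this inode */
	}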
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 306f0ae8df09..80815ed654cb 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -254,7 +254,7 @@ or bottom half).
 	should be synchronous or not, not all filesystems check this flag.
 
   drop_inode: called when the last access to the inode is dropped,
-	with the inode_lock spinlock held.
+	with the inode->i_lock spinlock held.
 
 	This method should be either NULL (normal UNIX filesystem
 	semantics) or "generic_delete_inode" (for filesystems that do not
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 54f923792728..475f9c597cb7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -61,8 +61,6 @@ do { \
 		current->pid, __func__, ##args);	\
 } while (0)
 
-extern spinlock_t autofs4_lock;
-
 /* Unified info structure. This is pointed to by both the dentry and
    inode structures. Each file in the filesystem has an instance of this
    structure. It holds a reference to the dentry, so dentries are never
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1442da4860e5..509fe1eb66ae 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 		return -EBUSY;
 	} else {
 		struct file *pipe = fget(pipefd);
+		if (!pipe) {
+			err = -EBADF;
+			goto out;
+		}
 		if (!pipe->f_op || !pipe->f_op->write) {
 			err = -EPIPE;
 			fput(pipe);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index f43100b9662b..450f529a4eae 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -87,18 +87,70 @@ done:
 }
 
 /*
+ * Calculate and dget next entry in the subdirs list under root.
+ */
+static struct dentry *get_next_positive_subdir(struct dentry *prev,
+						struct dentry *root)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+	struct list_head *next;
+	struct dentry *p, *q;
+
+	spin_lock(&sbi->lookup_lock);
+
+	if (prev == NULL) {
+		spin_lock(&root->d_lock);
+		prev = dget_dlock(root);
+		next = prev->d_subdirs.next;
+		p = prev;
+		goto start;
+	}
+
+	p = prev;
+	spin_lock(&p->d_lock);
+again:
+	next = p->d_u.d_child.next;
+start:
+	if (next == &root->d_subdirs) {
+		spin_unlock(&p->d_lock);
+		spin_unlock(&sbi->lookup_lock);
+		dput(prev);
+		return NULL;
+	}
+
+	q = list_entry(next, struct dentry, d_u.d_child);
+
+	spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Negative dentry - try next */
+	if (!simple_positive(q)) {
+		spin_unlock(&p->d_lock);
+		p = q;
+		goto again;
+	}
+	dget_dlock(q);
+	spin_unlock(&q->d_lock);
+	spin_unlock(&p->d_lock);
+	spin_unlock(&sbi->lookup_lock);
+
+	dput(prev);
+
+	return q;
+}
+
+/*
  * Calculate and dget next entry in top down tree traversal.
  */
 static struct dentry *get_next_positive_dentry(struct dentry *prev,
 					       struct dentry *root)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
 	struct dentry *p, *ret;
 
 	if (prev == NULL)
 		return dget(root);
 
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 relock:
 	p = prev;
 	spin_lock(&p->d_lock);
@@ -110,7 +162,7 @@ again:
 
 	if (p == root) {
 		spin_unlock(&p->d_lock);
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->lookup_lock);
 		dput(prev);
 		return NULL;
 	}
@@ -140,7 +192,7 @@ again:
 	dget_dlock(ret);
 	spin_unlock(&ret->d_lock);
 	spin_unlock(&p->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	dput(prev);
 
@@ -290,11 +342,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(root);
 	/* No point expiring a pending mount */
-	if (ino->flags & AUTOFS_INF_PENDING) {
-		spin_unlock(&sbi->fs_lock);
-		return NULL;
-	}
-	managed_dentry_set_transit(root);
+	if (ino->flags & AUTOFS_INF_PENDING)
+		goto out;
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
 		ino->flags |= AUTOFS_INF_EXPIRING;
@@ -302,7 +351,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
-	managed_dentry_clear_transit(root);
+out:
 	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
@@ -336,13 +385,12 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	timeout = sbi->exp_timeout;
 
 	dentry = NULL;
-	while ((dentry = get_next_positive_dentry(dentry, root))) {
+	while ((dentry = get_next_positive_subdir(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
 		/* No point expiring a pending mount */
 		if (ino->flags & AUTOFS_INF_PENDING)
-			goto cont;
-		managed_dentry_set_transit(dentry);
+			goto next;
 
 		/*
 		 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -402,8 +450,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		}
 	}
 next:
-		managed_dentry_clear_transit(dentry);
-cont:
 		spin_unlock(&sbi->fs_lock);
 	}
 	return NULL;
@@ -415,13 +461,13 @@ found:
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 	spin_lock(&expired->d_parent->d_lock);
 	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
 	spin_unlock(&expired->d_lock);
 	spin_unlock(&expired->d_parent->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 	return expired;
 }
 
@@ -484,8 +530,6 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	if (!d_unhashed(dentry))
-		managed_dentry_clear_transit(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -513,9 +557,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	spin_lock(&sbi->fs_lock);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
 	spin_lock(&dentry->d_lock);
-	if (ret)
-		__managed_dentry_clear_transit(dentry);
-	else {
+	if (!ret) {
 		if ((IS_ROOT(dentry) ||
 		    (autofs_type_indirect(sbi->type) &&
 		     IS_ROOT(dentry->d_parent))) &&
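The new get_next_positive_subdir() used by autofs4_expire_indirect() above is a
cursor-style iterator: it drops the reference on the previous entry and returns
the next positive child of the mount root with a reference already held, doing
all list walking under sbi->lookup_lock. Sketch of the calling pattern
(should_expire() is a hypothetical predicate, not from the patch):

	static struct dentry *find_expire_candidate(struct dentry *root)
	{
		struct dentry *d = NULL;

		while ((d = get_next_positive_subdir(d, root))) {
			if (should_expire(d))	/* hypothetical check */
				return d;	/* reference passes to caller */
		}
		return NULL;	/* iterator has dropped all references */
	}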
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e6f84d26f4cf..96804a17bbd0 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,8 +23,6 @@
 
 #include "autofs_i.h"
 
-DEFINE_SPINLOCK(autofs4_lock);
-
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * autofs file system so just let the libfs routines handle
 	 * it.
 	 */
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->lookup_lock);
 		return -ENOENT;
 	}
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 out:
 	return dcache_dir_open(inode, file);
@@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
 	list_for_each(p, head) {
@@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 			dget_dlock(active);
 			spin_unlock(&active->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&autofs4_lock);
 			return active;
 		}
 next:
 		spin_unlock(&active->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->expiring_list;
 	list_for_each(p, head) {
@@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 			dget_dlock(expiring);
 			spin_unlock(&expiring->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&autofs4_lock);
 			return expiring;
 		}
 next:
 		spin_unlock(&expiring->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -275,17 +267,16 @@ static int autofs4_mount_wait(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	int status;
+	int status = 0;
 
 	if (ino->flags & AUTOFS_INF_PENDING) {
 		DPRINTK("waiting for mount name=%.*s",
 			dentry->d_name.len, dentry->d_name.name);
 		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 		DPRINTK("mount wait done status=%d", status);
-		ino->last_used = jiffies;
-		return status;
 	}
-	return 0;
+	ino->last_used = jiffies;
+	return status;
 }
 
 static int do_expire_wait(struct dentry *dentry)
@@ -319,9 +310,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
 	 */
 	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
 		struct dentry *parent = dentry->d_parent;
+		struct autofs_info *ino;
 		struct dentry *new = d_lookup(parent, &dentry->d_name);
 		if (!new)
 			return NULL;
+		ino = autofs4_dentry_ino(new);
+		ino->last_used = jiffies;
 		dput(path->dentry);
 		path->dentry = new;
 	}
@@ -338,18 +332,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	DPRINTK("dentry=%p %.*s",
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
-	/*
-	 * Someone may have manually umounted this or it was a submount
-	 * that has gone away.
-	 */
-	spin_lock(&dentry->d_lock);
-	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-		if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
-		    (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-			__managed_dentry_set_transit(path->dentry);
-	}
-	spin_unlock(&dentry->d_lock);
-
 	/* The daemon never triggers a mount. */
 	if (autofs4_oz_mode(sbi))
 		return NULL;
@@ -418,18 +400,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 done:
 	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
 		/*
-		 * Any needed mounting has been completed and the path updated
-		 * so turn this into a normal dentry so we don't continually
-		 * call ->d_automount() and ->d_manage().
-		 */
-		spin_lock(&dentry->d_lock);
-		__managed_dentry_clear_transit(dentry);
-		/*
+		 * Any needed mounting has been completed and the path
+		 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
+		 * call ->d_automount() on rootless multi-mounts since
+		 * it can lead to an incorrect ELOOP error return.
+		 *
 		 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
 		 * symlinks as in all other cases the dentry will be covered by
 		 * an actual mount so ->d_automount() won't be called during
 		 * the follow.
 		 */
+		spin_lock(&dentry->d_lock);
 		if ((!d_mountpoint(dentry) &&
 		    !list_empty(&dentry->d_subdirs)) ||
 		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
@@ -455,6 +436,8 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 
 	/* The daemon never waits. */
 	if (autofs4_oz_mode(sbi)) {
+		if (rcu_walk)
+			return 0;
 		if (!d_mountpoint(dentry))
 			return -EISDIR;
 		return 0;
@@ -612,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	dir->i_mtime = CURRENT_TIME;
 
-	spin_lock(&autofs4_lock);
-	autofs4_add_expiring(dentry);
+	spin_lock(&sbi->lookup_lock);
+	__autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	return 0;
 }
@@ -686,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi))
 		return -EACCES;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	if (!list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&autofs4_lock);
 		return -ENOTEMPTY;
 	}
 	__autofs4_add_expiring(dentry);
-	spin_unlock(&sbi->lookup_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	if (sbi->version < 5)
 		autofs_clear_leaf_automount_flags(dentry);
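The two-line addition to autofs4_d_manage() above follows the general rcu-walk
convention for ->d_manage(): in rcu-walk mode the method may not block, so it
either answers immediately or returns -ECHILD to force a fall-back to ref-walk.
Sketched in the abstract (illustrative only; would_block() is hypothetical):

	static int example_d_manage(struct dentry *dentry, bool rcu_walk)
	{
		if (rcu_walk) {
			if (would_block(dentry))	/* hypothetical */
				return -ECHILD;		/* retry in ref-walk */
			return 0;			/* safe under rcu */
		}
		/* ref-walk path: taking locks and sleeping is allowed */
		return 0;
	}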
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 56010056b2e6..25435987d6ae 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -197,12 +197,12 @@ rename_retry:
 
 	seq = read_seqbegin(&rename_lock);
 	rcu_read_lock();
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->fs_lock);
 	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
 		len += tmp->d_name.len + 1;
 
 	if (!len || --len > NAME_MAX) {
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->fs_lock);
 		rcu_read_unlock();
 		if (read_seqretry(&rename_lock, seq))
 			goto rename_retry;
@@ -218,7 +218,7 @@ rename_retry:
 		p -= tmp->d_name.len;
 		strncpy(p, tmp->d_name.name, tmp->d_name.len);
 	}
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->fs_lock);
 	rcu_read_unlock();
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7d02afb2b7f4..c1511c674f53 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,11 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_wb_list_lock);
 }
 
 static sector_t max_block(struct block_device *bdev)
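Note the nesting in the new bdev_inode_switch_bdi(): the writeback list lock is
taken before the per-inode lock, matching the ordering documented in fs/inode.c
later in this merge. The same shape as a stand-alone helper (hypothetical name,
illustrative sketch only):

	static void move_to_dirty_list(struct inode *inode,
				       struct bdi_writeback *wb)
	{
		spin_lock(&inode_wb_list_lock);	/* list lock first */
		spin_lock(&inode->i_lock);	/* then the inode lock */
		if (inode->i_state & I_DIRTY)
			list_move(&inode->i_wb_list, &wb->b_dirty);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_wb_list_lock);
	}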
diff --git a/fs/buffer.c b/fs/buffer.c
index 2e6b1a387b7e..a08bb8e61c6f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1138,7 +1138,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * inode list.
  *
  * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and the global inode_lock.
+ * mapping->tree_lock and mapping->host->i_lock.
  */
 void mark_buffer_dirty(struct buffer_head *bh)
 {
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 816f88e6b9ce..98b77c89494c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,6 +8,7 @@
 #include <linux/writeback.h>
 #include <linux/sysctl.h>
 #include <linux/gfp.h>
+#include "internal.h"
 
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
@@ -16,20 +17,23 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
 	struct inode *inode, *toput_inode = NULL;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
-			continue;
-		if (inode->i_mapping->nrpages == 0)
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (inode->i_mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		__iget(inode);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(toput_inode);
 }
 
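The rewritten drop_pagecache_sb() loop is one instance of the s_inodes walking
pattern this series uses throughout: validate i_state and take a reference under
i_lock, drop every lock before doing sleepable work, defer the iput(), then
re-take the list lock. In the abstract (hypothetical helper; do_work() stands in
for the real payload):

	static void for_each_live_inode(struct super_block *sb)
	{
		struct inode *inode, *toput = NULL;

		spin_lock(&inode_sb_list_lock);
		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
			spin_lock(&inode->i_lock);
			if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
				spin_unlock(&inode->i_lock);
				continue;
			}
			__iget(inode);		/* pin while unlocked */
			spin_unlock(&inode->i_lock);
			spin_unlock(&inode_sb_list_lock);

			do_work(inode);		/* hypothetical, may sleep */

			iput(toput);		/* deferred, not under locks */
			toput = inode;
			spin_lock(&inode_sb_list_lock);
		}
		spin_unlock(&inode_sb_list_lock);
		iput(toput);
	}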
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c6e4956786..b5ed541fb137 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -176,6 +176,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 }
 
 /*
+ * Remove the inode from the writeback list it is on.
+ */
+void inode_wb_list_del(struct inode *inode)
+{
+	spin_lock(&inode_wb_list_lock);
+	list_del_init(&inode->i_wb_list);
+	spin_unlock(&inode_wb_list_lock);
+}
+
+
+/*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
  *
@@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
+	assert_spin_locked(&inode_wb_list_lock);
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
@@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
+	assert_spin_locked(&inode_wb_list_lock);
 	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
 {
 	/*
-	 * Prevent speculative execution through spin_unlock(&inode_lock);
+	 * Prevent speculative execution through
+	 * spin_unlock(&inode_wb_list_lock);
 	 */
+
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_SYNC);
 }
@@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
+	assert_spin_locked(&inode_wb_list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
@@ -306,25 +322,25 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wait_queue_head_t *wqh;
 
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	while (inode->i_state & I_SYNC) {
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_wb_list_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
+		spin_lock(&inode->i_lock);
 	}
 }
 
 /*
- * Write out an inode's dirty pages. Called under inode_lock. Either the
- * caller has ref on the inode (either via __iget or via syscall against an fd)
- * or the inode has I_WILL_FREE set (via generic_forget_inode)
+ * Write out an inode's dirty pages. Called under inode_wb_list_lock and
+ * inode->i_lock. Either the caller has an active reference on the inode or
+ * the inode has I_WILL_FREE set.
  *
  * If `wait' is set, wait on the writeout.
  *
  * The whole writeout design is quite complex and fragile. We want to avoid
  * starvation of particular inodes when others are being redirtied, prevent
  * livelocks, etc.
- *
- * Called under inode_lock.
  */
 static int
 writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -333,6 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	unsigned dirty;
 	int ret;
 
+	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&inode->i_lock);
+
 	if (!atomic_read(&inode->i_count))
 		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
 	else
@@ -363,7 +382,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
 	inode->i_state &= ~I_DIRTY_PAGES;
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_wb_list_lock);
 
 	ret = do_writepages(mapping, wbc);
 
@@ -383,10 +403,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * due to delalloc, clear dirty metadata flags right before
 	 * write_inode()
 	 */
-	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		int err = write_inode(inode, wbc);
@@ -394,7 +414,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			ret = err;
 	}
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -506,7 +527,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		 * kind does not need peridic writeout yet, and for the latter
 		 * kind writeout is handled by the freer.
 		 */
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			requeue_io(inode);
 			continue;
 		}
@@ -515,10 +538,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
 		 */
-		if (inode_dirtied_after(inode, wbc->wb_start))
+		if (inode_dirtied_after(inode, wbc->wb_start)) {
+			spin_unlock(&inode->i_lock);
 			return 1;
+		}
 
 		__iget(inode);
+
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
 		if (wbc->pages_skipped != pages_skipped) {
@@ -528,10 +554,11 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			 */
 			redirty_tail(inode);
 		}
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_wb_list_lock);
 		iput(inode);
 		cond_resched();
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		if (wbc->nr_to_write <= 0) {
 			wbc->more_io = 1;
 			return 1;
@@ -550,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 
 	if (!wbc->wb_start)
 		wbc->wb_start = jiffies; /* livelock avoidance */
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 
@@ -568,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 		if (ret)
 			break;
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 	/* Leave any unwritten inodes on b_io */
 }
 
@@ -577,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
 {
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 	writeback_sb_inodes(sb, wb, wbc, true);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 }
 
 /*
@@ -720,13 +747,15 @@ static long wb_writeback(struct bdi_writeback *wb,
 			 * become available for writeback. Otherwise
 			 * we'll just busyloop.
 			 */
-			spin_lock(&inode_lock);
+			spin_lock(&inode_wb_list_lock);
 			if (!list_empty(&wb->b_more_io))  {
 				inode = wb_inode(wb->b_more_io.prev);
 				trace_wbc_writeback_wait(&wbc, wb->bdi);
+				spin_lock(&inode->i_lock);
 				inode_wait_for_writeback(inode);
+				spin_unlock(&inode->i_lock);
 			}
-			spin_unlock(&inode_lock);
+			spin_unlock(&inode_wb_list_lock);
 		}
 
 	return wrote;
@@ -992,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = NULL;
-	bool wakeup_bdi = false;
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1016,7 +1044,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if (unlikely(block_dump))
 		block_dump___mark_inode_dirty(inode);
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
@@ -1028,7 +1056,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * superblock list, based upon its state.
 		 */
 		if (inode->i_state & I_SYNC)
-			goto out;
+			goto out_unlock_inode;
 
 		/*
 		 * Only add valid (hashed) inodes to the superblock's
@@ -1036,16 +1064,17 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1036 */ 1064 */
1037 if (!S_ISBLK(inode->i_mode)) { 1065 if (!S_ISBLK(inode->i_mode)) {
1038 if (inode_unhashed(inode)) 1066 if (inode_unhashed(inode))
1039 goto out; 1067 goto out_unlock_inode;
1040 } 1068 }
1041 if (inode->i_state & I_FREEING) 1069 if (inode->i_state & I_FREEING)
1042 goto out; 1070 goto out_unlock_inode;
1043 1071
1044 /* 1072 /*
1045 * If the inode was already on b_dirty/b_io/b_more_io, don't 1073 * If the inode was already on b_dirty/b_io/b_more_io, don't
1046 * reposition it (that would break b_dirty time-ordering). 1074 * reposition it (that would break b_dirty time-ordering).
1047 */ 1075 */
1048 if (!was_dirty) { 1076 if (!was_dirty) {
1077 bool wakeup_bdi = false;
1049 bdi = inode_to_bdi(inode); 1078 bdi = inode_to_bdi(inode);
1050 1079
1051 if (bdi_cap_writeback_dirty(bdi)) { 1080 if (bdi_cap_writeback_dirty(bdi)) {
@@ -1062,15 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1062 wakeup_bdi = true; 1091 wakeup_bdi = true;
1063 } 1092 }
1064 1093
1094 spin_unlock(&inode->i_lock);
1095 spin_lock(&inode_wb_list_lock);
1065 inode->dirtied_when = jiffies; 1096 inode->dirtied_when = jiffies;
1066 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1097 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1098 spin_unlock(&inode_wb_list_lock);
1099
1100 if (wakeup_bdi)
1101 bdi_wakeup_thread_delayed(bdi);
1102 return;
1067 } 1103 }
1068 } 1104 }
1069out: 1105out_unlock_inode:
1070 spin_unlock(&inode_lock); 1106 spin_unlock(&inode->i_lock);
1071 1107
1072 if (wakeup_bdi)
1073 bdi_wakeup_thread_delayed(bdi);
1074} 1108}
1075EXPORT_SYMBOL(__mark_inode_dirty); 1109EXPORT_SYMBOL(__mark_inode_dirty);
1076 1110
@@ -1101,7 +1135,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	 */
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 
 	/*
 	 * Data integrity sync. Must wait for all pages under writeback,
@@ -1111,22 +1145,25 @@ static void wait_sb_inodes(struct super_block *sb)
 	 * we still have to wait for that writeout.
 	 */
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		struct address_space *mapping;
+		struct address_space *mapping = inode->i_mapping;
 
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
-			continue;
-		mapping = inode->i_mapping;
-		if (mapping->nrpages == 0)
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		__iget(inode);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
+
 		/*
-		 * We hold a reference to 'inode' so it couldn't have
-		 * been removed from s_inodes list while we dropped the
-		 * inode_lock. We cannot iput the inode now as we can
-		 * be holding the last reference and we cannot iput it
-		 * under inode_lock. So we keep the reference and iput
-		 * it later.
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the
+		 * inode_sb_list_lock. We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * later.
 		 */
 		iput(old_inode);
 		old_inode = inode;
@@ -1135,9 +1172,9 @@ static void wait_sb_inodes(struct super_block *sb)
 
 		cond_resched();
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(old_inode);
 }
 
@@ -1271,9 +1308,11 @@ int write_inode_now(struct inode *inode, int sync)
 	wbc.nr_to_write = 0;
 
 	might_sleep();
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	ret = writeback_single_inode(inode, &wbc);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_wb_list_lock);
 	if (sync)
 		inode_sync_wait(inode);
 	return ret;
@@ -1295,9 +1334,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	ret = writeback_single_inode(inode, wbc);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_wb_list_lock);
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
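After this file's changes, writeback_single_inode() is entered with both
inode_wb_list_lock and inode->i_lock held, and it drops and reacquires them
internally around the page I/O. The caller contract, sketched (illustrative;
this is essentially what write_inode_now() and sync_inode() now do):

	static int sync_one_inode(struct inode *inode,
				  struct writeback_control *wbc)
	{
		int ret;

		spin_lock(&inode_wb_list_lock);
		spin_lock(&inode->i_lock);
		ret = writeback_single_inode(inode, wbc);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_wb_list_lock);
		return ret;
	}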
diff --git a/fs/inode.c b/fs/inode.c
index 0b3da4a77704..05a1f75ae791 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -26,6 +26,38 @@
 #include <linux/posix_acl.h>
 #include <linux/ima.h>
 #include <linux/cred.h>
+#include "internal.h"
+
+/*
+ * inode locking rules.
+ *
+ * inode->i_lock protects:
+ *   inode->i_state, inode->i_hash, __iget()
+ * inode_lru_lock protects:
+ *   inode_lru, inode->i_lru
+ * inode_sb_list_lock protects:
+ *   sb->s_inodes, inode->i_sb_list
+ * inode_wb_list_lock protects:
+ *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ * inode_hash_lock protects:
+ *   inode_hashtable, inode->i_hash
+ *
+ * Lock ordering:
+ *
+ * inode_sb_list_lock
+ *   inode->i_lock
+ *     inode_lru_lock
+ *
+ * inode_wb_list_lock
+ *   inode->i_lock
+ *
+ * inode_hash_lock
+ *   inode_sb_list_lock
+ *   inode->i_lock
+ *
+ * iunique_lock
+ *   inode_hash_lock
+ */
 
 /*
  * This is needed for the following functions:
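The ordering table added above reads top-down: a lock may be taken while holding
the ones listed above it, never the reverse. So this nesting is legal under the
documented rules (illustrative fragment only, not code from the patch):

	static void lock_order_example(struct inode *inode)
	{
		spin_lock(&inode_sb_list_lock);	/* outermost */
		spin_lock(&inode->i_lock);
		spin_lock(&inode_lru_lock);	/* innermost */
		/* ... */
		spin_unlock(&inode_lru_lock);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_sb_list_lock);
	}

prune_icache(), further down in this diff, walks the LRU list first and thus
meets the locks in the wrong order; that is why it uses spin_trylock() on
inode->i_lock instead of blocking.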
@@ -60,6 +92,8 @@
 
 static unsigned int i_hash_mask __read_mostly;
 static unsigned int i_hash_shift __read_mostly;
+static struct hlist_head *inode_hashtable __read_mostly;
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
 /*
  * Each inode can be on two separate lists. One is
@@ -74,15 +108,10 @@ static unsigned int i_hash_shift __read_mostly;
  */
 
 static LIST_HEAD(inode_lru);
-static struct hlist_head *inode_hashtable __read_mostly;
+static DEFINE_SPINLOCK(inode_lru_lock);
 
-/*
- * A simple spinlock to protect the list manipulations.
- *
- * NOTE! You also have to own the lock if you change
- * the i_state of an inode while it is in use..
- */
-DEFINE_SPINLOCK(inode_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
 
 /*
  * iprune_sem provides exclusion between the icache shrinking and the
@@ -137,15 +166,6 @@ int proc_nr_inodes(ctl_table *table, int write,
 }
 #endif
 
-static void wake_up_inode(struct inode *inode)
-{
-	/*
-	 * Prevent speculative execution through spin_unlock(&inode_lock);
-	 */
-	smp_mb();
-	wake_up_bit(&inode->i_state, __I_NEW);
-}
-
 /**
  * inode_init_always - perform inode structure intialisation
  * @sb: superblock inode belongs to
@@ -336,7 +356,7 @@ static void init_once(void *foo)
 }
 
 /*
- * inode_lock must be held
+ * inode->i_lock must be held
  */
 void __iget(struct inode *inode)
 {
@@ -354,23 +374,22 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
+	spin_lock(&inode_lru_lock);
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode_lru);
 		inodes_stat.nr_unused++;
 	}
+	spin_unlock(&inode_lru_lock);
 }
 
 static void inode_lru_list_del(struct inode *inode)
 {
+	spin_lock(&inode_lru_lock);
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
 		inodes_stat.nr_unused--;
 	}
-}
-
-static inline void __inode_sb_list_add(struct inode *inode)
-{
-	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+	spin_unlock(&inode_lru_lock);
 }
 
 /**
@@ -379,15 +398,17 @@ static inline void __inode_sb_list_add(struct inode *inode)
  */
 void inode_sb_list_add(struct inode *inode)
 {
-	spin_lock(&inode_lock);
-	__inode_sb_list_add(inode);
-	spin_unlock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
+	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+	spin_unlock(&inode_sb_list_lock);
 }
 EXPORT_SYMBOL_GPL(inode_sb_list_add);
 
-static inline void __inode_sb_list_del(struct inode *inode)
+static inline void inode_sb_list_del(struct inode *inode)
 {
+	spin_lock(&inode_sb_list_lock);
 	list_del_init(&inode->i_sb_list);
+	spin_unlock(&inode_sb_list_lock);
 }
 
 static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -412,24 +433,15 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 {
 	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
+	spin_lock(&inode->i_lock);
 	hlist_add_head(&inode->i_hash, b);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_hash_lock);
 }
 EXPORT_SYMBOL(__insert_inode_hash);
 
 /**
- * __remove_inode_hash - remove an inode from the hash
- * @inode: inode to unhash
- *
- * Remove an inode from the superblock.
- */
-static void __remove_inode_hash(struct inode *inode)
-{
-	hlist_del_init(&inode->i_hash);
-}
-
-/**
  * remove_inode_hash - remove an inode from the hash
  * @inode: inode to unhash
  *
@@ -437,9 +449,11 @@ static void __remove_inode_hash(struct inode *inode)
  */
 void remove_inode_hash(struct inode *inode)
 {
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
+	spin_lock(&inode->i_lock);
 	hlist_del_init(&inode->i_hash);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&inode_hash_lock);
 }
 EXPORT_SYMBOL(remove_inode_hash);
 
@@ -456,10 +470,29 @@ void end_writeback(struct inode *inode)
 }
 EXPORT_SYMBOL(end_writeback);
 
+/*
+ * Free the inode passed in, removing it from the lists it is still connected
+ * to. We remove any pages still attached to the inode and wait for any IO that
+ * is still in progress before finally destroying the inode.
+ *
+ * An inode must already be marked I_FREEING so that we avoid the inode being
+ * moved back onto lists if we race with other code that manipulates the lists
+ * (e.g. writeback_single_inode). The caller is responsible for setting this.
+ *
+ * An inode must already be removed from the LRU list before being evicted from
+ * the cache. This should occur atomically with setting the I_FREEING state
+ * flag, so no inodes here should ever be on the LRU when being evicted.
+ */
 static void evict(struct inode *inode)
 {
 	const struct super_operations *op = inode->i_sb->s_op;
 
+	BUG_ON(!(inode->i_state & I_FREEING));
+	BUG_ON(!list_empty(&inode->i_lru));
+
+	inode_wb_list_del(inode);
+	inode_sb_list_del(inode);
+
 	if (op->evict_inode) {
 		op->evict_inode(inode);
 	} else {
@@ -471,6 +504,15 @@ static void evict(struct inode *inode)
 		bd_forget(inode);
 	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
 		cd_forget(inode);
+
+	remove_inode_hash(inode);
+
+	spin_lock(&inode->i_lock);
+	wake_up_bit(&inode->i_state, __I_NEW);
+	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+	spin_unlock(&inode->i_lock);
+
+	destroy_inode(inode);
 }
 
 /*
@@ -489,14 +531,6 @@ static void dispose_list(struct list_head *head)
 		list_del_init(&inode->i_lru);
 
 		evict(inode);
-
-		spin_lock(&inode_lock);
-		__remove_inode_hash(inode);
-		__inode_sb_list_del(inode);
-		spin_unlock(&inode_lock);
-
-		wake_up_inode(inode);
-		destroy_inode(inode);
 	}
 }
 
@@ -514,25 +548,23 @@ void evict_inodes(struct super_block *sb)
514 struct inode *inode, *next; 548 struct inode *inode, *next;
515 LIST_HEAD(dispose); 549 LIST_HEAD(dispose);
516 550
517 spin_lock(&inode_lock); 551 spin_lock(&inode_sb_list_lock);
518 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 552 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
519 if (atomic_read(&inode->i_count)) 553 if (atomic_read(&inode->i_count))
520 continue; 554 continue;
521 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 555
556 spin_lock(&inode->i_lock);
557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
558 spin_unlock(&inode->i_lock);
522 continue; 559 continue;
560 }
523 561
524 inode->i_state |= I_FREEING; 562 inode->i_state |= I_FREEING;
525 563 inode_lru_list_del(inode);
526 /* 564 spin_unlock(&inode->i_lock);
527 * Move the inode off the IO lists and LRU once I_FREEING is 565 list_add(&inode->i_lru, &dispose);
528 * set so that it won't get moved back on there if it is dirty.
529 */
530 list_move(&inode->i_lru, &dispose);
531 list_del_init(&inode->i_wb_list);
532 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
533 inodes_stat.nr_unused--;
534 } 566 }
535 spin_unlock(&inode_lock); 567 spin_unlock(&inode_sb_list_lock);
536 568
537 dispose_list(&dispose); 569 dispose_list(&dispose);
538 570
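Reassembled from the hunk for readability, the new evict_inodes() loop shows the lock nesting this series establishes, inode_sb_list_lock outside and inode->i_lock inside (sketch, not verbatim):

	spin_lock(&inode_sb_list_lock);
	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
		if (atomic_read(&inode->i_count))
			continue;			/* still in use */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);	/* owned elsewhere */
			continue;
		}
		inode->i_state |= I_FREEING;		/* claim under i_lock */
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);
		list_add(&inode->i_lru, &dispose);	/* i_lru is free now */
	}
	spin_unlock(&inode_sb_list_lock);

	dispose_list(&dispose);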
@@ -561,31 +593,30 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
561 struct inode *inode, *next; 593 struct inode *inode, *next;
562 LIST_HEAD(dispose); 594 LIST_HEAD(dispose);
563 595
564 spin_lock(&inode_lock); 596 spin_lock(&inode_sb_list_lock);
565 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 597 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
566 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 598 spin_lock(&inode->i_lock);
599 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
600 spin_unlock(&inode->i_lock);
567 continue; 601 continue;
602 }
568 if (inode->i_state & I_DIRTY && !kill_dirty) { 603 if (inode->i_state & I_DIRTY && !kill_dirty) {
604 spin_unlock(&inode->i_lock);
569 busy = 1; 605 busy = 1;
570 continue; 606 continue;
571 } 607 }
572 if (atomic_read(&inode->i_count)) { 608 if (atomic_read(&inode->i_count)) {
609 spin_unlock(&inode->i_lock);
573 busy = 1; 610 busy = 1;
574 continue; 611 continue;
575 } 612 }
576 613
577 inode->i_state |= I_FREEING; 614 inode->i_state |= I_FREEING;
578 615 inode_lru_list_del(inode);
579 /* 616 spin_unlock(&inode->i_lock);
580 * Move the inode off the IO lists and LRU once I_FREEING is 617 list_add(&inode->i_lru, &dispose);
581 * set so that it won't get moved back on there if it is dirty.
582 */
583 list_move(&inode->i_lru, &dispose);
584 list_del_init(&inode->i_wb_list);
585 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
586 inodes_stat.nr_unused--;
587 } 618 }
588 spin_unlock(&inode_lock); 619 spin_unlock(&inode_sb_list_lock);
589 620
590 dispose_list(&dispose); 621 dispose_list(&dispose);
591 622
@@ -607,7 +638,7 @@ static int can_unuse(struct inode *inode)
607 638
608/* 639/*
609 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a 640 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
610 * temporary list and then are freed outside inode_lock by dispose_list(). 641 * temporary list and then are freed outside inode_lru_lock by dispose_list().
611 * 642 *
612 * Any inodes which are pinned purely because of attached pagecache have their 643 * Any inodes which are pinned purely because of attached pagecache have their
613 * pagecache removed. If the inode has metadata buffers attached to 644 * pagecache removed. If the inode has metadata buffers attached to
@@ -628,7 +659,7 @@ static void prune_icache(int nr_to_scan)
628 unsigned long reap = 0; 659 unsigned long reap = 0;
629 660
630 down_read(&iprune_sem); 661 down_read(&iprune_sem);
631 spin_lock(&inode_lock); 662 spin_lock(&inode_lru_lock);
632 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 663 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
633 struct inode *inode; 664 struct inode *inode;
634 665
@@ -638,53 +669,67 @@ static void prune_icache(int nr_to_scan)
638 inode = list_entry(inode_lru.prev, struct inode, i_lru); 669 inode = list_entry(inode_lru.prev, struct inode, i_lru);
639 670
640 /* 671 /*
672 * we are inverting the inode_lru_lock/inode->i_lock here,
673 * so use a trylock. If we fail to get the lock, just move the
674 * inode to the back of the list so we don't spin on it.
675 */
676 if (!spin_trylock(&inode->i_lock)) {
677 list_move(&inode->i_lru, &inode_lru);
678 continue;
679 }
680
681 /*
641 * Referenced or dirty inodes are still in use. Give them 682 * Referenced or dirty inodes are still in use. Give them
642 * another pass through the LRU as we cannot reclaim them now. 683 * another pass through the LRU as we cannot reclaim them now.
643 */ 684 */
644 if (atomic_read(&inode->i_count) || 685 if (atomic_read(&inode->i_count) ||
645 (inode->i_state & ~I_REFERENCED)) { 686 (inode->i_state & ~I_REFERENCED)) {
646 list_del_init(&inode->i_lru); 687 list_del_init(&inode->i_lru);
688 spin_unlock(&inode->i_lock);
647 inodes_stat.nr_unused--; 689 inodes_stat.nr_unused--;
648 continue; 690 continue;
649 } 691 }
650 692
651 /* recently referenced inodes get one more pass */ 693 /* recently referenced inodes get one more pass */
652 if (inode->i_state & I_REFERENCED) { 694 if (inode->i_state & I_REFERENCED) {
653 list_move(&inode->i_lru, &inode_lru);
654 inode->i_state &= ~I_REFERENCED; 695 inode->i_state &= ~I_REFERENCED;
696 list_move(&inode->i_lru, &inode_lru);
697 spin_unlock(&inode->i_lock);
655 continue; 698 continue;
656 } 699 }
657 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 700 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
658 __iget(inode); 701 __iget(inode);
659 spin_unlock(&inode_lock); 702 spin_unlock(&inode->i_lock);
703 spin_unlock(&inode_lru_lock);
660 if (remove_inode_buffers(inode)) 704 if (remove_inode_buffers(inode))
661 reap += invalidate_mapping_pages(&inode->i_data, 705 reap += invalidate_mapping_pages(&inode->i_data,
662 0, -1); 706 0, -1);
663 iput(inode); 707 iput(inode);
664 spin_lock(&inode_lock); 708 spin_lock(&inode_lru_lock);
665 709
666 if (inode != list_entry(inode_lru.next, 710 if (inode != list_entry(inode_lru.next,
667 struct inode, i_lru)) 711 struct inode, i_lru))
668 continue; /* wrong inode or list_empty */ 712 continue; /* wrong inode or list_empty */
669 if (!can_unuse(inode)) 713 /* avoid lock inversions with trylock */
714 if (!spin_trylock(&inode->i_lock))
715 continue;
716 if (!can_unuse(inode)) {
717 spin_unlock(&inode->i_lock);
670 continue; 718 continue;
719 }
671 } 720 }
672 WARN_ON(inode->i_state & I_NEW); 721 WARN_ON(inode->i_state & I_NEW);
673 inode->i_state |= I_FREEING; 722 inode->i_state |= I_FREEING;
723 spin_unlock(&inode->i_lock);
674 724
675 /*
676 * Move the inode off the IO lists and LRU once I_FREEING is
677 * set so that it won't get moved back on there if it is dirty.
678 */
679 list_move(&inode->i_lru, &freeable); 725 list_move(&inode->i_lru, &freeable);
680 list_del_init(&inode->i_wb_list);
681 inodes_stat.nr_unused--; 726 inodes_stat.nr_unused--;
682 } 727 }
683 if (current_is_kswapd()) 728 if (current_is_kswapd())
684 __count_vm_events(KSWAPD_INODESTEAL, reap); 729 __count_vm_events(KSWAPD_INODESTEAL, reap);
685 else 730 else
686 __count_vm_events(PGINODESTEAL, reap); 731 __count_vm_events(PGINODESTEAL, reap);
687 spin_unlock(&inode_lock); 732 spin_unlock(&inode_lru_lock);
688 733
689 dispose_list(&freeable); 734 dispose_list(&freeable);
690 up_read(&iprune_sem); 735 up_read(&iprune_sem);
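The trylock near the top of the loop exists because the normal nesting is inode->i_lock outside inode_lru_lock (the LRU helpers take inode_lru_lock while i_lock is held), whereas prune_icache() holds inode_lru_lock first. The general idiom, as an illustrative sketch:

	/*
	 * ABBA avoidance: holding lock B on a path where lock A normally
	 * nests outside B, trylock A and defer the item on failure rather
	 * than spinning into a deadlock.
	 */
	if (!spin_trylock(&inode->i_lock)) {
		list_move(&inode->i_lru, &inode_lru);	/* revisit later */
		continue;
	}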
@@ -733,15 +778,21 @@ static struct inode *find_inode(struct super_block *sb,
733 778
734repeat: 779repeat:
735 hlist_for_each_entry(inode, node, head, i_hash) { 780 hlist_for_each_entry(inode, node, head, i_hash) {
736 if (inode->i_sb != sb) 781 spin_lock(&inode->i_lock);
782 if (inode->i_sb != sb) {
783 spin_unlock(&inode->i_lock);
737 continue; 784 continue;
738 if (!test(inode, data)) 785 }
786 if (!test(inode, data)) {
787 spin_unlock(&inode->i_lock);
739 continue; 788 continue;
789 }
740 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 790 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
741 __wait_on_freeing_inode(inode); 791 __wait_on_freeing_inode(inode);
742 goto repeat; 792 goto repeat;
743 } 793 }
744 __iget(inode); 794 __iget(inode);
795 spin_unlock(&inode->i_lock);
745 return inode; 796 return inode;
746 } 797 }
747 return NULL; 798 return NULL;
@@ -759,15 +810,21 @@ static struct inode *find_inode_fast(struct super_block *sb,
759 810
760repeat: 811repeat:
761 hlist_for_each_entry(inode, node, head, i_hash) { 812 hlist_for_each_entry(inode, node, head, i_hash) {
762 if (inode->i_ino != ino) 813 spin_lock(&inode->i_lock);
814 if (inode->i_ino != ino) {
815 spin_unlock(&inode->i_lock);
763 continue; 816 continue;
764 if (inode->i_sb != sb) 817 }
818 if (inode->i_sb != sb) {
819 spin_unlock(&inode->i_lock);
765 continue; 820 continue;
821 }
766 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 822 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
767 __wait_on_freeing_inode(inode); 823 __wait_on_freeing_inode(inode);
768 goto repeat; 824 goto repeat;
769 } 825 }
770 __iget(inode); 826 __iget(inode);
827 spin_unlock(&inode->i_lock);
771 return inode; 828 return inode;
772 } 829 }
773 return NULL; 830 return NULL;
@@ -827,19 +884,26 @@ struct inode *new_inode(struct super_block *sb)
827{ 884{
828 struct inode *inode; 885 struct inode *inode;
829 886
830 spin_lock_prefetch(&inode_lock); 887 spin_lock_prefetch(&inode_sb_list_lock);
831 888
832 inode = alloc_inode(sb); 889 inode = alloc_inode(sb);
833 if (inode) { 890 if (inode) {
834 spin_lock(&inode_lock); 891 spin_lock(&inode->i_lock);
835 __inode_sb_list_add(inode);
836 inode->i_state = 0; 892 inode->i_state = 0;
837 spin_unlock(&inode_lock); 893 spin_unlock(&inode->i_lock);
894 inode_sb_list_add(inode);
838 } 895 }
839 return inode; 896 return inode;
840} 897}
841EXPORT_SYMBOL(new_inode); 898EXPORT_SYMBOL(new_inode);
842 899
900/**
901 * unlock_new_inode - clear the I_NEW state and wake up any waiters
902 * @inode: new inode to unlock
903 *
904 * Called when the inode is fully initialised to clear the new state of the
905 * inode and wake up anyone waiting for the inode to finish initialisation.
906 */
843void unlock_new_inode(struct inode *inode) 907void unlock_new_inode(struct inode *inode)
844{ 908{
845#ifdef CONFIG_DEBUG_LOCK_ALLOC 909#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -859,51 +923,67 @@ void unlock_new_inode(struct inode *inode)
859 } 923 }
860 } 924 }
861#endif 925#endif
862 /* 926 spin_lock(&inode->i_lock);
863 * This is special! We do not need the spinlock when clearing I_NEW,
864 * because we're guaranteed that nobody else tries to do anything about
865 * the state of the inode when it is locked, as we just created it (so
866 * there can be no old holders that haven't tested I_NEW).
867 * However we must emit the memory barrier so that other CPUs reliably
868 * see the clearing of I_NEW after the other inode initialisation has
869 * completed.
870 */
871 smp_mb();
872 WARN_ON(!(inode->i_state & I_NEW)); 927 WARN_ON(!(inode->i_state & I_NEW));
873 inode->i_state &= ~I_NEW; 928 inode->i_state &= ~I_NEW;
874 wake_up_inode(inode); 929 wake_up_bit(&inode->i_state, __I_NEW);
930 spin_unlock(&inode->i_lock);
875} 931}
876EXPORT_SYMBOL(unlock_new_inode); 932EXPORT_SYMBOL(unlock_new_inode);
877 933
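A typical filesystem pairing of iget_locked() with unlock_new_inode() looks like the following sketch (the myfs_* names are hypothetical):

	struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
	{
		struct inode *inode = iget_locked(sb, ino);

		if (!inode)
			return ERR_PTR(-ENOMEM);
		if (!(inode->i_state & I_NEW))
			return inode;		/* cache hit, initialised */

		/* I_NEW set: this caller owns initialisation. */
		if (myfs_read_inode_from_disk(inode) < 0) {	/* hypothetical */
			iget_failed(inode);	/* unhash and drop it */
			return ERR_PTR(-EIO);
		}
		unlock_new_inode(inode);	/* clear I_NEW, wake waiters */
		return inode;
	}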
878/* 934/**
879 * This is called without the inode lock held.. Be careful. 935 * iget5_locked - obtain an inode from a mounted file system
936 * @sb: super block of file system
937 * @hashval: hash value (usually inode number) to get
938 * @test: callback used for comparisons between inodes
939 * @set: callback used to initialize a new struct inode
940 * @data: opaque data pointer to pass to @test and @set
941 *
942 * Search for the inode specified by @hashval and @data in the inode cache,
943 * and if present return it with an increased reference count. This is
944 * a generalized version of iget_locked() for file systems where the inode
945 * number is not sufficient for unique identification of an inode.
880 * 946 *
881 * We no longer cache the sb_flags in i_flags - see fs.h 947 * If the inode is not in cache, allocate a new inode and return it locked,
882 * -- rmk@arm.uk.linux.org 948 * hashed, and with the I_NEW flag set. The file system gets to fill it in
949 * before unlocking it via unlock_new_inode().
950 *
951 * Note both @test and @set are called with the inode_hash_lock held, so can't
952 * sleep.
883 */ 953 */
884static struct inode *get_new_inode(struct super_block *sb, 954struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
885 struct hlist_head *head, 955 int (*test)(struct inode *, void *),
886 int (*test)(struct inode *, void *), 956 int (*set)(struct inode *, void *), void *data)
887 int (*set)(struct inode *, void *),
888 void *data)
889{ 957{
958 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
890 struct inode *inode; 959 struct inode *inode;
891 960
961 spin_lock(&inode_hash_lock);
962 inode = find_inode(sb, head, test, data);
963 spin_unlock(&inode_hash_lock);
964
965 if (inode) {
966 wait_on_inode(inode);
967 return inode;
968 }
969
892 inode = alloc_inode(sb); 970 inode = alloc_inode(sb);
893 if (inode) { 971 if (inode) {
894 struct inode *old; 972 struct inode *old;
895 973
896 spin_lock(&inode_lock); 974 spin_lock(&inode_hash_lock);
897 /* We released the lock, so.. */ 975 /* We released the lock, so.. */
898 old = find_inode(sb, head, test, data); 976 old = find_inode(sb, head, test, data);
899 if (!old) { 977 if (!old) {
900 if (set(inode, data)) 978 if (set(inode, data))
901 goto set_failed; 979 goto set_failed;
902 980
903 hlist_add_head(&inode->i_hash, head); 981 spin_lock(&inode->i_lock);
904 __inode_sb_list_add(inode);
905 inode->i_state = I_NEW; 982 inode->i_state = I_NEW;
906 spin_unlock(&inode_lock); 983 hlist_add_head(&inode->i_hash, head);
984 spin_unlock(&inode->i_lock);
985 inode_sb_list_add(inode);
986 spin_unlock(&inode_hash_lock);
907 987
908 /* Return the locked inode with I_NEW set, the 988 /* Return the locked inode with I_NEW set, the
909 * caller is responsible for filling in the contents 989 * caller is responsible for filling in the contents
@@ -916,7 +996,7 @@ static struct inode *get_new_inode(struct super_block *sb,
916 * us. Use the old inode instead of the one we just 996 * us. Use the old inode instead of the one we just
917 * allocated. 997 * allocated.
918 */ 998 */
919 spin_unlock(&inode_lock); 999 spin_unlock(&inode_hash_lock);
920 destroy_inode(inode); 1000 destroy_inode(inode);
921 inode = old; 1001 inode = old;
922 wait_on_inode(inode); 1002 wait_on_inode(inode);
@@ -924,33 +1004,53 @@ static struct inode *get_new_inode(struct super_block *sb,
924 return inode; 1004 return inode;
925 1005
926set_failed: 1006set_failed:
927 spin_unlock(&inode_lock); 1007 spin_unlock(&inode_hash_lock);
928 destroy_inode(inode); 1008 destroy_inode(inode);
929 return NULL; 1009 return NULL;
930} 1010}
1011EXPORT_SYMBOL(iget5_locked);
931 1012
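The @test/@set pair splits the work: @test recognises an existing inode by the opaque key, @set stamps a freshly allocated one with it. A caller sketch (hypothetical myfs_* names and MYFS_I() accessor; neither callback may sleep, as the kerneldoc above notes):

	static int myfs_test(struct inode *inode, void *data)
	{
		return MYFS_I(inode)->object_id == *(u64 *)data; /* 1 = match */
	}

	static int myfs_set(struct inode *inode, void *data)
	{
		MYFS_I(inode)->object_id = *(u64 *)data;	/* 0 = success */
		return 0;
	}

	/* in the lookup path: */
	inode = iget5_locked(sb, (unsigned long)object_id,
			     myfs_test, myfs_set, &object_id);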
932/* 1013/**
933 * get_new_inode_fast is the fast path version of get_new_inode, see the 1014 * iget_locked - obtain an inode from a mounted file system
934 * comment at iget_locked for details. 1015 * @sb: super block of file system
1016 * @ino: inode number to get
1017 *
1018 * Search for the inode specified by @ino in the inode cache and if present
1019 * return it with an increased reference count. This is for file systems
1020 * where the inode number is sufficient for unique identification of an inode.
1021 *
1022 * If the inode is not in cache, allocate a new inode and return it locked,
1023 * hashed, and with the I_NEW flag set. The file system gets to fill it in
1024 * before unlocking it via unlock_new_inode().
935 */ 1025 */
936static struct inode *get_new_inode_fast(struct super_block *sb, 1026struct inode *iget_locked(struct super_block *sb, unsigned long ino)
937 struct hlist_head *head, unsigned long ino)
938{ 1027{
1028 struct hlist_head *head = inode_hashtable + hash(sb, ino);
939 struct inode *inode; 1029 struct inode *inode;
940 1030
1031 spin_lock(&inode_hash_lock);
1032 inode = find_inode_fast(sb, head, ino);
1033 spin_unlock(&inode_hash_lock);
1034 if (inode) {
1035 wait_on_inode(inode);
1036 return inode;
1037 }
1038
941 inode = alloc_inode(sb); 1039 inode = alloc_inode(sb);
942 if (inode) { 1040 if (inode) {
943 struct inode *old; 1041 struct inode *old;
944 1042
945 spin_lock(&inode_lock); 1043 spin_lock(&inode_hash_lock);
946 /* We released the lock, so.. */ 1044 /* We released the lock, so.. */
947 old = find_inode_fast(sb, head, ino); 1045 old = find_inode_fast(sb, head, ino);
948 if (!old) { 1046 if (!old) {
949 inode->i_ino = ino; 1047 inode->i_ino = ino;
950 hlist_add_head(&inode->i_hash, head); 1048 spin_lock(&inode->i_lock);
951 __inode_sb_list_add(inode);
952 inode->i_state = I_NEW; 1049 inode->i_state = I_NEW;
953 spin_unlock(&inode_lock); 1050 hlist_add_head(&inode->i_hash, head);
1051 spin_unlock(&inode->i_lock);
1052 inode_sb_list_add(inode);
1053 spin_unlock(&inode_hash_lock);
954 1054
955 /* Return the locked inode with I_NEW set, the 1055 /* Return the locked inode with I_NEW set, the
956 * caller is responsible for filling in the contents 1056 * caller is responsible for filling in the contents
@@ -963,13 +1063,14 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
963 * us. Use the old inode instead of the one we just 1063 * us. Use the old inode instead of the one we just
964 * allocated. 1064 * allocated.
965 */ 1065 */
966 spin_unlock(&inode_lock); 1066 spin_unlock(&inode_hash_lock);
967 destroy_inode(inode); 1067 destroy_inode(inode);
968 inode = old; 1068 inode = old;
969 wait_on_inode(inode); 1069 wait_on_inode(inode);
970 } 1070 }
971 return inode; 1071 return inode;
972} 1072}
1073EXPORT_SYMBOL(iget_locked);
973 1074
974/* 1075/*
975 * search the inode cache for a matching inode number. 1076 * search the inode cache for a matching inode number.
@@ -984,10 +1085,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
984 struct hlist_node *node; 1085 struct hlist_node *node;
985 struct inode *inode; 1086 struct inode *inode;
986 1087
1088 spin_lock(&inode_hash_lock);
987 hlist_for_each_entry(inode, node, b, i_hash) { 1089 hlist_for_each_entry(inode, node, b, i_hash) {
988 if (inode->i_ino == ino && inode->i_sb == sb) 1090 if (inode->i_ino == ino && inode->i_sb == sb) {
1091 spin_unlock(&inode_hash_lock);
989 return 0; 1092 return 0;
1093 }
990 } 1094 }
1095 spin_unlock(&inode_hash_lock);
991 1096
992 return 1; 1097 return 1;
993} 1098}
@@ -1017,7 +1122,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1017 static unsigned int counter; 1122 static unsigned int counter;
1018 ino_t res; 1123 ino_t res;
1019 1124
1020 spin_lock(&inode_lock);
1021 spin_lock(&iunique_lock); 1125 spin_lock(&iunique_lock);
1022 do { 1126 do {
1023 if (counter <= max_reserved) 1127 if (counter <= max_reserved)
@@ -1025,7 +1129,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1025 res = counter++; 1129 res = counter++;
1026 } while (!test_inode_iunique(sb, res)); 1130 } while (!test_inode_iunique(sb, res));
1027 spin_unlock(&iunique_lock); 1131 spin_unlock(&iunique_lock);
1028 spin_unlock(&inode_lock);
1029 1132
1030 return res; 1133 return res;
1031} 1134}
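iunique() is typically used by filesystems that invent inode numbers at runtime; an illustrative fragment (the reserved bound of 10 is arbitrary):

	struct inode *inode = new_inode(sb);

	if (inode)
		inode->i_ino = iunique(sb, 10);	/* never hands out ino <= 10 */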
@@ -1033,116 +1136,50 @@ EXPORT_SYMBOL(iunique);
1033 1136
1034struct inode *igrab(struct inode *inode) 1137struct inode *igrab(struct inode *inode)
1035{ 1138{
1036 spin_lock(&inode_lock); 1139 spin_lock(&inode->i_lock);
1037 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) 1140 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1038 __iget(inode); 1141 __iget(inode);
1039 else 1142 spin_unlock(&inode->i_lock);
1143 } else {
1144 spin_unlock(&inode->i_lock);
1040 /* 1145 /*
1041 * Handle the case where s_op->clear_inode has not been 1146 * Handle the case where s_op->clear_inode has not been
1042 * called yet, and somebody is calling igrab 1147 * called yet, and somebody is calling igrab
1043 * while the inode is getting freed. 1148 * while the inode is getting freed.
1044 */ 1149 */
1045 inode = NULL; 1150 inode = NULL;
1046 spin_unlock(&inode_lock); 1151 }
1047 return inode; 1152 return inode;
1048} 1153}
1049EXPORT_SYMBOL(igrab); 1154EXPORT_SYMBOL(igrab);
1050 1155
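igrab() serves callers that hold a bare inode pointer without a reference of their own and must not resurrect an inode already being torn down; an illustrative fragment:

	struct inode *pinned = igrab(inode);

	if (!pinned)
		return -ESTALE;	/* I_FREEING/I_WILL_FREE: hands off */
	/* ... pinned may now be used across sleeps ... */
	iput(pinned);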
1051/** 1156/**
1052 * ifind - internal function, you want ilookup5() or iget5().
1053 * @sb: super block of file system to search
1054 * @head: the head of the list to search
1055 * @test: callback used for comparisons between inodes
1056 * @data: opaque data pointer to pass to @test
1057 * @wait: if true wait for the inode to be unlocked, if false do not
1058 *
1059 * ifind() searches for the inode specified by @data in the inode
1060 * cache. This is a generalized version of ifind_fast() for file systems where
1061 * the inode number is not sufficient for unique identification of an inode.
1062 *
1063 * If the inode is in the cache, the inode is returned with an incremented
1064 * reference count.
1065 *
1066 * Otherwise NULL is returned.
1067 *
1068 * Note, @test is called with the inode_lock held, so can't sleep.
1069 */
1070static struct inode *ifind(struct super_block *sb,
1071 struct hlist_head *head, int (*test)(struct inode *, void *),
1072 void *data, const int wait)
1073{
1074 struct inode *inode;
1075
1076 spin_lock(&inode_lock);
1077 inode = find_inode(sb, head, test, data);
1078 if (inode) {
1079 spin_unlock(&inode_lock);
1080 if (likely(wait))
1081 wait_on_inode(inode);
1082 return inode;
1083 }
1084 spin_unlock(&inode_lock);
1085 return NULL;
1086}
1087
1088/**
1089 * ifind_fast - internal function, you want ilookup() or iget().
1090 * @sb: super block of file system to search
1091 * @head: head of the list to search
1092 * @ino: inode number to search for
1093 *
1094 * ifind_fast() searches for the inode @ino in the inode cache. This is for
1095 * file systems where the inode number is sufficient for unique identification
1096 * of an inode.
1097 *
1098 * If the inode is in the cache, the inode is returned with an incremented
1099 * reference count.
1100 *
1101 * Otherwise NULL is returned.
1102 */
1103static struct inode *ifind_fast(struct super_block *sb,
1104 struct hlist_head *head, unsigned long ino)
1105{
1106 struct inode *inode;
1107
1108 spin_lock(&inode_lock);
1109 inode = find_inode_fast(sb, head, ino);
1110 if (inode) {
1111 spin_unlock(&inode_lock);
1112 wait_on_inode(inode);
1113 return inode;
1114 }
1115 spin_unlock(&inode_lock);
1116 return NULL;
1117}
1118
1119/**
1120 * ilookup5_nowait - search for an inode in the inode cache 1157 * ilookup5_nowait - search for an inode in the inode cache
1121 * @sb: super block of file system to search 1158 * @sb: super block of file system to search
1122 * @hashval: hash value (usually inode number) to search for 1159 * @hashval: hash value (usually inode number) to search for
1123 * @test: callback used for comparisons between inodes 1160 * @test: callback used for comparisons between inodes
1124 * @data: opaque data pointer to pass to @test 1161 * @data: opaque data pointer to pass to @test
1125 * 1162 *
1126 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1163 * Search for the inode specified by @hashval and @data in the inode cache.
1127 * @data in the inode cache. This is a generalized version of ilookup() for
1128 * file systems where the inode number is not sufficient for unique
1129 * identification of an inode.
1130 *
1131 * If the inode is in the cache, the inode is returned with an incremented 1164 * If the inode is in the cache, the inode is returned with an incremented
1132 * reference count. Note, the inode lock is not waited upon so you have to be 1165 * reference count.
1133 * very careful what you do with the returned inode. You probably should be
1134 * using ilookup5() instead.
1135 * 1166 *
1136 * Otherwise NULL is returned. 1167 * Note: I_NEW is not waited upon so you have to be very careful what you do
1168 * with the returned inode. You probably should be using ilookup5() instead.
1137 * 1169 *
1138 * Note, @test is called with the inode_lock held, so can't sleep. 1170 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1139 */ 1171 */
1140struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 1172struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1141 int (*test)(struct inode *, void *), void *data) 1173 int (*test)(struct inode *, void *), void *data)
1142{ 1174{
1143 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1175 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1176 struct inode *inode;
1177
1178 spin_lock(&inode_hash_lock);
1179 inode = find_inode(sb, head, test, data);
1180 spin_unlock(&inode_hash_lock);
1144 1181
1145 return ifind(sb, head, test, data, 0); 1182 return inode;
1146} 1183}
1147EXPORT_SYMBOL(ilookup5_nowait); 1184EXPORT_SYMBOL(ilookup5_nowait);
1148 1185
@@ -1153,24 +1190,24 @@ EXPORT_SYMBOL(ilookup5_nowait);
1153 * @test: callback used for comparisons between inodes 1190 * @test: callback used for comparisons between inodes
1154 * @data: opaque data pointer to pass to @test 1191 * @data: opaque data pointer to pass to @test
1155 * 1192 *
1156 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1193 * Search for the inode specified by @hashval and @data in the inode cache,
1157 * @data in the inode cache. This is a generalized version of ilookup() for 1194 * and if the inode is in the cache, return the inode with an incremented
1158 * file systems where the inode number is not sufficient for unique 1195 * reference count. Waits on I_NEW before returning the inode.
1159 * identification of an inode.
1160 *
1161 * If the inode is in the cache, the inode lock is waited upon and the inode is
1162 * returned with an incremented reference count. 1196 * returned with an incremented reference count.
1163 * 1197 *
1164 * Otherwise NULL is returned. 1198 * This is a generalized version of ilookup() for file systems where the
1199 * inode number is not sufficient for unique identification of an inode.
1165 * 1200 *
1166 * Note, @test is called with the inode_lock held, so can't sleep. 1201 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1167 */ 1202 */
1168struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1203struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1169 int (*test)(struct inode *, void *), void *data) 1204 int (*test)(struct inode *, void *), void *data)
1170{ 1205{
1171 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1206 struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
1172 1207
1173 return ifind(sb, head, test, data, 1); 1208 if (inode)
1209 wait_on_inode(inode);
1210 return inode;
1174} 1211}
1175EXPORT_SYMBOL(ilookup5); 1212EXPORT_SYMBOL(ilookup5);
1176 1213
@@ -1179,91 +1216,23 @@ EXPORT_SYMBOL(ilookup5);
1179 * @sb: super block of file system to search 1216 * @sb: super block of file system to search
1180 * @ino: inode number to search for 1217 * @ino: inode number to search for
1181 * 1218 *
1182 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. 1219 * Search for the inode @ino in the inode cache, and if the inode is in the
1183 * This is for file systems where the inode number is sufficient for unique 1220 * cache, the inode is returned with an incremented reference count.
1184 * identification of an inode.
1185 *
1186 * If the inode is in the cache, the inode is returned with an incremented
1187 * reference count.
1188 *
1189 * Otherwise NULL is returned.
1190 */ 1221 */
1191struct inode *ilookup(struct super_block *sb, unsigned long ino) 1222struct inode *ilookup(struct super_block *sb, unsigned long ino)
1192{ 1223{
1193 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1224 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1194
1195 return ifind_fast(sb, head, ino);
1196}
1197EXPORT_SYMBOL(ilookup);
1198
1199/**
1200 * iget5_locked - obtain an inode from a mounted file system
1201 * @sb: super block of file system
1202 * @hashval: hash value (usually inode number) to get
1203 * @test: callback used for comparisons between inodes
1204 * @set: callback used to initialize a new struct inode
1205 * @data: opaque data pointer to pass to @test and @set
1206 *
1207 * iget5_locked() uses ifind() to search for the inode specified by @hashval
1208 * and @data in the inode cache and if present it is returned with an increased
1209 * reference count. This is a generalized version of iget_locked() for file
1210 * systems where the inode number is not sufficient for unique identification
1211 * of an inode.
1212 *
1213 * If the inode is not in cache, get_new_inode() is called to allocate a new
1214 * inode and this is returned locked, hashed, and with the I_NEW flag set. The
1215 * file system gets to fill it in before unlocking it via unlock_new_inode().
1216 *
1217 * Note both @test and @set are called with the inode_lock held, so can't sleep.
1218 */
1219struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
1220 int (*test)(struct inode *, void *),
1221 int (*set)(struct inode *, void *), void *data)
1222{
1223 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1224 struct inode *inode; 1225 struct inode *inode;
1225 1226
1226 inode = ifind(sb, head, test, data, 1); 1227 spin_lock(&inode_hash_lock);
1227 if (inode) 1228 inode = find_inode_fast(sb, head, ino);
1228 return inode; 1229 spin_unlock(&inode_hash_lock);
1229 /*
1230 * get_new_inode() will do the right thing, re-trying the search
1231 * in case it had to block at any point.
1232 */
1233 return get_new_inode(sb, head, test, set, data);
1234}
1235EXPORT_SYMBOL(iget5_locked);
1236
1237/**
1238 * iget_locked - obtain an inode from a mounted file system
1239 * @sb: super block of file system
1240 * @ino: inode number to get
1241 *
1242 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
1243 * the inode cache and if present it is returned with an increased reference
1244 * count. This is for file systems where the inode number is sufficient for
1245 * unique identification of an inode.
1246 *
1247 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
1248 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
1249 * The file system gets to fill it in before unlocking it via
1250 * unlock_new_inode().
1251 */
1252struct inode *iget_locked(struct super_block *sb, unsigned long ino)
1253{
1254 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1255 struct inode *inode;
1256 1230
1257 inode = ifind_fast(sb, head, ino);
1258 if (inode) 1231 if (inode)
1259 return inode; 1232 wait_on_inode(inode);
1260 /* 1233 return inode;
1261 * get_new_inode_fast() will do the right thing, re-trying the search
1262 * in case it had to block at any point.
1263 */
1264 return get_new_inode_fast(sb, head, ino);
1265} 1234}
1266EXPORT_SYMBOL(iget_locked); 1235EXPORT_SYMBOL(ilookup);
1267 1236
1268int insert_inode_locked(struct inode *inode) 1237int insert_inode_locked(struct inode *inode)
1269{ 1238{
@@ -1271,27 +1240,33 @@ int insert_inode_locked(struct inode *inode)
1271 ino_t ino = inode->i_ino; 1240 ino_t ino = inode->i_ino;
1272 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1241 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1273 1242
1274 inode->i_state |= I_NEW;
1275 while (1) { 1243 while (1) {
1276 struct hlist_node *node; 1244 struct hlist_node *node;
1277 struct inode *old = NULL; 1245 struct inode *old = NULL;
1278 spin_lock(&inode_lock); 1246 spin_lock(&inode_hash_lock);
1279 hlist_for_each_entry(old, node, head, i_hash) { 1247 hlist_for_each_entry(old, node, head, i_hash) {
1280 if (old->i_ino != ino) 1248 if (old->i_ino != ino)
1281 continue; 1249 continue;
1282 if (old->i_sb != sb) 1250 if (old->i_sb != sb)
1283 continue; 1251 continue;
1284 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1252 spin_lock(&old->i_lock);
1253 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1254 spin_unlock(&old->i_lock);
1285 continue; 1255 continue;
1256 }
1286 break; 1257 break;
1287 } 1258 }
1288 if (likely(!node)) { 1259 if (likely(!node)) {
1260 spin_lock(&inode->i_lock);
1261 inode->i_state |= I_NEW;
1289 hlist_add_head(&inode->i_hash, head); 1262 hlist_add_head(&inode->i_hash, head);
1290 spin_unlock(&inode_lock); 1263 spin_unlock(&inode->i_lock);
1264 spin_unlock(&inode_hash_lock);
1291 return 0; 1265 return 0;
1292 } 1266 }
1293 __iget(old); 1267 __iget(old);
1294 spin_unlock(&inode_lock); 1268 spin_unlock(&old->i_lock);
1269 spin_unlock(&inode_hash_lock);
1295 wait_on_inode(old); 1270 wait_on_inode(old);
1296 if (unlikely(!inode_unhashed(old))) { 1271 if (unlikely(!inode_unhashed(old))) {
1297 iput(old); 1272 iput(old);
@@ -1308,29 +1283,34 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1308 struct super_block *sb = inode->i_sb; 1283 struct super_block *sb = inode->i_sb;
1309 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1284 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1310 1285
1311 inode->i_state |= I_NEW;
1312
1313 while (1) { 1286 while (1) {
1314 struct hlist_node *node; 1287 struct hlist_node *node;
1315 struct inode *old = NULL; 1288 struct inode *old = NULL;
1316 1289
1317 spin_lock(&inode_lock); 1290 spin_lock(&inode_hash_lock);
1318 hlist_for_each_entry(old, node, head, i_hash) { 1291 hlist_for_each_entry(old, node, head, i_hash) {
1319 if (old->i_sb != sb) 1292 if (old->i_sb != sb)
1320 continue; 1293 continue;
1321 if (!test(old, data)) 1294 if (!test(old, data))
1322 continue; 1295 continue;
1323 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1296 spin_lock(&old->i_lock);
1297 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1298 spin_unlock(&old->i_lock);
1324 continue; 1299 continue;
1300 }
1325 break; 1301 break;
1326 } 1302 }
1327 if (likely(!node)) { 1303 if (likely(!node)) {
1304 spin_lock(&inode->i_lock);
1305 inode->i_state |= I_NEW;
1328 hlist_add_head(&inode->i_hash, head); 1306 hlist_add_head(&inode->i_hash, head);
1329 spin_unlock(&inode_lock); 1307 spin_unlock(&inode->i_lock);
1308 spin_unlock(&inode_hash_lock);
1330 return 0; 1309 return 0;
1331 } 1310 }
1332 __iget(old); 1311 __iget(old);
1333 spin_unlock(&inode_lock); 1312 spin_unlock(&old->i_lock);
1313 spin_unlock(&inode_hash_lock);
1334 wait_on_inode(old); 1314 wait_on_inode(old);
1335 if (unlikely(!inode_unhashed(old))) { 1315 if (unlikely(!inode_unhashed(old))) {
1336 iput(old); 1316 iput(old);
@@ -1375,47 +1355,35 @@ static void iput_final(struct inode *inode)
1375 const struct super_operations *op = inode->i_sb->s_op; 1355 const struct super_operations *op = inode->i_sb->s_op;
1376 int drop; 1356 int drop;
1377 1357
1358 WARN_ON(inode->i_state & I_NEW);
1359
1378 if (op && op->drop_inode) 1360 if (op && op->drop_inode)
1379 drop = op->drop_inode(inode); 1361 drop = op->drop_inode(inode);
1380 else 1362 else
1381 drop = generic_drop_inode(inode); 1363 drop = generic_drop_inode(inode);
1382 1364
1365 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1366 inode->i_state |= I_REFERENCED;
1367 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1368 inode_lru_list_add(inode);
1369 spin_unlock(&inode->i_lock);
1370 return;
1371 }
1372
1383 if (!drop) { 1373 if (!drop) {
1384 if (sb->s_flags & MS_ACTIVE) {
1385 inode->i_state |= I_REFERENCED;
1386 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1387 inode_lru_list_add(inode);
1388 }
1389 spin_unlock(&inode_lock);
1390 return;
1391 }
1392 WARN_ON(inode->i_state & I_NEW);
1393 inode->i_state |= I_WILL_FREE; 1374 inode->i_state |= I_WILL_FREE;
1394 spin_unlock(&inode_lock); 1375 spin_unlock(&inode->i_lock);
1395 write_inode_now(inode, 1); 1376 write_inode_now(inode, 1);
1396 spin_lock(&inode_lock); 1377 spin_lock(&inode->i_lock);
1397 WARN_ON(inode->i_state & I_NEW); 1378 WARN_ON(inode->i_state & I_NEW);
1398 inode->i_state &= ~I_WILL_FREE; 1379 inode->i_state &= ~I_WILL_FREE;
1399 __remove_inode_hash(inode);
1400 } 1380 }
1401 1381
1402 WARN_ON(inode->i_state & I_NEW);
1403 inode->i_state |= I_FREEING; 1382 inode->i_state |= I_FREEING;
1404
1405 /*
1406 * Move the inode off the IO lists and LRU once I_FREEING is
1407 * set so that it won't get moved back on there if it is dirty.
1408 */
1409 inode_lru_list_del(inode); 1383 inode_lru_list_del(inode);
1410 list_del_init(&inode->i_wb_list); 1384 spin_unlock(&inode->i_lock);
1411 1385
1412 __inode_sb_list_del(inode);
1413 spin_unlock(&inode_lock);
1414 evict(inode); 1386 evict(inode);
1415 remove_inode_hash(inode);
1416 wake_up_inode(inode);
1417 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1418 destroy_inode(inode);
1419} 1387}
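Because the hunk interleaves the moved branches, here is the resulting function reassembled in one piece (a readability sketch; the sb declaration sits above the visible hunk):

	static void iput_final(struct inode *inode)	/* i_lock held */
	{
		struct super_block *sb = inode->i_sb;
		const struct super_operations *op = inode->i_sb->s_op;
		int drop;

		WARN_ON(inode->i_state & I_NEW);

		if (op && op->drop_inode)
			drop = op->drop_inode(inode);
		else
			drop = generic_drop_inode(inode);

		if (!drop && (sb->s_flags & MS_ACTIVE)) {
			/* keep it cached: back onto the LRU and done */
			inode->i_state |= I_REFERENCED;
			if (!(inode->i_state & (I_DIRTY|I_SYNC)))
				inode_lru_list_add(inode);
			spin_unlock(&inode->i_lock);
			return;
		}

		if (!drop) {
			/* filesystem going away: sync before freeing */
			inode->i_state |= I_WILL_FREE;
			spin_unlock(&inode->i_lock);
			write_inode_now(inode, 1);
			spin_lock(&inode->i_lock);
			WARN_ON(inode->i_state & I_NEW);
			inode->i_state &= ~I_WILL_FREE;
		}

		inode->i_state |= I_FREEING;
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);

		evict(inode);
	}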
1420 1388
1421/** 1389/**
@@ -1432,7 +1400,7 @@ void iput(struct inode *inode)
1432 if (inode) { 1400 if (inode) {
1433 BUG_ON(inode->i_state & I_CLEAR); 1401 BUG_ON(inode->i_state & I_CLEAR);
1434 1402
1435 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1403 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1436 iput_final(inode); 1404 iput_final(inode);
1437 } 1405 }
1438} 1406}
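Note the atomic_dec_and_lock() pattern: a non-final iput() decrements i_count without taking any lock, and only the 1-to-0 transition acquires the (now per-inode) lock. Semantics sketch:

	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
		/* count reached zero; i_lock is held on this path */
		iput_final(inode);	/* consumes i_lock */
	}
	/* otherwise: decremented from >1, no lock was touched */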
@@ -1611,9 +1579,8 @@ EXPORT_SYMBOL(inode_wait);
1611 * to recheck inode state. 1579 * to recheck inode state.
1612 * 1580 *
1613 * It doesn't matter if I_NEW is not set initially, a call to 1581 * It doesn't matter if I_NEW is not set initially, a call to
1614 * wake_up_inode() after removing from the hash list will DTRT. 1582 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
1615 * 1583 * will DTRT.
1616 * This is called with inode_lock held.
1617 */ 1584 */
1618static void __wait_on_freeing_inode(struct inode *inode) 1585static void __wait_on_freeing_inode(struct inode *inode)
1619{ 1586{
@@ -1621,10 +1588,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
1621 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1588 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1622 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1589 wq = bit_waitqueue(&inode->i_state, __I_NEW);
1623 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1590 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1624 spin_unlock(&inode_lock); 1591 spin_unlock(&inode->i_lock);
1592 spin_unlock(&inode_hash_lock);
1625 schedule(); 1593 schedule();
1626 finish_wait(wq, &wait.wait); 1594 finish_wait(wq, &wait.wait);
1627 spin_lock(&inode_lock); 1595 spin_lock(&inode_hash_lock);
1628} 1596}
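__wait_on_freeing_inode() is the waiter half of a bit-wait handshake; the waker half is the wake_up_bit() call added to evict() further up. Paired together as an illustrative sketch:

	/* waiter: drop both locks before sleeping, retake the hash lock */
	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
	prepare_to_wait(bit_waitqueue(&inode->i_state, __I_NEW),
			&wait.wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
	schedule();

	/* waker, in evict(), once the inode is unhashed: */
	spin_lock(&inode->i_lock);
	wake_up_bit(&inode->i_state, __I_NEW);
	spin_unlock(&inode->i_lock);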
1629 1597
1630static __initdata unsigned long ihash_entries; 1598static __initdata unsigned long ihash_entries;
diff --git a/fs/internal.h b/fs/internal.h
index 8318059b42c6..b29c46e4e32f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -125,6 +125,13 @@ extern long do_handle_open(int mountdirfd,
125/* 125/*
126 * inode.c 126 * inode.c
127 */ 127 */
128extern spinlock_t inode_sb_list_lock;
129
130/*
131 * fs-writeback.c
132 */
133extern void inode_wb_list_del(struct inode *inode);
134
128extern int get_nr_dirty_inodes(void); 135extern int get_nr_dirty_inodes(void);
129extern void evict_inodes(struct super_block *); 136extern void evict_inodes(struct super_block *);
130extern int invalidate_inodes(struct super_block *, bool); 137extern int invalidate_inodes(struct super_block *, bool);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 03b8c240aeda..edfea7a3a747 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
293 return ret; 293 return ret;
294} 294}
295 295
296/* called with inode_lock held */ 296/* called with inode->i_lock held */
297static int logfs_drop_inode(struct inode *inode) 297static int logfs_drop_inode(struct inode *inode)
298{ 298{
299 struct logfs_super *super = logfs_super(inode->i_sb); 299 struct logfs_super *super = logfs_super(inode->i_sb);
diff --git a/fs/namei.c b/fs/namei.c
index d0066e17d45d..3cb616d38d9c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -992,6 +992,12 @@ int follow_down_one(struct path *path)
992 return 0; 992 return 0;
993} 993}
994 994
995static inline bool managed_dentry_might_block(struct dentry *dentry)
996{
997 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
998 dentry->d_op->d_manage(dentry, true) < 0);
999}
1000
995/* 1001/*
996 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we 1002 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
997 * meet a managed dentry and we're not walking to "..". True is returned to 1003 * meet a managed dentry and we're not walking to "..". True is returned to
@@ -1000,19 +1006,26 @@ int follow_down_one(struct path *path)
1000static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 1006static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1001 struct inode **inode, bool reverse_transit) 1007 struct inode **inode, bool reverse_transit)
1002{ 1008{
1003 while (d_mountpoint(path->dentry)) { 1009 for (;;) {
1004 struct vfsmount *mounted; 1010 struct vfsmount *mounted;
1005 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && 1011 /*
1006 !reverse_transit && 1012 * Don't forget we might have a non-mountpoint managed dentry
1007 path->dentry->d_op->d_manage(path->dentry, true) < 0) 1013 * that wants to block transit.
1014 */
1015 *inode = path->dentry->d_inode;
1016 if (!reverse_transit &&
1017 unlikely(managed_dentry_might_block(path->dentry)))
1008 return false; 1018 return false;
1019
1020 if (!d_mountpoint(path->dentry))
1021 break;
1022
1009 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 1023 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1010 if (!mounted) 1024 if (!mounted)
1011 break; 1025 break;
1012 path->mnt = mounted; 1026 path->mnt = mounted;
1013 path->dentry = mounted->mnt_root; 1027 path->dentry = mounted->mnt_root;
1014 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 1028 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1015 *inode = path->dentry->d_inode;
1016 } 1029 }
1017 1030
1018 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) 1031 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4c29fcf557d1..07ea8d3e6ea2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -22,13 +22,14 @@
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25#include <linux/writeback.h> /* for inode_lock */
26 25
27#include <asm/atomic.h> 26#include <asm/atomic.h>
28 27
29#include <linux/fsnotify_backend.h> 28#include <linux/fsnotify_backend.h>
30#include "fsnotify.h" 29#include "fsnotify.h"
31 30
31#include "../internal.h"
32
32/* 33/*
33 * Recalculate the mask of events relevant to a given inode locked. 34 * Recalculate the mask of events relevant to a given inode locked.
34 */ 35 */
@@ -237,15 +238,14 @@ out:
237 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. 238 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
238 * @list: list of inodes being unmounted (sb->s_inodes) 239 * @list: list of inodes being unmounted (sb->s_inodes)
239 * 240 *
240 * Called with inode_lock held, protecting the unmounting super block's list 241 * Called during unmount with no locks held, so needs to be safe against
241 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. 242 * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
242 * We temporarily drop inode_lock, however, and CAN block.
243 */ 243 */
244void fsnotify_unmount_inodes(struct list_head *list) 244void fsnotify_unmount_inodes(struct list_head *list)
245{ 245{
246 struct inode *inode, *next_i, *need_iput = NULL; 246 struct inode *inode, *next_i, *need_iput = NULL;
247 247
248 spin_lock(&inode_lock); 248 spin_lock(&inode_sb_list_lock);
249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
250 struct inode *need_iput_tmp; 250 struct inode *need_iput_tmp;
251 251
@@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
254 * I_WILL_FREE, or I_NEW which is fine because by that point 254 * I_WILL_FREE, or I_NEW which is fine because by that point
255 * the inode cannot have any associated watches. 255 * the inode cannot have any associated watches.
256 */ 256 */
257 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 257 spin_lock(&inode->i_lock);
258 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
259 spin_unlock(&inode->i_lock);
258 continue; 260 continue;
261 }
259 262
260 /* 263 /*
261 * If i_count is zero, the inode cannot have any watches and 264 * If i_count is zero, the inode cannot have any watches and
@@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list)
263 * evict all inodes with zero i_count from icache which is 266 * evict all inodes with zero i_count from icache which is
264 * unnecessarily violent and may in fact be illegal to do. 267 * unnecessarily violent and may in fact be illegal to do.
265 */ 268 */
266 if (!atomic_read(&inode->i_count)) 269 if (!atomic_read(&inode->i_count)) {
270 spin_unlock(&inode->i_lock);
267 continue; 271 continue;
272 }
268 273
269 need_iput_tmp = need_iput; 274 need_iput_tmp = need_iput;
270 need_iput = NULL; 275 need_iput = NULL;
@@ -274,22 +279,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
274 __iget(inode); 279 __iget(inode);
275 else 280 else
276 need_iput_tmp = NULL; 281 need_iput_tmp = NULL;
282 spin_unlock(&inode->i_lock);
277 283
278 /* In case the dropping of a reference would nuke next_i. */ 284 /* In case the dropping of a reference would nuke next_i. */
279 if ((&next_i->i_sb_list != list) && 285 if ((&next_i->i_sb_list != list) &&
280 atomic_read(&next_i->i_count) && 286 atomic_read(&next_i->i_count)) {
281 !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { 287 spin_lock(&next_i->i_lock);
282 __iget(next_i); 288 if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
283 need_iput = next_i; 289 __iget(next_i);
290 need_iput = next_i;
291 }
292 spin_unlock(&next_i->i_lock);
284 } 293 }
285 294
286 /* 295 /*
287 * We can safely drop inode_lock here because we hold 296 * We can safely drop inode_sb_list_lock here because we hold
288 * references on both inode and next_i. Also no new inodes 297 * references on both inode and next_i. Also no new inodes
289 * will be added since the umount has begun. Finally, 298 * will be added since the umount has begun.
290 * iprune_mutex keeps shrink_icache_memory() away.
291 */ 299 */
292 spin_unlock(&inode_lock); 300 spin_unlock(&inode_sb_list_lock);
293 301
294 if (need_iput_tmp) 302 if (need_iput_tmp)
295 iput(need_iput_tmp); 303 iput(need_iput_tmp);
@@ -301,7 +309,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
301 309
302 iput(inode); 310 iput(inode);
303 311
304 spin_lock(&inode_lock); 312 spin_lock(&inode_sb_list_lock);
305 } 313 }
306 spin_unlock(&inode_lock); 314 spin_unlock(&inode_sb_list_lock);
307} 315}
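The function above is the canonical shape of a blocking walk over s_inodes: pin the inode with __iget() under i_lock, drop the list lock, do the sleeping work, then retake the lock. Stripped of the next_i bookkeeping (which the real function needs to keep its list cursor valid), the skeleton is as follows (do_blocking_work() is hypothetical):

	spin_lock(&inode_sb_list_lock);
	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);		/* pin: cannot be evicted under us */
		spin_unlock(&inode->i_lock);

		spin_unlock(&inode_sb_list_lock);
		do_blocking_work(inode);	/* may sleep */
		iput(inode);
		spin_lock(&inode_sb_list_lock);
	}
	spin_unlock(&inode_sb_list_lock);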
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 325185e514bb..50c00856f730 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,6 @@
91#include <linux/slab.h> 91#include <linux/slab.h>
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/srcu.h> 93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95 94
96#include <asm/atomic.h> 95#include <asm/atomic.h>
97 96
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 85eebff6d0d7..e86577d6c5c3 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -23,7 +23,6 @@
23#include <linux/mount.h> 23#include <linux/mount.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27 26
28#include <asm/atomic.h> 27#include <asm/atomic.h>
29 28
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index a627ed82c0a3..0b56c6b7ec01 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -54,7 +54,7 @@
54 * 54 *
55 * Return 1 if the attributes match and 0 if not. 55 * Return 1 if the attributes match and 0 if not.
56 * 56 *
57 * NOTE: This function runs with the inode_lock spin lock held so it is not 57 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
58 * allowed to sleep. 58 * allowed to sleep.
59 */ 59 */
60int ntfs_test_inode(struct inode *vi, ntfs_attr *na) 60int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
98 * 98 *
99 * Return 0 on success and -errno on error. 99 * Return 0 on success and -errno on error.
100 * 100 *
101 * NOTE: This function runs with the inode_lock spin lock held so it is not 101 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.) 102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
103 */ 103 */
104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) 104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a2a622e079f0..fcc8ae75d874 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -76,7 +76,7 @@
76#include <linux/buffer_head.h> 76#include <linux/buffer_head.h>
77#include <linux/capability.h> 77#include <linux/capability.h>
78#include <linux/quotaops.h> 78#include <linux/quotaops.h>
79#include <linux/writeback.h> /* for inode_lock, oddly enough.. */ 79#include "../internal.h" /* ugh */
80 80
81#include <asm/uaccess.h> 81#include <asm/uaccess.h>
82 82
@@ -900,33 +900,38 @@ static void add_dquot_ref(struct super_block *sb, int type)
900 int reserved = 0; 900 int reserved = 0;
901#endif 901#endif
902 902
903 spin_lock(&inode_lock); 903 spin_lock(&inode_sb_list_lock);
904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
905 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 905 spin_lock(&inode->i_lock);
906 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
907 !atomic_read(&inode->i_writecount) ||
908 !dqinit_needed(inode, type)) {
909 spin_unlock(&inode->i_lock);
906 continue; 910 continue;
911 }
907#ifdef CONFIG_QUOTA_DEBUG 912#ifdef CONFIG_QUOTA_DEBUG
908 if (unlikely(inode_get_rsv_space(inode) > 0)) 913 if (unlikely(inode_get_rsv_space(inode) > 0))
909 reserved = 1; 914 reserved = 1;
910#endif 915#endif
911 if (!atomic_read(&inode->i_writecount))
912 continue;
913 if (!dqinit_needed(inode, type))
914 continue;
915
916 __iget(inode); 916 __iget(inode);
917 spin_unlock(&inode_lock); 917 spin_unlock(&inode->i_lock);
918 spin_unlock(&inode_sb_list_lock);
918 919
919 iput(old_inode); 920 iput(old_inode);
920 __dquot_initialize(inode, type); 921 __dquot_initialize(inode, type);
921 /* We hold a reference to 'inode' so it couldn't have been 922
922 * removed from s_inodes list while we dropped the inode_lock. 923 /*
923 * We cannot iput the inode now as we can be holding the last 924 * We hold a reference to 'inode' so it couldn't have been
924 * reference and we cannot iput it under inode_lock. So we 925 * removed from s_inodes list while we dropped the
925 * keep the reference and iput it later. */ 926 * inode_sb_list_lock. We cannot iput the inode now as we can be
927 * holding the last reference and we cannot iput it under
928 * inode_sb_list_lock. So we keep the reference and iput it
929 * later.
930 */
926 old_inode = inode; 931 old_inode = inode;
927 spin_lock(&inode_lock); 932 spin_lock(&inode_sb_list_lock);
928 } 933 }
929 spin_unlock(&inode_lock); 934 spin_unlock(&inode_sb_list_lock);
930 iput(old_inode); 935 iput(old_inode);
931 936
932#ifdef CONFIG_QUOTA_DEBUG 937#ifdef CONFIG_QUOTA_DEBUG
@@ -1007,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1007 struct inode *inode; 1012 struct inode *inode;
1008 int reserved = 0; 1013 int reserved = 0;
1009 1014
1010 spin_lock(&inode_lock); 1015 spin_lock(&inode_sb_list_lock);
1011 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1016 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1012 /* 1017 /*
1013 * We have to scan also I_NEW inodes because they can already 1018 * We have to scan also I_NEW inodes because they can already
@@ -1021,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1021 remove_inode_dquot_ref(inode, type, tofree_head); 1026 remove_inode_dquot_ref(inode, type, tofree_head);
1022 } 1027 }
1023 } 1028 }
1024 spin_unlock(&inode_lock); 1029 spin_unlock(&inode_sb_list_lock);
1025#ifdef CONFIG_QUOTA_DEBUG 1030#ifdef CONFIG_QUOTA_DEBUG
1026 if (reserved) { 1031 if (reserved) {
1027 printk(KERN_WARNING "VFS (%s): Writes happened after quota" 1032 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ce7e18555197..b677bd77f2d6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1636,7 +1636,7 @@ struct super_operations {
1636}; 1636};
1637 1637
1638/* 1638/*
1639 * Inode state bits. Protected by inode_lock. 1639 * Inode state bits. Protected by inode->i_lock
1640 * 1640 *
1641 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, 1641 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
1642 * I_DIRTY_DATASYNC and I_DIRTY_PAGES. 1642 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index eb354f6f26b3..26f9e3612e0f 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -277,7 +277,7 @@ static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
277 /* 277 /*
278 * Mark inode fully dirty. Since we are allocating blocks, inode 278 * Mark inode fully dirty. Since we are allocating blocks, inode
279 * would become fully dirty soon anyway and it reportedly 279 * would become fully dirty soon anyway and it reportedly
280 * reduces inode_lock contention. 280 * reduces lock contention.
281 */ 281 */
282 mark_inode_dirty(inode); 282 mark_inode_dirty(inode);
283 } 283 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 0ead399e08b5..17e7ccc322a5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -9,7 +9,7 @@
9 9
10struct backing_dev_info; 10struct backing_dev_info;
11 11
12extern spinlock_t inode_lock; 12extern spinlock_t inode_wb_list_lock;
13 13
14/* 14/*
15 * fs/fs-writeback.c 15 * fs/fs-writeback.c
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8fe9d3407921..0d9a036ada66 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -67,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
67 struct inode *inode; 67 struct inode *inode;
68 68
69 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 69 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_lock); 70 spin_lock(&inode_wb_list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 72 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 73 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 74 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 76 nr_more_io++;
77 spin_unlock(&inode_lock); 77 spin_unlock(&inode_wb_list_lock);
78 78
79 global_dirty_limits(&background_thresh, &dirty_thresh); 79 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -676,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
676 if (bdi_has_dirty_io(bdi)) { 676 if (bdi_has_dirty_io(bdi)) {
677 struct bdi_writeback *dst = &default_backing_dev_info.wb; 677 struct bdi_writeback *dst = &default_backing_dev_info.wb;
678 678
679 spin_lock(&inode_lock); 679 spin_lock(&inode_wb_list_lock);
680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
681 list_splice(&bdi->wb.b_io, &dst->b_io); 681 list_splice(&bdi->wb.b_io, &dst->b_io);
682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
683 spin_unlock(&inode_lock); 683 spin_unlock(&inode_wb_list_lock);
684 } 684 }
685 685
686 bdi_unregister(bdi); 686 bdi_unregister(bdi);
diff --git a/mm/filemap.c b/mm/filemap.c
index 04d1992fd86b..c641edf553a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
80 * ->i_mutex 80 * ->i_mutex
81 * ->i_alloc_sem (various) 81 * ->i_alloc_sem (various)
82 * 82 *
83 * ->inode_lock 83 * inode_wb_list_lock
84 * ->sb_lock (fs/fs-writeback.c) 84 * sb_lock (fs/fs-writeback.c)
85 * ->mapping->tree_lock (__sync_single_inode) 85 * ->mapping->tree_lock (__sync_single_inode)
86 * 86 *
87 * ->i_mmap_lock 87 * ->i_mmap_lock
@@ -98,8 +98,10 @@
98 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 98 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
99 * ->private_lock (page_remove_rmap->set_page_dirty) 99 * ->private_lock (page_remove_rmap->set_page_dirty)
100 * ->tree_lock (page_remove_rmap->set_page_dirty) 100 * ->tree_lock (page_remove_rmap->set_page_dirty)
101 * ->inode_lock (page_remove_rmap->set_page_dirty) 101 * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
102 * ->inode_lock (zap_pte_range->set_page_dirty) 102 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
103 * inode_wb_list_lock (zap_pte_range->set_page_dirty)
104 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 105 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 106 *
105 * (code doesn't rely on that order, so you could switch it around) 107 * (code doesn't rely on that order, so you could switch it around)
diff --git a/mm/rmap.c b/mm/rmap.c
index 4a8e99a0fb97..8da044a1db0f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -31,11 +31,12 @@
31 * swap_lock (in swap_duplicate, swap_info_get) 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * mmlist_lock (in mmput, drain_mmlist and others) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 * inode_lock (in set_page_dirty's __mark_inode_dirty) 34 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 36 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 37 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 38 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 39 * within inode_wb_list_lock in __sync_single_inode)
39 * 40 *
40 * (code doesn't rely on that order so it could be switched around) 41 * (code doesn't rely on that order so it could be switched around)
41 * ->tasklist_lock 42 * ->tasklist_lock