author     Nick Piggin <npiggin@suse.de>      2010-10-23 06:55:17 -0400
committer  Al Viro <viro@zeniv.linux.org.uk>  2010-10-25 21:26:09 -0400
commit     9e38d86ff2d8a8db99570e982230861046df32b5 (patch)
tree       7ea2ceea24a4e070259a4585b2748c9e2c070ee0
parent     cffbc8aa334f55c9ed42d25202eb3ebf3a97c195 (diff)
fs: Implement lazy LRU updates for inodes
Convert the inode LRU to use lazy updates to reduce lock and
cacheline traffic. We avoid moving inodes around in the LRU list
during iget/iput operations so these frequent operations don't need
to access the LRUs. Instead, we defer the refcount checks to
reclaim-time and use a per-inode state flag, I_REFERENCED, to tell
reclaim that iget has touched the inode in the past. This means that
only reclaim should be touching the LRU with any frequency, hence
significantly reducing lock acquisitions and the amount of contention
on LRU updates.
This also removes the inode_in_use list, which means we now only
have one list for tracking the inode LRU status. This makes it much
simpler to split out the LRU list operations under its own lock.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--  fs/fs-writeback.c          | 11
-rw-r--r--  fs/inode.c                 | 86
-rw-r--r--  include/linux/fs.h         | 13
-rw-r--r--  include/linux/writeback.h  |  2
4 files changed, 71 insertions, 41 deletions
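For orientation before reading the patch, here is a minimal user-space sketch of the lazy-reference idea described in the commit message. It is not kernel code: struct obj, lazy_iput() and lazy_prune() are invented for this illustration. The put path only sets a "referenced" flag instead of repositioning the object on the LRU; the reclaim pass clears the flag and rotates the object once, so recently used objects survive one extra scan.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative cache object with an intrusive LRU link and a lazy flag. */
struct obj {
	struct obj *lru_prev, *lru_next;	/* circular doubly linked LRU */
	int count;				/* active references */
	bool referenced;			/* set on put instead of a list move */
	int id;
};

static struct obj lru = { &lru, &lru, 0, false, -1 };	/* list head */

static void lru_add_tail(struct obj *o)
{
	o->lru_prev = lru.lru_prev;
	o->lru_next = &lru;
	lru.lru_prev->lru_next = o;
	lru.lru_prev = o;
}

static void lru_del(struct obj *o)
{
	o->lru_prev->lru_next = o->lru_next;
	o->lru_next->lru_prev = o->lru_prev;
}

/* Cheap put: no LRU manipulation, just mark the object as recently used. */
static void lazy_iput(struct obj *o)
{
	if (--o->count == 0)
		o->referenced = true;	/* defer the LRU decision to reclaim */
}

/* Reclaim: recently referenced objects get one more trip around the LRU. */
static void lazy_prune(int nr_to_scan)
{
	while (nr_to_scan-- > 0 && lru.lru_next != &lru) {
		struct obj *o = lru.lru_next;	/* oldest entry */

		lru_del(o);
		if (o->referenced) {
			o->referenced = false;
			lru_add_tail(o);	/* second chance */
			continue;
		}
		printf("reclaiming obj %d\n", o->id);
		free(o);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct obj *o = calloc(1, sizeof(*o));
		o->id = i;
		o->count = 1;
		lru_add_tail(o);
		lazy_iput(o);		/* sets ->referenced, no list move */
	}
	lazy_prune(8);			/* first pass rotates, second pass frees */
	return 0;
}

The same trade-off appears in the patch below: iput_final() sets I_REFERENCED instead of reordering the LRU, and prune_icache() gives flagged inodes a second pass before reclaiming them.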
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f04d04af84f2..e8f65290e836 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -408,16 +408,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * completion.
 			 */
 			redirty_tail(inode);
-		} else if (atomic_read(&inode->i_count)) {
-			/*
-			 * The inode is clean, inuse
-			 */
-			list_move(&inode->i_list, &inode_in_use);
 		} else {
 			/*
-			 * The inode is clean, unused
+			 * The inode is clean. At this point we either have
+			 * a reference to the inode or it's on it's way out.
+			 * No need to add it back to the LRU.
 			 */
-			list_move(&inode->i_list, &inode_unused);
+			list_del_init(&inode->i_list);
 		}
 	}
 	inode_sync_complete(inode);
diff --git a/fs/inode.c b/fs/inode.c
index 0d5aeccbdd90..3bdc76f1653a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
  * allowing for low-overhead inode sync() operations.
  */
 
-LIST_HEAD(inode_in_use);
-LIST_HEAD(inode_unused);
+static LIST_HEAD(inode_unused);
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -291,6 +290,7 @@ void inode_init_once(struct inode *inode)
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
+	INIT_LIST_HEAD(&inode->i_list);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -317,12 +317,23 @@ static void init_once(void *foo)
  */
 void __iget(struct inode *inode)
 {
-	if (atomic_inc_return(&inode->i_count) != 1)
-		return;
+	atomic_inc(&inode->i_count);
+}
 
-	if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-		list_move(&inode->i_list, &inode_in_use);
-	percpu_counter_dec(&nr_inodes_unused);
+static void inode_lru_list_add(struct inode *inode)
+{
+	if (list_empty(&inode->i_list)) {
+		list_add(&inode->i_list, &inode_unused);
+		percpu_counter_inc(&nr_inodes_unused);
+	}
+}
+
+static void inode_lru_list_del(struct inode *inode)
+{
+	if (!list_empty(&inode->i_list)) {
+		list_del_init(&inode->i_list);
+		percpu_counter_dec(&nr_inodes_unused);
+	}
 }
 
 void end_writeback(struct inode *inode)
@@ -367,7 +378,7 @@ static void dispose_list(struct list_head *head)
 		struct inode *inode;
 
 		inode = list_first_entry(head, struct inode, i_list);
-		list_del(&inode->i_list);
+		list_del_init(&inode->i_list);
 
 		evict(inode);
 
@@ -413,7 +424,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 			list_move(&inode->i_list, dispose);
 			WARN_ON(inode->i_state & I_NEW);
 			inode->i_state |= I_FREEING;
-			percpu_counter_dec(&nr_inodes_unused);
+			if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+				percpu_counter_dec(&nr_inodes_unused);
 			continue;
 		}
 		busy = 1;
@@ -448,7 +460,7 @@ int invalidate_inodes(struct super_block *sb)
 
 static int can_unuse(struct inode *inode)
 {
-	if (inode->i_state)
+	if (inode->i_state & ~I_REFERENCED)
 		return 0;
 	if (inode_has_buffers(inode))
 		return 0;
@@ -460,17 +472,20 @@ static int can_unuse(struct inode *inode)
 }
 
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
+ * temporary list and then are freed outside inode_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed.  We expect the final iput() on that inode to add it to
- * the front of the inode_unused list.  So look for it there and if the
- * inode is still freeable, proceed.  The right inode is found 99.9% of the
- * time in testing on a 4-way.
+ * pagecache removed. If the inode has metadata buffers attached to
+ * mapping->private_list then try to remove them.
 *
- * If the inode has metadata buffers attached to mapping->private_list then
- * try to remove them.
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
 */
 static void prune_icache(int nr_to_scan)
 {
@@ -488,8 +503,21 @@ static void prune_icache(int nr_to_scan)
 
 		inode = list_entry(inode_unused.prev, struct inode, i_list);
 
-		if (inode->i_state || atomic_read(&inode->i_count)) {
+		/*
+		 * Referenced or dirty inodes are still in use. Give them
+		 * another pass through the LRU as we canot reclaim them now.
+		 */
+		if (atomic_read(&inode->i_count) ||
+		    (inode->i_state & ~I_REFERENCED)) {
+			list_del_init(&inode->i_list);
+			percpu_counter_dec(&nr_inodes_unused);
+			continue;
+		}
+
+		/* recently referenced inodes get one more pass */
+		if (inode->i_state & I_REFERENCED) {
 			list_move(&inode->i_list, &inode_unused);
+			inode->i_state &= ~I_REFERENCED;
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -620,7 +648,6 @@ static inline void
 __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
 			struct inode *inode)
 {
-	list_add(&inode->i_list, &inode_in_use);
 	list_add(&inode->i_sb_list, &sb->s_inodes);
 	if (head)
 		hlist_add_head(&inode->i_hash, head);
@@ -1237,10 +1264,11 @@ static void iput_final(struct inode *inode)
 	drop = generic_drop_inode(inode);
 
 	if (!drop) {
-		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-			list_move(&inode->i_list, &inode_unused);
-		percpu_counter_inc(&nr_inodes_unused);
 		if (sb->s_flags & MS_ACTIVE) {
+			inode->i_state |= I_REFERENCED;
+			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+				inode_lru_list_add(inode);
+			}
 			spin_unlock(&inode_lock);
 			return;
 		}
@@ -1251,13 +1279,19 @@ static void iput_final(struct inode *inode)
 		spin_lock(&inode_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		percpu_counter_dec(&nr_inodes_unused);
 		hlist_del_init(&inode->i_hash);
 	}
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
 	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
+
+	/*
+	 * After we delete the inode from the LRU here, we avoid moving dirty
+	 * inodes back onto the LRU now because I_FREEING is set and hence
+	 * writeback_single_inode() won't move the inode around.
+	 */
+	inode_lru_list_del(inode);
+
+	list_del_init(&inode->i_sb_list);
 	spin_unlock(&inode_lock);
 	evict(inode);
 	spin_lock(&inode_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a3937a8ee95e..876275fc0638 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1641,16 +1641,17 @@ struct super_operations {
 *
 * Q: What is the difference between I_WILL_FREE and I_FREEING?
 */
-#define I_DIRTY_SYNC		1
-#define I_DIRTY_DATASYNC	2
-#define I_DIRTY_PAGES		4
+#define I_DIRTY_SYNC		(1 << 0)
+#define I_DIRTY_DATASYNC	(1 << 1)
+#define I_DIRTY_PAGES		(1 << 2)
 #define __I_NEW			3
 #define I_NEW			(1 << __I_NEW)
-#define I_WILL_FREE		16
-#define I_FREEING		32
-#define I_CLEAR			64
+#define I_WILL_FREE		(1 << 4)
+#define I_FREEING		(1 << 5)
+#define I_CLEAR			(1 << 6)
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define I_REFERENCED		(1 << 8)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 72a5d647a5f2..242b6f812ba6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -10,8 +10,6 @@
 struct backing_dev_info;
 
 extern spinlock_t inode_lock;
-extern struct list_head inode_in_use;
-extern struct list_head inode_unused;
 
 /*
  * fs/fs-writeback.c