author     Nick Piggin <npiggin@suse.de>        2010-10-23 06:55:17 -0400
committer  Al Viro <viro@zeniv.linux.org.uk>    2010-10-25 21:26:09 -0400
commit     9e38d86ff2d8a8db99570e982230861046df32b5 (patch)
tree       7ea2ceea24a4e070259a4585b2748c9e2c070ee0
parent     cffbc8aa334f55c9ed42d25202eb3ebf3a97c195 (diff)
fs: Implement lazy LRU updates for inodes
Convert the inode LRU to use lazy updates to reduce lock and cacheline
traffic. We avoid moving inodes around in the LRU list during iget/iput
operations, so these frequent operations don't need to access the LRUs.
Instead, we defer the refcount checks to reclaim time and use a per-inode
state flag, I_REFERENCED, to tell reclaim that iget has touched the inode
in the past. This means that only reclaim should be touching the LRU with
any frequency, hence significantly reducing lock acquisitions and the
amount of contention on LRU updates.

This also removes the inode_in_use list, which means we now only have one
list for tracking the inode LRU status. This makes it much simpler to
split out the LRU list operations under its own lock.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
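To illustrate the idea before reading the patch, here is a minimal user-space
sketch of the lazy-LRU pattern, under stated assumptions: struct obj, obj_get()
and obj_put() are hypothetical stand-ins for the kernel's inode, __iget() and
iput_final(), and the list handling is open-coded rather than the kernel's
list_head helpers. The point is that the hot get/put paths never touch the
list; only the final put parks the object, flagged, for reclaim to sort out.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical cached object standing in for struct inode. */
struct obj {
        int refcount;
        bool referenced;                /* plays the role of I_REFERENCED */
        struct obj *prev, *next;        /* LRU linkage; NULL when off-list */
        int id;
};

/* Circular list head, modelled loosely on the kernel's list_head. */
static struct obj lru = { .prev = &lru, .next = &lru };

static void lru_add(struct obj *o)      /* add at the hot (head) end */
{
        o->next = lru.next;
        o->prev = &lru;
        lru.next->prev = o;
        lru.next = o;
}

/* The hot paths: no LRU manipulation at all, as in the patched __iget(). */
static void obj_get(struct obj *o)
{
        o->refcount++;
}

static void obj_put(struct obj *o)
{
        if (--o->refcount)
                return;
        /* Last reference dropped: flag it and park it on the LRU lazily. */
        o->referenced = true;
        if (!o->next)
                lru_add(o);
}

int main(void)
{
        struct obj a = { .id = 1 };

        obj_get(&a);    /* first user: the LRU is never touched */
        obj_get(&a);    /* second user: still no list traffic */
        obj_put(&a);
        obj_put(&a);    /* last put: flagged and queued for reclaim */

        printf("obj %d: on LRU=%s referenced=%s\n", a.id,
               a.next ? "yes" : "no", a.referenced ? "yes" : "no");
        return 0;
}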
-rw-r--r--   fs/fs-writeback.c           11
-rw-r--r--   fs/inode.c                  86
-rw-r--r--   include/linux/fs.h          13
-rw-r--r--   include/linux/writeback.h    2
4 files changed, 71 insertions, 41 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f04d04af84f2..e8f65290e836 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -408,16 +408,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * completion.
 			 */
 			redirty_tail(inode);
-		} else if (atomic_read(&inode->i_count)) {
-			/*
-			 * The inode is clean, inuse
-			 */
-			list_move(&inode->i_list, &inode_in_use);
 		} else {
 			/*
-			 * The inode is clean, unused
+			 * The inode is clean. At this point we either have
+			 * a reference to the inode or it's on it's way out.
+			 * No need to add it back to the LRU.
 			 */
-			list_move(&inode->i_list, &inode_unused);
+			list_del_init(&inode->i_list);
 		}
 	}
 	inode_sync_complete(inode);
diff --git a/fs/inode.c b/fs/inode.c
index 0d5aeccbdd90..3bdc76f1653a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
  * allowing for low-overhead inode sync() operations.
  */
 
-LIST_HEAD(inode_in_use);
-LIST_HEAD(inode_unused);
+static LIST_HEAD(inode_unused);
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -291,6 +290,7 @@ void inode_init_once(struct inode *inode)
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
+	INIT_LIST_HEAD(&inode->i_list);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -317,12 +317,23 @@ static void init_once(void *foo)
  */
 void __iget(struct inode *inode)
 {
-	if (atomic_inc_return(&inode->i_count) != 1)
-		return;
+	atomic_inc(&inode->i_count);
+}
 
-	if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-		list_move(&inode->i_list, &inode_in_use);
-	percpu_counter_dec(&nr_inodes_unused);
+static void inode_lru_list_add(struct inode *inode)
+{
+	if (list_empty(&inode->i_list)) {
+		list_add(&inode->i_list, &inode_unused);
+		percpu_counter_inc(&nr_inodes_unused);
+	}
+}
+
+static void inode_lru_list_del(struct inode *inode)
+{
+	if (!list_empty(&inode->i_list)) {
+		list_del_init(&inode->i_list);
+		percpu_counter_dec(&nr_inodes_unused);
+	}
 }
 
 void end_writeback(struct inode *inode)
@@ -367,7 +378,7 @@ static void dispose_list(struct list_head *head)
 		struct inode *inode;
 
 		inode = list_first_entry(head, struct inode, i_list);
-		list_del(&inode->i_list);
+		list_del_init(&inode->i_list);
 
 		evict(inode);
 
@@ -413,7 +424,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 			list_move(&inode->i_list, dispose);
 			WARN_ON(inode->i_state & I_NEW);
 			inode->i_state |= I_FREEING;
-			percpu_counter_dec(&nr_inodes_unused);
+			if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+				percpu_counter_dec(&nr_inodes_unused);
 			continue;
 		}
 		busy = 1;
@@ -448,7 +460,7 @@ int invalidate_inodes(struct super_block *sb)
 
 static int can_unuse(struct inode *inode)
 {
-	if (inode->i_state)
+	if (inode->i_state & ~I_REFERENCED)
 		return 0;
 	if (inode_has_buffers(inode))
 		return 0;
@@ -460,17 +472,20 @@ static int can_unuse(struct inode *inode)
 }
 
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
+ * temporary list and then are freed outside inode_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. We expect the final iput() on that inode to add it to
- * the front of the inode_unused list. So look for it there and if the
- * inode is still freeable, proceed. The right inode is found 99.9% of the
- * time in testing on a 4-way.
+ * pagecache removed. If the inode has metadata buffers attached to
+ * mapping->private_list then try to remove them.
  *
- * If the inode has metadata buffers attached to mapping->private_list then
- * try to remove them.
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
  */
 static void prune_icache(int nr_to_scan)
 {
@@ -488,8 +503,21 @@ static void prune_icache(int nr_to_scan)
 
 		inode = list_entry(inode_unused.prev, struct inode, i_list);
 
-		if (inode->i_state || atomic_read(&inode->i_count)) {
+		/*
+		 * Referenced or dirty inodes are still in use. Give them
+		 * another pass through the LRU as we canot reclaim them now.
+		 */
+		if (atomic_read(&inode->i_count) ||
+		    (inode->i_state & ~I_REFERENCED)) {
+			list_del_init(&inode->i_list);
+			percpu_counter_dec(&nr_inodes_unused);
+			continue;
+		}
+
+		/* recently referenced inodes get one more pass */
+		if (inode->i_state & I_REFERENCED) {
 			list_move(&inode->i_list, &inode_unused);
+			inode->i_state &= ~I_REFERENCED;
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -620,7 +648,6 @@ static inline void
 __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
 			struct inode *inode)
 {
-	list_add(&inode->i_list, &inode_in_use);
 	list_add(&inode->i_sb_list, &sb->s_inodes);
 	if (head)
 		hlist_add_head(&inode->i_hash, head);
@@ -1237,10 +1264,11 @@ static void iput_final(struct inode *inode)
 	drop = generic_drop_inode(inode);
 
 	if (!drop) {
-		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-			list_move(&inode->i_list, &inode_unused);
-		percpu_counter_inc(&nr_inodes_unused);
 		if (sb->s_flags & MS_ACTIVE) {
+			inode->i_state |= I_REFERENCED;
+			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+				inode_lru_list_add(inode);
+			}
 			spin_unlock(&inode_lock);
 			return;
 		}
@@ -1251,13 +1279,19 @@ static void iput_final(struct inode *inode)
 		spin_lock(&inode_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		percpu_counter_dec(&nr_inodes_unused);
 		hlist_del_init(&inode->i_hash);
 	}
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
 	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
+
+	/*
+	 * After we delete the inode from the LRU here, we avoid moving dirty
+	 * inodes back onto the LRU now because I_FREEING is set and hence
+	 * writeback_single_inode() won't move the inode around.
+	 */
+	inode_lru_list_del(inode);
+
+	list_del_init(&inode->i_sb_list);
 	spin_unlock(&inode_lock);
 	evict(inode);
 	spin_lock(&inode_lock);
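The prune_icache() changes above amount to a second-chance scan over the lazy
LRU. As a rough illustration, the following self-contained sketch reuses the
same hypothetical struct obj model as the earlier sketch (not kernel code):
pinned or dirty objects found on the cold end are simply unlinked (the next
put re-adds them lazily), recently referenced ones are rotated back to the hot
end for one more pass, and everything else is reclaimed.

#include <stdbool.h>
#include <stdio.h>

/* Same hypothetical object model as the earlier sketch. */
struct obj {
        int refcount;
        bool referenced;        /* I_REFERENCED analogue */
        bool dirty;             /* stands in for I_DIRTY | I_SYNC */
        struct obj *prev, *next;
        int id;
};

static struct obj lru = { .prev = &lru, .next = &lru };

static void lru_add_front(struct obj *o)
{
        o->next = lru.next;
        o->prev = &lru;
        lru.next->prev = o;
        lru.next = o;
}

static void lru_del(struct obj *o)
{
        o->prev->next = o->next;
        o->next->prev = o->prev;
        o->next = o->prev = NULL;
}

/* Scan from the cold (tail) end, mirroring prune_icache()'s loop. */
static void prune(int nr_to_scan)
{
        while (nr_to_scan-- > 0 && lru.prev != &lru) {
                struct obj *o = lru.prev;

                if (o->refcount || o->dirty) {
                        /* Still in use: just unlink; a later put re-adds it. */
                        lru_del(o);
                        continue;
                }
                if (o->referenced) {
                        /* Second chance: clear the flag, rotate to the hot end. */
                        o->referenced = false;
                        lru_del(o);
                        lru_add_front(o);
                        continue;
                }
                lru_del(o);
                printf("reclaimed obj %d\n", o->id);
        }
}

int main(void)
{
        struct obj a = { .id = 1, .referenced = true };
        struct obj b = { .id = 2 };
        struct obj c = { .id = 3, .refcount = 1 };

        lru_add_front(&a);      /* a ends up coldest */
        lru_add_front(&b);
        lru_add_front(&c);

        prune(3);       /* rotates a, reclaims b, unlinks pinned c; a survives */
        return 0;
}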
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a3937a8ee95e..876275fc0638 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1641,16 +1641,17 @@ struct super_operations {
  *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
-#define I_DIRTY_SYNC		1
-#define I_DIRTY_DATASYNC	2
-#define I_DIRTY_PAGES		4
+#define I_DIRTY_SYNC		(1 << 0)
+#define I_DIRTY_DATASYNC	(1 << 1)
+#define I_DIRTY_PAGES		(1 << 2)
 #define __I_NEW			3
 #define I_NEW			(1 << __I_NEW)
-#define I_WILL_FREE		16
-#define I_FREEING		32
-#define I_CLEAR			64
+#define I_WILL_FREE		(1 << 4)
+#define I_FREEING		(1 << 5)
+#define I_CLEAR			(1 << 6)
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define I_REFERENCED		(1 << 8)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
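The switch to (1 << n) notation makes the i_state mask tests in the inode.c
hunks easier to read: can_unuse() and prune_icache() check
inode->i_state & ~I_REFERENCED, i.e. "any state bit other than the referenced
bit". A tiny standalone illustration of that idiom, with F_DIRTY and
F_REFERENCED as stand-in names rather than the kernel's flags:

#include <stdio.h>

/* Stand-in flag names (not the kernel's), mirroring the (1 << n) layout. */
#define F_DIRTY			(1 << 0)
#define F_REFERENCED		(1 << 8)

static const char *reclaimable(unsigned int state)
{
        /* can_unuse()-style test: any bit other than REFERENCED blocks it. */
        return (state & ~F_REFERENCED) ? "no" : "yes";
}

int main(void)
{
        unsigned int state = F_REFERENCED;

        printf("referenced only  -> reclaimable: %s\n", reclaimable(state));
        state |= F_DIRTY;
        printf("referenced+dirty -> reclaimable: %s\n", reclaimable(state));
        return 0;
}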
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 72a5d647a5f2..242b6f812ba6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -10,8 +10,6 @@
 struct backing_dev_info;
 
 extern spinlock_t inode_lock;
-extern struct list_head inode_in_use;
-extern struct list_head inode_unused;
 
 /*
  * fs/fs-writeback.c