diff options
author | Dave Chinner <dchinner@redhat.com> | 2011-03-22 07:23:38 -0400 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2011-03-24 21:16:31 -0400 |
commit | 02afc410f363f98ac4f186341e38dcec13fc0e60 (patch) | |
tree | 2c5d5b2f1556806da135f2323b4df4d7d72d3734 /fs | |
parent | b2b2af8e614b4dcd8aca1369d82ce5ad0461a7b1 (diff) |
fs: Lock the inode LRU list separately
Introduce the inode_lru_lock to protect the inode_lru list. This
lock is nested inside the inode->i_lock to allow the inode to be
added to the LRU list in iput_final without needing to deal with
lock inversions. This keeps iput_final() clean and neat.
Further, when marking the inode I_FREEING and removing it from the
LRU, move the LRU list manipulation within the inode->i_lock to keep
the list manipulation consistent with iput_final. This also means
that most of the open coded LRU list removal + unused inode
accounting can now use the inode_lru_list_del() wrappers, which
cleans the code up further.
However, this locking change means that the LRU traversal in
prune_icache() inverts this lock ordering and needs to use trylock
semantics on the inode->i_lock to avoid deadlocking. In these cases,
if we fail to lock the inode we move it to the back of the LRU to
prevent spinning on it.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/inode.c | 39 |
1 files changed, 30 insertions, 9 deletions
diff --git a/fs/inode.c b/fs/inode.c index f752a959254b..b19cb6ee6ca3 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -32,10 +32,13 @@ | |||
32 | * | 32 | * |
33 | * inode->i_lock protects: | 33 | * inode->i_lock protects: |
34 | * inode->i_state, inode->i_hash, __iget() | 34 | * inode->i_state, inode->i_hash, __iget() |
35 | * inode_lru_lock protects: | ||
36 | * inode_lru, inode->i_lru | ||
35 | * | 37 | * |
36 | * Lock ordering: | 38 | * Lock ordering: |
37 | * inode_lock | 39 | * inode_lock |
38 | * inode->i_lock | 40 | * inode->i_lock |
41 | * inode_lru_lock | ||
39 | */ | 42 | */ |
40 | 43 | ||
41 | /* | 44 | /* |
@@ -85,6 +88,7 @@ static unsigned int i_hash_shift __read_mostly; | |||
85 | */ | 88 | */ |
86 | 89 | ||
87 | static LIST_HEAD(inode_lru); | 90 | static LIST_HEAD(inode_lru); |
91 | static DEFINE_SPINLOCK(inode_lru_lock); | ||
88 | static struct hlist_head *inode_hashtable __read_mostly; | 92 | static struct hlist_head *inode_hashtable __read_mostly; |
89 | 93 | ||
90 | /* | 94 | /* |
@@ -356,18 +360,22 @@ EXPORT_SYMBOL(ihold); | |||
356 | 360 | ||
357 | static void inode_lru_list_add(struct inode *inode) | 361 | static void inode_lru_list_add(struct inode *inode) |
358 | { | 362 | { |
363 | spin_lock(&inode_lru_lock); | ||
359 | if (list_empty(&inode->i_lru)) { | 364 | if (list_empty(&inode->i_lru)) { |
360 | list_add(&inode->i_lru, &inode_lru); | 365 | list_add(&inode->i_lru, &inode_lru); |
361 | inodes_stat.nr_unused++; | 366 | inodes_stat.nr_unused++; |
362 | } | 367 | } |
368 | spin_unlock(&inode_lru_lock); | ||
363 | } | 369 | } |
364 | 370 | ||
365 | static void inode_lru_list_del(struct inode *inode) | 371 | static void inode_lru_list_del(struct inode *inode) |
366 | { | 372 | { |
373 | spin_lock(&inode_lru_lock); | ||
367 | if (!list_empty(&inode->i_lru)) { | 374 | if (!list_empty(&inode->i_lru)) { |
368 | list_del_init(&inode->i_lru); | 375 | list_del_init(&inode->i_lru); |
369 | inodes_stat.nr_unused--; | 376 | inodes_stat.nr_unused--; |
370 | } | 377 | } |
378 | spin_unlock(&inode_lru_lock); | ||
371 | } | 379 | } |
372 | 380 | ||
373 | static inline void __inode_sb_list_add(struct inode *inode) | 381 | static inline void __inode_sb_list_add(struct inode *inode) |
@@ -543,10 +551,9 @@ void evict_inodes(struct super_block *sb) | |||
543 | } | 551 | } |
544 | 552 | ||
545 | inode->i_state |= I_FREEING; | 553 | inode->i_state |= I_FREEING; |
546 | if (!(inode->i_state & (I_DIRTY | I_SYNC))) | 554 | inode_lru_list_del(inode); |
547 | inodes_stat.nr_unused--; | ||
548 | spin_unlock(&inode->i_lock); | 555 | spin_unlock(&inode->i_lock); |
549 | list_move(&inode->i_lru, &dispose); | 556 | list_add(&inode->i_lru, &dispose); |
550 | } | 557 | } |
551 | spin_unlock(&inode_lock); | 558 | spin_unlock(&inode_lock); |
552 | 559 | ||
@@ -596,10 +603,9 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) | |||
596 | } | 603 | } |
597 | 604 | ||
598 | inode->i_state |= I_FREEING; | 605 | inode->i_state |= I_FREEING; |
599 | if (!(inode->i_state & (I_DIRTY | I_SYNC))) | 606 | inode_lru_list_del(inode); |
600 | inodes_stat.nr_unused--; | ||
601 | spin_unlock(&inode->i_lock); | 607 | spin_unlock(&inode->i_lock); |
602 | list_move(&inode->i_lru, &dispose); | 608 | list_add(&inode->i_lru, &dispose); |
603 | } | 609 | } |
604 | spin_unlock(&inode_lock); | 610 | spin_unlock(&inode_lock); |
605 | 611 | ||
@@ -623,7 +629,7 @@ static int can_unuse(struct inode *inode) | |||
623 | 629 | ||
624 | /* | 630 | /* |
625 | * Scan `goal' inodes on the unused list for freeable ones. They are moved to a | 631 | * Scan `goal' inodes on the unused list for freeable ones. They are moved to a |
626 | * temporary list and then are freed outside inode_lock by dispose_list(). | 632 | * temporary list and then are freed outside inode_lru_lock by dispose_list(). |
627 | * | 633 | * |
628 | * Any inodes which are pinned purely because of attached pagecache have their | 634 | * Any inodes which are pinned purely because of attached pagecache have their |
629 | * pagecache removed. If the inode has metadata buffers attached to | 635 | * pagecache removed. If the inode has metadata buffers attached to |
@@ -645,6 +651,7 @@ static void prune_icache(int nr_to_scan) | |||
645 | 651 | ||
646 | down_read(&iprune_sem); | 652 | down_read(&iprune_sem); |
647 | spin_lock(&inode_lock); | 653 | spin_lock(&inode_lock); |
654 | spin_lock(&inode_lru_lock); | ||
648 | for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { | 655 | for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { |
649 | struct inode *inode; | 656 | struct inode *inode; |
650 | 657 | ||
@@ -654,10 +661,19 @@ static void prune_icache(int nr_to_scan) | |||
654 | inode = list_entry(inode_lru.prev, struct inode, i_lru); | 661 | inode = list_entry(inode_lru.prev, struct inode, i_lru); |
655 | 662 | ||
656 | /* | 663 | /* |
664 | * we are inverting the inode_lru_lock/inode->i_lock here, | ||
665 | * so use a trylock. If we fail to get the lock, just move the | ||
666 | * inode to the back of the list so we don't spin on it. | ||
667 | */ | ||
668 | if (!spin_trylock(&inode->i_lock)) { | ||
669 | list_move(&inode->i_lru, &inode_lru); | ||
670 | continue; | ||
671 | } | ||
672 | |||
673 | /* | ||
657 | * Referenced or dirty inodes are still in use. Give them | 674 | * Referenced or dirty inodes are still in use. Give them |
658 | * another pass through the LRU as we canot reclaim them now. | 675 | * another pass through the LRU as we canot reclaim them now. |
659 | */ | 676 | */ |
660 | spin_lock(&inode->i_lock); | ||
661 | if (atomic_read(&inode->i_count) || | 677 | if (atomic_read(&inode->i_count) || |
662 | (inode->i_state & ~I_REFERENCED)) { | 678 | (inode->i_state & ~I_REFERENCED)) { |
663 | spin_unlock(&inode->i_lock); | 679 | spin_unlock(&inode->i_lock); |
@@ -676,17 +692,21 @@ static void prune_icache(int nr_to_scan) | |||
676 | if (inode_has_buffers(inode) || inode->i_data.nrpages) { | 692 | if (inode_has_buffers(inode) || inode->i_data.nrpages) { |
677 | __iget(inode); | 693 | __iget(inode); |
678 | spin_unlock(&inode->i_lock); | 694 | spin_unlock(&inode->i_lock); |
695 | spin_unlock(&inode_lru_lock); | ||
679 | spin_unlock(&inode_lock); | 696 | spin_unlock(&inode_lock); |
680 | if (remove_inode_buffers(inode)) | 697 | if (remove_inode_buffers(inode)) |
681 | reap += invalidate_mapping_pages(&inode->i_data, | 698 | reap += invalidate_mapping_pages(&inode->i_data, |
682 | 0, -1); | 699 | 0, -1); |
683 | iput(inode); | 700 | iput(inode); |
684 | spin_lock(&inode_lock); | 701 | spin_lock(&inode_lock); |
702 | spin_lock(&inode_lru_lock); | ||
685 | 703 | ||
686 | if (inode != list_entry(inode_lru.next, | 704 | if (inode != list_entry(inode_lru.next, |
687 | struct inode, i_lru)) | 705 | struct inode, i_lru)) |
688 | continue; /* wrong inode or list_empty */ | 706 | continue; /* wrong inode or list_empty */ |
689 | spin_lock(&inode->i_lock); | 707 | /* avoid lock inversions with trylock */ |
708 | if (!spin_trylock(&inode->i_lock)) | ||
709 | continue; | ||
690 | if (!can_unuse(inode)) { | 710 | if (!can_unuse(inode)) { |
691 | spin_unlock(&inode->i_lock); | 711 | spin_unlock(&inode->i_lock); |
692 | continue; | 712 | continue; |
@@ -703,6 +723,7 @@ static void prune_icache(int nr_to_scan) | |||
703 | __count_vm_events(KSWAPD_INODESTEAL, reap); | 723 | __count_vm_events(KSWAPD_INODESTEAL, reap); |
704 | else | 724 | else |
705 | __count_vm_events(PGINODESTEAL, reap); | 725 | __count_vm_events(PGINODESTEAL, reap); |
726 | spin_unlock(&inode_lru_lock); | ||
706 | spin_unlock(&inode_lock); | 727 | spin_unlock(&inode_lock); |
707 | 728 | ||
708 | dispose_list(&freeable); | 729 | dispose_list(&freeable); |