diff options
-rw-r--r-- | include/linux/swap.h | 3 | ||||
-rw-r--r-- | include/linux/swapfile.h | 2 | ||||
-rw-r--r-- | mm/frontswap.c | 6 | ||||
-rw-r--r-- | mm/swapfile.c | 145 |
4 files changed, 97 insertions, 59 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8bb85d6d65f0..9155bcdcce12 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,7 +214,8 @@ struct percpu_cluster { | |||
214 | struct swap_info_struct { | 214 | struct swap_info_struct { |
215 | unsigned long flags; /* SWP_USED etc: see above */ | 215 | unsigned long flags; /* SWP_USED etc: see above */ |
216 | signed short prio; /* swap priority of this type */ | 216 | signed short prio; /* swap priority of this type */ |
217 | struct list_head list; /* entry in swap list */ | 217 | struct plist_node list; /* entry in swap_active_head */ |
218 | struct plist_node avail_list; /* entry in swap_avail_head */ | ||
218 | signed char type; /* strange name for an index */ | 219 | signed char type; /* strange name for an index */ |
219 | unsigned int max; /* extent of the swap_map */ | 220 | unsigned int max; /* extent of the swap_map */ |
220 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ | 221 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ |
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 2eab382d593d..388293a91e8c 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@ | |||
6 | * want to expose them to the dozens of source files that include swap.h | 6 | * want to expose them to the dozens of source files that include swap.h |
7 | */ | 7 | */ |
8 | extern spinlock_t swap_lock; | 8 | extern spinlock_t swap_lock; |
9 | extern struct list_head swap_list_head; | 9 | extern struct plist_head swap_active_head; |
10 | extern struct swap_info_struct *swap_info[]; | 10 | extern struct swap_info_struct *swap_info[]; |
11 | extern int try_to_unuse(unsigned int, bool, unsigned long); | 11 | extern int try_to_unuse(unsigned int, bool, unsigned long); |
12 | 12 | ||
diff --git a/mm/frontswap.c b/mm/frontswap.c
index fae11602e8a9..c30eec536f03 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pages(void) | |||
331 | struct swap_info_struct *si = NULL; | 331 | struct swap_info_struct *si = NULL; |
332 | 332 | ||
333 | assert_spin_locked(&swap_lock); | 333 | assert_spin_locked(&swap_lock); |
334 | list_for_each_entry(si, &swap_list_head, list) | 334 | plist_for_each_entry(si, &swap_active_head, list) |
335 | totalpages += atomic_read(&si->frontswap_pages); | 335 | totalpages += atomic_read(&si->frontswap_pages); |
336 | return totalpages; | 336 | return totalpages; |
337 | } | 337 | } |
@@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
346 | unsigned long pages = 0, pages_to_unuse = 0; | 346 | unsigned long pages = 0, pages_to_unuse = 0; |
347 | 347 | ||
348 | assert_spin_locked(&swap_lock); | 348 | assert_spin_locked(&swap_lock); |
349 | list_for_each_entry(si, &swap_list_head, list) { | 349 | plist_for_each_entry(si, &swap_active_head, list) { |
350 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | 350 | si_frontswap_pages = atomic_read(&si->frontswap_pages); |
351 | if (total_pages_to_unuse < si_frontswap_pages) { | 351 | if (total_pages_to_unuse < si_frontswap_pages) { |
352 | pages = pages_to_unuse = total_pages_to_unuse; | 352 | pages = pages_to_unuse = total_pages_to_unuse; |
@@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) | |||
408 | /* | 408 | /* |
409 | * we don't want to hold swap_lock while doing a very | 409 | * we don't want to hold swap_lock while doing a very |
410 | * lengthy try_to_unuse, but swap_list may change | 410 | * lengthy try_to_unuse, but swap_list may change |
411 | * so restart scan from swap_list_head each time | 411 | * so restart scan from swap_active_head each time |
412 | */ | 412 | */ |
413 | spin_lock(&swap_lock); | 413 | spin_lock(&swap_lock); |
414 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | 414 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); |
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c95a8c63b1a..beeeef8a1b2d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unused swap offset entry "; | |||
61 | * all active swap_info_structs | 61 | * all active swap_info_structs |
62 | * protected with swap_lock, and ordered by priority. | 62 | * protected with swap_lock, and ordered by priority. |
63 | */ | 63 | */ |
64 | LIST_HEAD(swap_list_head); | 64 | PLIST_HEAD(swap_active_head); |
65 | |||
66 | /* | ||
67 | * all available (active, not full) swap_info_structs | ||
68 | * protected with swap_avail_lock, ordered by priority. | ||
69 | * This is used by get_swap_page() instead of swap_active_head | ||
70 | * because swap_active_head includes all swap_info_structs, | ||
71 | * but get_swap_page() doesn't need to look at full ones. | ||
72 | * This uses its own lock instead of swap_lock because when a | ||
73 | * swap_info_struct changes between not-full/full, it needs to | ||
74 | * add/remove itself to/from this list, but the swap_info_struct->lock | ||
75 | * is held and the locking order requires swap_lock to be taken | ||
76 | * before any swap_info_struct->lock. | ||
77 | */ | ||
78 | static PLIST_HEAD(swap_avail_head); | ||
79 | static DEFINE_SPINLOCK(swap_avail_lock); | ||
65 | 80 | ||
66 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 81 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
67 | 82 | ||
@@ -594,6 +609,9 @@ checks: | |||
594 | if (si->inuse_pages == si->pages) { | 609 | if (si->inuse_pages == si->pages) { |
595 | si->lowest_bit = si->max; | 610 | si->lowest_bit = si->max; |
596 | si->highest_bit = 0; | 611 | si->highest_bit = 0; |
612 | spin_lock(&swap_avail_lock); | ||
613 | plist_del(&si->avail_list, &swap_avail_head); | ||
614 | spin_unlock(&swap_avail_lock); | ||
597 | } | 615 | } |
598 | si->swap_map[offset] = usage; | 616 | si->swap_map[offset] = usage; |
599 | inc_cluster_info_page(si, si->cluster_info, offset); | 617 | inc_cluster_info_page(si, si->cluster_info, offset); |
@@ -645,57 +663,63 @@ swp_entry_t get_swap_page(void) | |||
645 | { | 663 | { |
646 | struct swap_info_struct *si, *next; | 664 | struct swap_info_struct *si, *next; |
647 | pgoff_t offset; | 665 | pgoff_t offset; |
648 | struct list_head *tmp; | ||
649 | 666 | ||
650 | spin_lock(&swap_lock); | ||
651 | if (atomic_long_read(&nr_swap_pages) <= 0) | 667 | if (atomic_long_read(&nr_swap_pages) <= 0) |
652 | goto noswap; | 668 | goto noswap; |
653 | atomic_long_dec(&nr_swap_pages); | 669 | atomic_long_dec(&nr_swap_pages); |
654 | 670 | ||
655 | list_for_each(tmp, &swap_list_head) { | 671 | spin_lock(&swap_avail_lock); |
656 | si = list_entry(tmp, typeof(*si), list); | 672 | |
673 | start_over: | ||
674 | plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { | ||
675 | /* requeue si to after same-priority siblings */ | ||
676 | plist_requeue(&si->avail_list, &swap_avail_head); | ||
677 | spin_unlock(&swap_avail_lock); | ||
657 | spin_lock(&si->lock); | 678 | spin_lock(&si->lock); |
658 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { | 679 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { |
680 | spin_lock(&swap_avail_lock); | ||
681 | if (plist_node_empty(&si->avail_list)) { | ||
682 | spin_unlock(&si->lock); | ||
683 | goto nextsi; | ||
684 | } | ||
685 | WARN(!si->highest_bit, | ||
686 | "swap_info %d in list but !highest_bit\n", | ||
687 | si->type); | ||
688 | WARN(!(si->flags & SWP_WRITEOK), | ||
689 | "swap_info %d in list but !SWP_WRITEOK\n", | ||
690 | si->type); | ||
691 | plist_del(&si->avail_list, &swap_avail_head); | ||
659 | spin_unlock(&si->lock); | 692 | spin_unlock(&si->lock); |
660 | continue; | 693 | goto nextsi; |
661 | } | 694 | } |
662 | 695 | ||
663 | /* | ||
664 | * rotate the current swap_info that we're going to use | ||
665 | * to after any other swap_info that have the same prio, | ||
666 | * so that all equal-priority swap_info get used equally | ||
667 | */ | ||
668 | next = si; | ||
669 | list_for_each_entry_continue(next, &swap_list_head, list) { | ||
670 | if (si->prio != next->prio) | ||
671 | break; | ||
672 | list_rotate_left(&si->list); | ||
673 | next = si; | ||
674 | } | ||
675 | |||
676 | spin_unlock(&swap_lock); | ||
677 | /* This is called for allocating swap entry for cache */ | 696 | /* This is called for allocating swap entry for cache */ |
678 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 697 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
679 | spin_unlock(&si->lock); | 698 | spin_unlock(&si->lock); |
680 | if (offset) | 699 | if (offset) |
681 | return swp_entry(si->type, offset); | 700 | return swp_entry(si->type, offset); |
682 | spin_lock(&swap_lock); | 701 | pr_debug("scan_swap_map of si %d failed to find offset\n", |
702 | si->type); | ||
703 | spin_lock(&swap_avail_lock); | ||
704 | nextsi: | ||
683 | /* | 705 | /* |
684 | * if we got here, it's likely that si was almost full before, | 706 | * if we got here, it's likely that si was almost full before, |
685 | * and since scan_swap_map() can drop the si->lock, multiple | 707 | * and since scan_swap_map() can drop the si->lock, multiple |
686 | * callers probably all tried to get a page from the same si | 708 | * callers probably all tried to get a page from the same si |
687 | * and it filled up before we could get one. So we need to | 709 | * and it filled up before we could get one; or, the si filled |
688 | * try again. Since we dropped the swap_lock, there may now | 710 | * up between us dropping swap_avail_lock and taking si->lock. |
689 | * be non-full higher priority swap_infos, and this si may have | 711 | * Since we dropped the swap_avail_lock, the swap_avail_head |
690 | * even been removed from the list (although very unlikely). | 712 | * list may have been modified; so if next is still in the |
691 | * Let's start over. | 713 | * swap_avail_head list then try it, otherwise start over. |
692 | */ | 714 | */ |
693 | tmp = &swap_list_head; | 715 | if (plist_node_empty(&next->avail_list)) |
716 | goto start_over; | ||
694 | } | 717 | } |
695 | 718 | ||
719 | spin_unlock(&swap_avail_lock); | ||
720 | |||
696 | atomic_long_inc(&nr_swap_pages); | 721 | atomic_long_inc(&nr_swap_pages); |
697 | noswap: | 722 | noswap: |
698 | spin_unlock(&swap_lock); | ||
699 | return (swp_entry_t) {0}; | 723 | return (swp_entry_t) {0}; |
700 | } | 724 | } |
701 | 725 | ||
@@ -798,8 +822,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
798 | dec_cluster_info_page(p, p->cluster_info, offset); | 822 | dec_cluster_info_page(p, p->cluster_info, offset); |
799 | if (offset < p->lowest_bit) | 823 | if (offset < p->lowest_bit) |
800 | p->lowest_bit = offset; | 824 | p->lowest_bit = offset; |
801 | if (offset > p->highest_bit) | 825 | if (offset > p->highest_bit) { |
826 | bool was_full = !p->highest_bit; | ||
802 | p->highest_bit = offset; | 827 | p->highest_bit = offset; |
828 | if (was_full && (p->flags & SWP_WRITEOK)) { | ||
829 | spin_lock(&swap_avail_lock); | ||
830 | WARN_ON(!plist_node_empty(&p->avail_list)); | ||
831 | if (plist_node_empty(&p->avail_list)) | ||
832 | plist_add(&p->avail_list, | ||
833 | &swap_avail_head); | ||
834 | spin_unlock(&swap_avail_lock); | ||
835 | } | ||
836 | } | ||
803 | atomic_long_inc(&nr_swap_pages); | 837 | atomic_long_inc(&nr_swap_pages); |
804 | p->inuse_pages--; | 838 | p->inuse_pages--; |
805 | frontswap_invalidate_page(p->type, offset); | 839 | frontswap_invalidate_page(p->type, offset); |
@@ -1734,12 +1768,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1734 | unsigned char *swap_map, | 1768 | unsigned char *swap_map, |
1735 | struct swap_cluster_info *cluster_info) | 1769 | struct swap_cluster_info *cluster_info) |
1736 | { | 1770 | { |
1737 | struct swap_info_struct *si; | ||
1738 | |||
1739 | if (prio >= 0) | 1771 | if (prio >= 0) |
1740 | p->prio = prio; | 1772 | p->prio = prio; |
1741 | else | 1773 | else |
1742 | p->prio = --least_priority; | 1774 | p->prio = --least_priority; |
1775 | /* | ||
1776 | * the plist prio is negated because plist ordering is | ||
1777 | * low-to-high, while swap ordering is high-to-low | ||
1778 | */ | ||
1779 | p->list.prio = -p->prio; | ||
1780 | p->avail_list.prio = -p->prio; | ||
1743 | p->swap_map = swap_map; | 1781 | p->swap_map = swap_map; |
1744 | p->cluster_info = cluster_info; | 1782 | p->cluster_info = cluster_info; |
1745 | p->flags |= SWP_WRITEOK; | 1783 | p->flags |= SWP_WRITEOK; |
@@ -1747,27 +1785,20 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1747 | total_swap_pages += p->pages; | 1785 | total_swap_pages += p->pages; |
1748 | 1786 | ||
1749 | assert_spin_locked(&swap_lock); | 1787 | assert_spin_locked(&swap_lock); |
1750 | BUG_ON(!list_empty(&p->list)); | ||
1751 | /* | ||
1752 | * insert into swap list; the list is in priority order, | ||
1753 | * so that get_swap_page() can get a page from the highest | ||
1754 | * priority swap_info_struct with available page(s), and | ||
1755 | * swapoff can adjust the auto-assigned (i.e. negative) prio | ||
1756 | * values for any lower-priority swap_info_structs when | ||
1757 | * removing a negative-prio swap_info_struct | ||
1758 | */ | ||
1759 | list_for_each_entry(si, &swap_list_head, list) { | ||
1760 | if (p->prio >= si->prio) { | ||
1761 | list_add_tail(&p->list, &si->list); | ||
1762 | return; | ||
1763 | } | ||
1764 | } | ||
1765 | /* | 1788 | /* |
1766 | * this covers two cases: | 1789 | * both lists are plists, and thus priority ordered. |
1767 | * 1) p->prio is less than all existing prio | 1790 | * swap_active_head needs to be priority ordered for swapoff(), |
1768 | * 2) the swap list is empty | 1791 | * which on removal of any swap_info_struct with an auto-assigned |
1792 | * (i.e. negative) priority increments the auto-assigned priority | ||
1793 | * of any lower-priority swap_info_structs. | ||
1794 | * swap_avail_head needs to be priority ordered for get_swap_page(), | ||
1795 | * which allocates swap pages from the highest available priority | ||
1796 | * swap_info_struct. | ||
1769 | */ | 1797 | */ |
1770 | list_add_tail(&p->list, &swap_list_head); | 1798 | plist_add(&p->list, &swap_active_head); |
1799 | spin_lock(&swap_avail_lock); | ||
1800 | plist_add(&p->avail_list, &swap_avail_head); | ||
1801 | spin_unlock(&swap_avail_lock); | ||
1771 | } | 1802 | } |
1772 | 1803 | ||
1773 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1804 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
@@ -1821,7 +1852,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1821 | 1852 | ||
1822 | mapping = victim->f_mapping; | 1853 | mapping = victim->f_mapping; |
1823 | spin_lock(&swap_lock); | 1854 | spin_lock(&swap_lock); |
1824 | list_for_each_entry(p, &swap_list_head, list) { | 1855 | plist_for_each_entry(p, &swap_active_head, list) { |
1825 | if (p->flags & SWP_WRITEOK) { | 1856 | if (p->flags & SWP_WRITEOK) { |
1826 | if (p->swap_file->f_mapping == mapping) { | 1857 | if (p->swap_file->f_mapping == mapping) { |
1827 | found = 1; | 1858 | found = 1; |
@@ -1841,16 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1841 | spin_unlock(&swap_lock); | 1872 | spin_unlock(&swap_lock); |
1842 | goto out_dput; | 1873 | goto out_dput; |
1843 | } | 1874 | } |
1875 | spin_lock(&swap_avail_lock); | ||
1876 | plist_del(&p->avail_list, &swap_avail_head); | ||
1877 | spin_unlock(&swap_avail_lock); | ||
1844 | spin_lock(&p->lock); | 1878 | spin_lock(&p->lock); |
1845 | if (p->prio < 0) { | 1879 | if (p->prio < 0) { |
1846 | struct swap_info_struct *si = p; | 1880 | struct swap_info_struct *si = p; |
1847 | 1881 | ||
1848 | list_for_each_entry_continue(si, &swap_list_head, list) { | 1882 | plist_for_each_entry_continue(si, &swap_active_head, list) { |
1849 | si->prio++; | 1883 | si->prio++; |
1884 | si->list.prio--; | ||
1885 | si->avail_list.prio--; | ||
1850 | } | 1886 | } |
1851 | least_priority++; | 1887 | least_priority++; |
1852 | } | 1888 | } |
1853 | list_del_init(&p->list); | 1889 | plist_del(&p->list, &swap_active_head); |
1854 | atomic_long_sub(p->pages, &nr_swap_pages); | 1890 | atomic_long_sub(p->pages, &nr_swap_pages); |
1855 | total_swap_pages -= p->pages; | 1891 | total_swap_pages -= p->pages; |
1856 | p->flags &= ~SWP_WRITEOK; | 1892 | p->flags &= ~SWP_WRITEOK; |
@@ -2115,7 +2151,8 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2115 | */ | 2151 | */ |
2116 | } | 2152 | } |
2117 | INIT_LIST_HEAD(&p->first_swap_extent.list); | 2153 | INIT_LIST_HEAD(&p->first_swap_extent.list); |
2118 | INIT_LIST_HEAD(&p->list); | 2154 | plist_node_init(&p->list, 0); |
2155 | plist_node_init(&p->avail_list, 0); | ||
2119 | p->flags = SWP_USED; | 2156 | p->flags = SWP_USED; |
2120 | spin_unlock(&swap_lock); | 2157 | spin_unlock(&swap_lock); |
2121 | spin_lock_init(&p->lock); | 2158 | spin_lock_init(&p->lock); |