diff options
author | Dan Streetman <ddstreet@ieee.org> | 2014-06-04 19:09:53 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-04 19:54:07 -0400 |
commit | adfab836f4908deb049a5128082719e689eed964 (patch) | |
tree | 345475c5282e96791fe3bce8f0287674fa9acc1d /mm/swapfile.c | |
parent | 7ee07a44eb53374a73544ae14c71366a02d462e0 (diff) |
swap: change swap_info singly-linked list to list_head
The logic controlling the singly-linked list of swap_info_struct entries
for all active, i.e. swapon'ed, swap targets is rather complex, because:
- it stores the entries in priority order
- there is a pointer to the highest priority entry
- there is a pointer to the highest priority not-full entry
- there is a highest_priority_index variable set outside the swap_lock
- swap entries of equal priority should be used equally
this complexity leads to bugs such as: https://lkml.org/lkml/2014/2/13/181
where different priority swap targets are incorrectly used equally.
That bug probably could be solved with the existing singly-linked lists,
but I think it would only add more complexity to the already difficult to
understand get_swap_page() swap_list iteration logic.
The first patch changes from a singly-linked list to a doubly-linked list
using list_heads; the highest_priority_index and related code are removed
and get_swap_page() starts each iteration at the highest priority
swap_info entry, even if it's full. While this does introduce unnecessary
list iteration (i.e. Schlemiel the painter's algorithm) in the case where
one or more of the highest priority entries are full, the iteration and
manipulation code is much simpler and behaves correctly re: the above bug;
and the fourth patch removes the unnecessary iteration.
The second patch adds some minor plist helper functions; nothing new
really, just functions to match existing regular list functions. These
are used by the next two patches.
The third patch adds plist_requeue(), which is used by get_swap_page() in
the next patch - it performs the requeueing of same-priority entries
(which moves the entry to the end of its priority in the plist), so that
all equal-priority swap_info_structs get used equally.
The fourth patch converts the main list into a plist, and adds a new plist
that contains only swap_info entries that are both active and not full.
As Mel suggested using plists allows removing all the ordering code from
swap - plists handle ordering automatically. The list naming is also
clarified now that there are two lists, with the original list changed
from swap_list_head to swap_active_head and the new list named
swap_avail_head. A new spinlock is also added for the new list, so
swap_info entries can be added or removed from the new list immediately as
they become full or not full.
This patch (of 4):
Replace the singly-linked list tracking active, i.e. swapon'ed,
swap_info_struct entries with a doubly-linked list using struct
list_heads. Simplify the logic iterating and manipulating the list of
entries, especially get_swap_page(), by using standard list_head
functions, and removing the highest priority iteration logic.
The change fixes the bug:
https://lkml.org/lkml/2014/2/13/181
in which different priority swap entries after the highest priority entry
are incorrectly used equally in pairs. The swap behavior is now as
advertised, i.e. different priority swap entries are used in order, and
equal priority swap targets are used concurrently.
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Cc: Weijie Yang <weijieut@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r-- | mm/swapfile.c | 171 |
1 files changed, 72 insertions, 99 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c index 4a7f7e6992b6..6c95a8c63b1a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages; | |||
51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | 51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ |
52 | long total_swap_pages; | 52 | long total_swap_pages; |
53 | static int least_priority; | 53 | static int least_priority; |
54 | static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
55 | 54 | ||
56 | static const char Bad_file[] = "Bad swap file entry "; | 55 | static const char Bad_file[] = "Bad swap file entry "; |
57 | static const char Unused_file[] = "Unused swap file entry "; | 56 | static const char Unused_file[] = "Unused swap file entry "; |
58 | static const char Bad_offset[] = "Bad swap offset entry "; | 57 | static const char Bad_offset[] = "Bad swap offset entry "; |
59 | static const char Unused_offset[] = "Unused swap offset entry "; | 58 | static const char Unused_offset[] = "Unused swap offset entry "; |
60 | 59 | ||
61 | struct swap_list_t swap_list = {-1, -1}; | 60 | /* |
61 | * all active swap_info_structs | ||
62 | * protected with swap_lock, and ordered by priority. | ||
63 | */ | ||
64 | LIST_HEAD(swap_list_head); | ||
62 | 65 | ||
63 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 66 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
64 | 67 | ||
@@ -640,66 +643,54 @@ no_page: | |||
640 | 643 | ||
641 | swp_entry_t get_swap_page(void) | 644 | swp_entry_t get_swap_page(void) |
642 | { | 645 | { |
643 | struct swap_info_struct *si; | 646 | struct swap_info_struct *si, *next; |
644 | pgoff_t offset; | 647 | pgoff_t offset; |
645 | int type, next; | 648 | struct list_head *tmp; |
646 | int wrapped = 0; | ||
647 | int hp_index; | ||
648 | 649 | ||
649 | spin_lock(&swap_lock); | 650 | spin_lock(&swap_lock); |
650 | if (atomic_long_read(&nr_swap_pages) <= 0) | 651 | if (atomic_long_read(&nr_swap_pages) <= 0) |
651 | goto noswap; | 652 | goto noswap; |
652 | atomic_long_dec(&nr_swap_pages); | 653 | atomic_long_dec(&nr_swap_pages); |
653 | 654 | ||
654 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 655 | list_for_each(tmp, &swap_list_head) { |
655 | hp_index = atomic_xchg(&highest_priority_index, -1); | 656 | si = list_entry(tmp, typeof(*si), list); |
656 | /* | ||
657 | * highest_priority_index records current highest priority swap | ||
658 | * type which just frees swap entries. If its priority is | ||
659 | * higher than that of swap_list.next swap type, we use it. It | ||
660 | * isn't protected by swap_lock, so it can be an invalid value | ||
661 | * if the corresponding swap type is swapoff. We double check | ||
662 | * the flags here. It's even possible the swap type is swapoff | ||
663 | * and swapon again and its priority is changed. In such rare | ||
664 | * case, low prority swap type might be used, but eventually | ||
665 | * high priority swap will be used after several rounds of | ||
666 | * swap. | ||
667 | */ | ||
668 | if (hp_index != -1 && hp_index != type && | ||
669 | swap_info[type]->prio < swap_info[hp_index]->prio && | ||
670 | (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
671 | type = hp_index; | ||
672 | swap_list.next = type; | ||
673 | } | ||
674 | |||
675 | si = swap_info[type]; | ||
676 | next = si->next; | ||
677 | if (next < 0 || | ||
678 | (!wrapped && si->prio != swap_info[next]->prio)) { | ||
679 | next = swap_list.head; | ||
680 | wrapped++; | ||
681 | } | ||
682 | |||
683 | spin_lock(&si->lock); | 657 | spin_lock(&si->lock); |
684 | if (!si->highest_bit) { | 658 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { |
685 | spin_unlock(&si->lock); | ||
686 | continue; | ||
687 | } | ||
688 | if (!(si->flags & SWP_WRITEOK)) { | ||
689 | spin_unlock(&si->lock); | 659 | spin_unlock(&si->lock); |
690 | continue; | 660 | continue; |
691 | } | 661 | } |
692 | 662 | ||
693 | swap_list.next = next; | 663 | /* |
664 | * rotate the current swap_info that we're going to use | ||
665 | * to after any other swap_info that have the same prio, | ||
666 | * so that all equal-priority swap_info get used equally | ||
667 | */ | ||
668 | next = si; | ||
669 | list_for_each_entry_continue(next, &swap_list_head, list) { | ||
670 | if (si->prio != next->prio) | ||
671 | break; | ||
672 | list_rotate_left(&si->list); | ||
673 | next = si; | ||
674 | } | ||
694 | 675 | ||
695 | spin_unlock(&swap_lock); | 676 | spin_unlock(&swap_lock); |
696 | /* This is called for allocating swap entry for cache */ | 677 | /* This is called for allocating swap entry for cache */ |
697 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 678 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
698 | spin_unlock(&si->lock); | 679 | spin_unlock(&si->lock); |
699 | if (offset) | 680 | if (offset) |
700 | return swp_entry(type, offset); | 681 | return swp_entry(si->type, offset); |
701 | spin_lock(&swap_lock); | 682 | spin_lock(&swap_lock); |
702 | next = swap_list.next; | 683 | /* |
684 | * if we got here, it's likely that si was almost full before, | ||
685 | * and since scan_swap_map() can drop the si->lock, multiple | ||
686 | * callers probably all tried to get a page from the same si | ||
687 | * and it filled up before we could get one. So we need to | ||
688 | * try again. Since we dropped the swap_lock, there may now | ||
689 | * be non-full higher priority swap_infos, and this si may have | ||
690 | * even been removed from the list (although very unlikely). | ||
691 | * Let's start over. | ||
692 | */ | ||
693 | tmp = &swap_list_head; | ||
703 | } | 694 | } |
704 | 695 | ||
705 | atomic_long_inc(&nr_swap_pages); | 696 | atomic_long_inc(&nr_swap_pages); |
@@ -766,27 +757,6 @@ out: | |||
766 | return NULL; | 757 | return NULL; |
767 | } | 758 | } |
768 | 759 | ||
769 | /* | ||
770 | * This swap type frees swap entry, check if it is the highest priority swap | ||
771 | * type which just frees swap entry. get_swap_page() uses | ||
772 | * highest_priority_index to search highest priority swap type. The | ||
773 | * swap_info_struct.lock can't protect us if there are multiple swap types | ||
774 | * active, so we use atomic_cmpxchg. | ||
775 | */ | ||
776 | static void set_highest_priority_index(int type) | ||
777 | { | ||
778 | int old_hp_index, new_hp_index; | ||
779 | |||
780 | do { | ||
781 | old_hp_index = atomic_read(&highest_priority_index); | ||
782 | if (old_hp_index != -1 && | ||
783 | swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
784 | break; | ||
785 | new_hp_index = type; | ||
786 | } while (atomic_cmpxchg(&highest_priority_index, | ||
787 | old_hp_index, new_hp_index) != old_hp_index); | ||
788 | } | ||
789 | |||
790 | static unsigned char swap_entry_free(struct swap_info_struct *p, | 760 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
791 | swp_entry_t entry, unsigned char usage) | 761 | swp_entry_t entry, unsigned char usage) |
792 | { | 762 | { |
@@ -830,7 +800,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
830 | p->lowest_bit = offset; | 800 | p->lowest_bit = offset; |
831 | if (offset > p->highest_bit) | 801 | if (offset > p->highest_bit) |
832 | p->highest_bit = offset; | 802 | p->highest_bit = offset; |
833 | set_highest_priority_index(p->type); | ||
834 | atomic_long_inc(&nr_swap_pages); | 803 | atomic_long_inc(&nr_swap_pages); |
835 | p->inuse_pages--; | 804 | p->inuse_pages--; |
836 | frontswap_invalidate_page(p->type, offset); | 805 | frontswap_invalidate_page(p->type, offset); |
@@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1765 | unsigned char *swap_map, | 1734 | unsigned char *swap_map, |
1766 | struct swap_cluster_info *cluster_info) | 1735 | struct swap_cluster_info *cluster_info) |
1767 | { | 1736 | { |
1768 | int i, prev; | 1737 | struct swap_info_struct *si; |
1769 | 1738 | ||
1770 | if (prio >= 0) | 1739 | if (prio >= 0) |
1771 | p->prio = prio; | 1740 | p->prio = prio; |
@@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1777 | atomic_long_add(p->pages, &nr_swap_pages); | 1746 | atomic_long_add(p->pages, &nr_swap_pages); |
1778 | total_swap_pages += p->pages; | 1747 | total_swap_pages += p->pages; |
1779 | 1748 | ||
1780 | /* insert swap space into swap_list: */ | 1749 | assert_spin_locked(&swap_lock); |
1781 | prev = -1; | 1750 | BUG_ON(!list_empty(&p->list)); |
1782 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { | 1751 | /* |
1783 | if (p->prio >= swap_info[i]->prio) | 1752 | * insert into swap list; the list is in priority order, |
1784 | break; | 1753 | * so that get_swap_page() can get a page from the highest |
1785 | prev = i; | 1754 | * priority swap_info_struct with available page(s), and |
1755 | * swapoff can adjust the auto-assigned (i.e. negative) prio | ||
1756 | * values for any lower-priority swap_info_structs when | ||
1757 | * removing a negative-prio swap_info_struct | ||
1758 | */ | ||
1759 | list_for_each_entry(si, &swap_list_head, list) { | ||
1760 | if (p->prio >= si->prio) { | ||
1761 | list_add_tail(&p->list, &si->list); | ||
1762 | return; | ||
1763 | } | ||
1786 | } | 1764 | } |
1787 | p->next = i; | 1765 | /* |
1788 | if (prev < 0) | 1766 | * this covers two cases: |
1789 | swap_list.head = swap_list.next = p->type; | 1767 | * 1) p->prio is less than all existing prio |
1790 | else | 1768 | * 2) the swap list is empty |
1791 | swap_info[prev]->next = p->type; | 1769 | */ |
1770 | list_add_tail(&p->list, &swap_list_head); | ||
1792 | } | 1771 | } |
1793 | 1772 | ||
1794 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1773 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
@@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1823 | struct address_space *mapping; | 1802 | struct address_space *mapping; |
1824 | struct inode *inode; | 1803 | struct inode *inode; |
1825 | struct filename *pathname; | 1804 | struct filename *pathname; |
1826 | int i, type, prev; | 1805 | int err, found = 0; |
1827 | int err; | ||
1828 | unsigned int old_block_size; | 1806 | unsigned int old_block_size; |
1829 | 1807 | ||
1830 | if (!capable(CAP_SYS_ADMIN)) | 1808 | if (!capable(CAP_SYS_ADMIN)) |
@@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1842 | goto out; | 1820 | goto out; |
1843 | 1821 | ||
1844 | mapping = victim->f_mapping; | 1822 | mapping = victim->f_mapping; |
1845 | prev = -1; | ||
1846 | spin_lock(&swap_lock); | 1823 | spin_lock(&swap_lock); |
1847 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { | 1824 | list_for_each_entry(p, &swap_list_head, list) { |
1848 | p = swap_info[type]; | ||
1849 | if (p->flags & SWP_WRITEOK) { | 1825 | if (p->flags & SWP_WRITEOK) { |
1850 | if (p->swap_file->f_mapping == mapping) | 1826 | if (p->swap_file->f_mapping == mapping) { |
1827 | found = 1; | ||
1851 | break; | 1828 | break; |
1829 | } | ||
1852 | } | 1830 | } |
1853 | prev = type; | ||
1854 | } | 1831 | } |
1855 | if (type < 0) { | 1832 | if (!found) { |
1856 | err = -EINVAL; | 1833 | err = -EINVAL; |
1857 | spin_unlock(&swap_lock); | 1834 | spin_unlock(&swap_lock); |
1858 | goto out_dput; | 1835 | goto out_dput; |
@@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1864 | spin_unlock(&swap_lock); | 1841 | spin_unlock(&swap_lock); |
1865 | goto out_dput; | 1842 | goto out_dput; |
1866 | } | 1843 | } |
1867 | if (prev < 0) | ||
1868 | swap_list.head = p->next; | ||
1869 | else | ||
1870 | swap_info[prev]->next = p->next; | ||
1871 | if (type == swap_list.next) { | ||
1872 | /* just pick something that's safe... */ | ||
1873 | swap_list.next = swap_list.head; | ||
1874 | } | ||
1875 | spin_lock(&p->lock); | 1844 | spin_lock(&p->lock); |
1876 | if (p->prio < 0) { | 1845 | if (p->prio < 0) { |
1877 | for (i = p->next; i >= 0; i = swap_info[i]->next) | 1846 | struct swap_info_struct *si = p; |
1878 | swap_info[i]->prio = p->prio--; | 1847 | |
1848 | list_for_each_entry_continue(si, &swap_list_head, list) { | ||
1849 | si->prio++; | ||
1850 | } | ||
1879 | least_priority++; | 1851 | least_priority++; |
1880 | } | 1852 | } |
1853 | list_del_init(&p->list); | ||
1881 | atomic_long_sub(p->pages, &nr_swap_pages); | 1854 | atomic_long_sub(p->pages, &nr_swap_pages); |
1882 | total_swap_pages -= p->pages; | 1855 | total_swap_pages -= p->pages; |
1883 | p->flags &= ~SWP_WRITEOK; | 1856 | p->flags &= ~SWP_WRITEOK; |
@@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1885 | spin_unlock(&swap_lock); | 1858 | spin_unlock(&swap_lock); |
1886 | 1859 | ||
1887 | set_current_oom_origin(); | 1860 | set_current_oom_origin(); |
1888 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ | 1861 | err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ |
1889 | clear_current_oom_origin(); | 1862 | clear_current_oom_origin(); |
1890 | 1863 | ||
1891 | if (err) { | 1864 | if (err) { |
@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1926 | frontswap_map = frontswap_map_get(p); | 1899 | frontswap_map = frontswap_map_get(p); |
1927 | spin_unlock(&p->lock); | 1900 | spin_unlock(&p->lock); |
1928 | spin_unlock(&swap_lock); | 1901 | spin_unlock(&swap_lock); |
1929 | frontswap_invalidate_area(type); | 1902 | frontswap_invalidate_area(p->type); |
1930 | frontswap_map_set(p, NULL); | 1903 | frontswap_map_set(p, NULL); |
1931 | mutex_unlock(&swapon_mutex); | 1904 | mutex_unlock(&swapon_mutex); |
1932 | free_percpu(p->percpu_cluster); | 1905 | free_percpu(p->percpu_cluster); |
@@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1935 | vfree(cluster_info); | 1908 | vfree(cluster_info); |
1936 | vfree(frontswap_map); | 1909 | vfree(frontswap_map); |
1937 | /* Destroy swap account information */ | 1910 | /* Destroy swap account information */ |
1938 | swap_cgroup_swapoff(type); | 1911 | swap_cgroup_swapoff(p->type); |
1939 | 1912 | ||
1940 | inode = mapping->host; | 1913 | inode = mapping->host; |
1941 | if (S_ISBLK(inode->i_mode)) { | 1914 | if (S_ISBLK(inode->i_mode)) { |
@@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2142 | */ | 2115 | */ |
2143 | } | 2116 | } |
2144 | INIT_LIST_HEAD(&p->first_swap_extent.list); | 2117 | INIT_LIST_HEAD(&p->first_swap_extent.list); |
2118 | INIT_LIST_HEAD(&p->list); | ||
2145 | p->flags = SWP_USED; | 2119 | p->flags = SWP_USED; |
2146 | p->next = -1; | ||
2147 | spin_unlock(&swap_lock); | 2120 | spin_unlock(&swap_lock); |
2148 | spin_lock_init(&p->lock); | 2121 | spin_lock_init(&p->lock); |
2149 | 2122 | ||