aboutsummaryrefslogtreecommitdiffstats
path: root/mm/swapfile.c
diff options
context:
space:
mode:
authorDan Streetman <ddstreet@ieee.org>2014-06-04 19:09:53 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-06-04 19:54:07 -0400
commitadfab836f4908deb049a5128082719e689eed964 (patch)
tree345475c5282e96791fe3bce8f0287674fa9acc1d /mm/swapfile.c
parent7ee07a44eb53374a73544ae14c71366a02d462e0 (diff)
swap: change swap_info singly-linked list to list_head
The logic controlling the singly-linked list of swap_info_struct entries for all active, i.e. swapon'ed, swap targets is rather complex, because: - it stores the entries in priority order - there is a pointer to the highest priority entry - there is a pointer to the highest priority not-full entry - there is a highest_priority_index variable set outside the swap_lock - swap entries of equal priority should be used equally this complexity leads to bugs such as: https://lkml.org/lkml/2014/2/13/181 where different priority swap targets are incorrectly used equally. That bug probably could be solved with the existing singly-linked lists, but I think it would only add more complexity to the already difficult to understand get_swap_page() swap_list iteration logic. The first patch changes from a singly-linked list to a doubly-linked list using list_heads; the highest_priority_index and related code are removed and get_swap_page() starts each iteration at the highest priority swap_info entry, even if it's full. While this does introduce unnecessary list iteration (i.e. Schlemiel the painter's algorithm) in the case where one or more of the highest priority entries are full, the iteration and manipulation code is much simpler and behaves correctly re: the above bug; and the fourth patch removes the unnecessary iteration. The second patch adds some minor plist helper functions; nothing new really, just functions to match existing regular list functions. These are used by the next two patches. The third patch adds plist_requeue(), which is used by get_swap_page() in the next patch - it performs the requeueing of same-priority entries (which moves the entry to the end of its priority in the plist), so that all equal-priority swap_info_structs get used equally. The fourth patch converts the main list into a plist, and adds a new plist that contains only swap_info entries that are both active and not full. As Mel suggested using plists allows removing all the ordering code from swap - plists handle ordering automatically. The list naming is also clarified now that there are two lists, with the original list changed from swap_list_head to swap_active_head and the new list named swap_avail_head. A new spinlock is also added for the new list, so swap_info entries can be added or removed from the new list immediately as they become full or not full. This patch (of 4): Replace the singly-linked list tracking active, i.e. swapon'ed, swap_info_struct entries with a doubly-linked list using struct list_heads. Simplify the logic iterating and manipulating the list of entries, especially get_swap_page(), by using standard list_head functions, and removing the highest priority iteration logic. The change fixes the bug: https://lkml.org/lkml/2014/2/13/181 in which different priority swap entries after the highest priority entry are incorrectly used equally in pairs. The swap behavior is now as advertised, i.e. different priority swap entries are used in order, and equal priority swap targets are used concurrently. Signed-off-by: Dan Streetman <ddstreet@ieee.org> Acked-by: Mel Gorman <mgorman@suse.de> Cc: Shaohua Li <shli@fusionio.com> Cc: Hugh Dickins <hughd@google.com> Cc: Dan Streetman <ddstreet@ieee.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> Cc: Weijie Yang <weijieut@gmail.com> Cc: Rik van Riel <riel@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Bob Liu <bob.liu@oracle.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c171
1 files changed, 72 insertions, 99 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4a7f7e6992b6..6c95a8c63b1a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages;
51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
52long total_swap_pages; 52long total_swap_pages;
53static int least_priority; 53static int least_priority;
54static atomic_t highest_priority_index = ATOMIC_INIT(-1);
55 54
56static const char Bad_file[] = "Bad swap file entry "; 55static const char Bad_file[] = "Bad swap file entry ";
57static const char Unused_file[] = "Unused swap file entry "; 56static const char Unused_file[] = "Unused swap file entry ";
58static const char Bad_offset[] = "Bad swap offset entry "; 57static const char Bad_offset[] = "Bad swap offset entry ";
59static const char Unused_offset[] = "Unused swap offset entry "; 58static const char Unused_offset[] = "Unused swap offset entry ";
60 59
61struct swap_list_t swap_list = {-1, -1}; 60/*
61 * all active swap_info_structs
62 * protected with swap_lock, and ordered by priority.
63 */
64LIST_HEAD(swap_list_head);
62 65
63struct swap_info_struct *swap_info[MAX_SWAPFILES]; 66struct swap_info_struct *swap_info[MAX_SWAPFILES];
64 67
@@ -640,66 +643,54 @@ no_page:
640 643
641swp_entry_t get_swap_page(void) 644swp_entry_t get_swap_page(void)
642{ 645{
643 struct swap_info_struct *si; 646 struct swap_info_struct *si, *next;
644 pgoff_t offset; 647 pgoff_t offset;
645 int type, next; 648 struct list_head *tmp;
646 int wrapped = 0;
647 int hp_index;
648 649
649 spin_lock(&swap_lock); 650 spin_lock(&swap_lock);
650 if (atomic_long_read(&nr_swap_pages) <= 0) 651 if (atomic_long_read(&nr_swap_pages) <= 0)
651 goto noswap; 652 goto noswap;
652 atomic_long_dec(&nr_swap_pages); 653 atomic_long_dec(&nr_swap_pages);
653 654
654 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 655 list_for_each(tmp, &swap_list_head) {
655 hp_index = atomic_xchg(&highest_priority_index, -1); 656 si = list_entry(tmp, typeof(*si), list);
656 /*
657 * highest_priority_index records current highest priority swap
658 * type which just frees swap entries. If its priority is
659 * higher than that of swap_list.next swap type, we use it. It
660 * isn't protected by swap_lock, so it can be an invalid value
661 * if the corresponding swap type is swapoff. We double check
662 * the flags here. It's even possible the swap type is swapoff
663 * and swapon again and its priority is changed. In such rare
664 * case, low prority swap type might be used, but eventually
665 * high priority swap will be used after several rounds of
666 * swap.
667 */
668 if (hp_index != -1 && hp_index != type &&
669 swap_info[type]->prio < swap_info[hp_index]->prio &&
670 (swap_info[hp_index]->flags & SWP_WRITEOK)) {
671 type = hp_index;
672 swap_list.next = type;
673 }
674
675 si = swap_info[type];
676 next = si->next;
677 if (next < 0 ||
678 (!wrapped && si->prio != swap_info[next]->prio)) {
679 next = swap_list.head;
680 wrapped++;
681 }
682
683 spin_lock(&si->lock); 657 spin_lock(&si->lock);
684 if (!si->highest_bit) { 658 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
685 spin_unlock(&si->lock);
686 continue;
687 }
688 if (!(si->flags & SWP_WRITEOK)) {
689 spin_unlock(&si->lock); 659 spin_unlock(&si->lock);
690 continue; 660 continue;
691 } 661 }
692 662
693 swap_list.next = next; 663 /*
664 * rotate the current swap_info that we're going to use
665 * to after any other swap_info that have the same prio,
666 * so that all equal-priority swap_info get used equally
667 */
668 next = si;
669 list_for_each_entry_continue(next, &swap_list_head, list) {
670 if (si->prio != next->prio)
671 break;
672 list_rotate_left(&si->list);
673 next = si;
674 }
694 675
695 spin_unlock(&swap_lock); 676 spin_unlock(&swap_lock);
696 /* This is called for allocating swap entry for cache */ 677 /* This is called for allocating swap entry for cache */
697 offset = scan_swap_map(si, SWAP_HAS_CACHE); 678 offset = scan_swap_map(si, SWAP_HAS_CACHE);
698 spin_unlock(&si->lock); 679 spin_unlock(&si->lock);
699 if (offset) 680 if (offset)
700 return swp_entry(type, offset); 681 return swp_entry(si->type, offset);
701 spin_lock(&swap_lock); 682 spin_lock(&swap_lock);
702 next = swap_list.next; 683 /*
684 * if we got here, it's likely that si was almost full before,
685 * and since scan_swap_map() can drop the si->lock, multiple
686 * callers probably all tried to get a page from the same si
687 * and it filled up before we could get one. So we need to
688 * try again. Since we dropped the swap_lock, there may now
689 * be non-full higher priority swap_infos, and this si may have
690 * even been removed from the list (although very unlikely).
691 * Let's start over.
692 */
693 tmp = &swap_list_head;
703 } 694 }
704 695
705 atomic_long_inc(&nr_swap_pages); 696 atomic_long_inc(&nr_swap_pages);
@@ -766,27 +757,6 @@ out:
766 return NULL; 757 return NULL;
767} 758}
768 759
769/*
770 * This swap type frees swap entry, check if it is the highest priority swap
771 * type which just frees swap entry. get_swap_page() uses
772 * highest_priority_index to search highest priority swap type. The
773 * swap_info_struct.lock can't protect us if there are multiple swap types
774 * active, so we use atomic_cmpxchg.
775 */
776static void set_highest_priority_index(int type)
777{
778 int old_hp_index, new_hp_index;
779
780 do {
781 old_hp_index = atomic_read(&highest_priority_index);
782 if (old_hp_index != -1 &&
783 swap_info[old_hp_index]->prio >= swap_info[type]->prio)
784 break;
785 new_hp_index = type;
786 } while (atomic_cmpxchg(&highest_priority_index,
787 old_hp_index, new_hp_index) != old_hp_index);
788}
789
790static unsigned char swap_entry_free(struct swap_info_struct *p, 760static unsigned char swap_entry_free(struct swap_info_struct *p,
791 swp_entry_t entry, unsigned char usage) 761 swp_entry_t entry, unsigned char usage)
792{ 762{
@@ -830,7 +800,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
830 p->lowest_bit = offset; 800 p->lowest_bit = offset;
831 if (offset > p->highest_bit) 801 if (offset > p->highest_bit)
832 p->highest_bit = offset; 802 p->highest_bit = offset;
833 set_highest_priority_index(p->type);
834 atomic_long_inc(&nr_swap_pages); 803 atomic_long_inc(&nr_swap_pages);
835 p->inuse_pages--; 804 p->inuse_pages--;
836 frontswap_invalidate_page(p->type, offset); 805 frontswap_invalidate_page(p->type, offset);
@@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1765 unsigned char *swap_map, 1734 unsigned char *swap_map,
1766 struct swap_cluster_info *cluster_info) 1735 struct swap_cluster_info *cluster_info)
1767{ 1736{
1768 int i, prev; 1737 struct swap_info_struct *si;
1769 1738
1770 if (prio >= 0) 1739 if (prio >= 0)
1771 p->prio = prio; 1740 p->prio = prio;
@@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1777 atomic_long_add(p->pages, &nr_swap_pages); 1746 atomic_long_add(p->pages, &nr_swap_pages);
1778 total_swap_pages += p->pages; 1747 total_swap_pages += p->pages;
1779 1748
1780 /* insert swap space into swap_list: */ 1749 assert_spin_locked(&swap_lock);
1781 prev = -1; 1750 BUG_ON(!list_empty(&p->list));
1782 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { 1751 /*
1783 if (p->prio >= swap_info[i]->prio) 1752 * insert into swap list; the list is in priority order,
1784 break; 1753 * so that get_swap_page() can get a page from the highest
1785 prev = i; 1754 * priority swap_info_struct with available page(s), and
1755 * swapoff can adjust the auto-assigned (i.e. negative) prio
1756 * values for any lower-priority swap_info_structs when
1757 * removing a negative-prio swap_info_struct
1758 */
1759 list_for_each_entry(si, &swap_list_head, list) {
1760 if (p->prio >= si->prio) {
1761 list_add_tail(&p->list, &si->list);
1762 return;
1763 }
1786 } 1764 }
1787 p->next = i; 1765 /*
1788 if (prev < 0) 1766 * this covers two cases:
1789 swap_list.head = swap_list.next = p->type; 1767 * 1) p->prio is less than all existing prio
1790 else 1768 * 2) the swap list is empty
1791 swap_info[prev]->next = p->type; 1769 */
1770 list_add_tail(&p->list, &swap_list_head);
1792} 1771}
1793 1772
1794static void enable_swap_info(struct swap_info_struct *p, int prio, 1773static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1823 struct address_space *mapping; 1802 struct address_space *mapping;
1824 struct inode *inode; 1803 struct inode *inode;
1825 struct filename *pathname; 1804 struct filename *pathname;
1826 int i, type, prev; 1805 int err, found = 0;
1827 int err;
1828 unsigned int old_block_size; 1806 unsigned int old_block_size;
1829 1807
1830 if (!capable(CAP_SYS_ADMIN)) 1808 if (!capable(CAP_SYS_ADMIN))
@@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1842 goto out; 1820 goto out;
1843 1821
1844 mapping = victim->f_mapping; 1822 mapping = victim->f_mapping;
1845 prev = -1;
1846 spin_lock(&swap_lock); 1823 spin_lock(&swap_lock);
1847 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { 1824 list_for_each_entry(p, &swap_list_head, list) {
1848 p = swap_info[type];
1849 if (p->flags & SWP_WRITEOK) { 1825 if (p->flags & SWP_WRITEOK) {
1850 if (p->swap_file->f_mapping == mapping) 1826 if (p->swap_file->f_mapping == mapping) {
1827 found = 1;
1851 break; 1828 break;
1829 }
1852 } 1830 }
1853 prev = type;
1854 } 1831 }
1855 if (type < 0) { 1832 if (!found) {
1856 err = -EINVAL; 1833 err = -EINVAL;
1857 spin_unlock(&swap_lock); 1834 spin_unlock(&swap_lock);
1858 goto out_dput; 1835 goto out_dput;
@@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1864 spin_unlock(&swap_lock); 1841 spin_unlock(&swap_lock);
1865 goto out_dput; 1842 goto out_dput;
1866 } 1843 }
1867 if (prev < 0)
1868 swap_list.head = p->next;
1869 else
1870 swap_info[prev]->next = p->next;
1871 if (type == swap_list.next) {
1872 /* just pick something that's safe... */
1873 swap_list.next = swap_list.head;
1874 }
1875 spin_lock(&p->lock); 1844 spin_lock(&p->lock);
1876 if (p->prio < 0) { 1845 if (p->prio < 0) {
1877 for (i = p->next; i >= 0; i = swap_info[i]->next) 1846 struct swap_info_struct *si = p;
1878 swap_info[i]->prio = p->prio--; 1847
1848 list_for_each_entry_continue(si, &swap_list_head, list) {
1849 si->prio++;
1850 }
1879 least_priority++; 1851 least_priority++;
1880 } 1852 }
1853 list_del_init(&p->list);
1881 atomic_long_sub(p->pages, &nr_swap_pages); 1854 atomic_long_sub(p->pages, &nr_swap_pages);
1882 total_swap_pages -= p->pages; 1855 total_swap_pages -= p->pages;
1883 p->flags &= ~SWP_WRITEOK; 1856 p->flags &= ~SWP_WRITEOK;
@@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1885 spin_unlock(&swap_lock); 1858 spin_unlock(&swap_lock);
1886 1859
1887 set_current_oom_origin(); 1860 set_current_oom_origin();
1888 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1861 err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
1889 clear_current_oom_origin(); 1862 clear_current_oom_origin();
1890 1863
1891 if (err) { 1864 if (err) {
@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1926 frontswap_map = frontswap_map_get(p); 1899 frontswap_map = frontswap_map_get(p);
1927 spin_unlock(&p->lock); 1900 spin_unlock(&p->lock);
1928 spin_unlock(&swap_lock); 1901 spin_unlock(&swap_lock);
1929 frontswap_invalidate_area(type); 1902 frontswap_invalidate_area(p->type);
1930 frontswap_map_set(p, NULL); 1903 frontswap_map_set(p, NULL);
1931 mutex_unlock(&swapon_mutex); 1904 mutex_unlock(&swapon_mutex);
1932 free_percpu(p->percpu_cluster); 1905 free_percpu(p->percpu_cluster);
@@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1935 vfree(cluster_info); 1908 vfree(cluster_info);
1936 vfree(frontswap_map); 1909 vfree(frontswap_map);
1937 /* Destroy swap account information */ 1910 /* Destroy swap account information */
1938 swap_cgroup_swapoff(type); 1911 swap_cgroup_swapoff(p->type);
1939 1912
1940 inode = mapping->host; 1913 inode = mapping->host;
1941 if (S_ISBLK(inode->i_mode)) { 1914 if (S_ISBLK(inode->i_mode)) {
@@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_swap_info(void)
2142 */ 2115 */
2143 } 2116 }
2144 INIT_LIST_HEAD(&p->first_swap_extent.list); 2117 INIT_LIST_HEAD(&p->first_swap_extent.list);
2118 INIT_LIST_HEAD(&p->list);
2145 p->flags = SWP_USED; 2119 p->flags = SWP_USED;
2146 p->next = -1;
2147 spin_unlock(&swap_lock); 2120 spin_unlock(&swap_lock);
2148 spin_lock_init(&p->lock); 2121 spin_lock_init(&p->lock);
2149 2122