author		Huang Ying <ying.huang@intel.com>	2017-11-02 18:59:50 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-11-03 10:39:19 -0400
commit		2628bd6fc052bd85e9864dae4de494d8a6313391 (patch)
tree		109b74ba01ac402232f89d5650bacf1f761dc5a0	/mm/swapfile.c
parent		dd8a67f9a37c74b61e5e050924ceec9ffb4f8c3c (diff)
mm, swap: fix race between swap count continuation operations
One page may store a set of entries of sis->swap_map (swap_info_struct->swap_map) belonging to multiple swap clusters. If some of those entries have sis->swap_map[offset] > SWAP_MAP_MAX, multiple pages are used to store the set of entries, and the pages are linked via page->lru. This is called swap count continuation.

Previously, sis->lock was used to serialize simultaneous access to the pages that store these sis->swap_map entries. But to improve the scalability of __swap_duplicate(), the swap cluster lock may now be used in swap_count_continued(). This can race with add_swap_count_continuation() operating on a nearby swap cluster whose sis->swap_map entries are stored in the same page. In practice the race can produce a wrong swap count, which in turn causes unfreeable swap entries, software lockups, etc.

To fix the race, a new spinlock called cont_lock is added to struct swap_info_struct to protect the swap count continuation page list. This is a lock at the swap device level, so its scalability is not ideal, but it is still much better than the original sis->lock because it is only acquired/released when swap count continuation is used, which is rare in practice. If scalability turns out to be an issue for some workloads, the lock can be split into more fine-grained locks.

Link: http://lkml.kernel.org/r/20171017081320.28133-1-ying.huang@intel.com
Fixes: 235b62176712 ("mm/swap: add cluster lock")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>	[4.11+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--	mm/swapfile.c	23
1 file changed, 17 insertions, 6 deletions
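The hunks below only touch mm/swapfile.c, so the declaration of the new lock itself is not visible here; per the commit message it is added to struct swap_info_struct (declared in include/linux/swap.h). A minimal sketch of that declaration, with unrelated fields omitted and the exact placement and comment wording assumed rather than taken from the patch:

	/* include/linux/swap.h -- sketch only, most fields omitted */
	struct swap_info_struct {
		/* ... */
		spinlock_t	lock;		/* existing per-device lock protecting swap_map */
		/* ... */
		spinlock_t	cont_lock;	/* new: protects the swap count
						 * continuation page list */
		/* ... */
	};

add_swap_count_continuation() and swap_count_continued() then take cont_lock around any walk or modification of the continuation page list, as the mm/swapfile.c hunks below show.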
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bf91dc9e7a79..e47a21e64764 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2869,6 +2869,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
 	spin_lock_init(&p->lock);
+	spin_lock_init(&p->cont_lock);
 
 	return p;
 }
@@ -3545,6 +3546,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	head = vmalloc_to_page(si->swap_map + offset);
 	offset &= ~PAGE_MASK;
 
+	spin_lock(&si->cont_lock);
 	/*
 	 * Page allocation does not initialize the page's lru field,
 	 * but it does always reset its private field.
@@ -3564,7 +3566,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 		 * a continuation page, free our allocation and use this one.
 		 */
 		if (!(count & COUNT_CONTINUED))
-			goto out;
+			goto out_unlock_cont;
 
 		map = kmap_atomic(list_page) + offset;
 		count = *map;
@@ -3575,11 +3577,13 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 		 * free our allocation and use this one.
 		 */
 		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
-			goto out;
+			goto out_unlock_cont;
 	}
 
 	list_add_tail(&page->lru, &head->lru);
 	page = NULL;			/* now it's attached, don't free it */
+out_unlock_cont:
+	spin_unlock(&si->cont_lock);
 out:
 	unlock_cluster(ci);
 	spin_unlock(&si->lock);
@@ -3604,6 +3608,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 	struct page *head;
 	struct page *page;
 	unsigned char *map;
+	bool ret;
 
 	head = vmalloc_to_page(si->swap_map + offset);
 	if (page_private(head) != SWP_CONTINUED) {
@@ -3611,6 +3616,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		return false;		/* need to add count continuation */
 	}
 
+	spin_lock(&si->cont_lock);
 	offset &= ~PAGE_MASK;
 	page = list_entry(head->lru.next, struct page, lru);
 	map = kmap_atomic(page) + offset;
@@ -3631,8 +3637,10 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		if (*map == SWAP_CONT_MAX) {
 			kunmap_atomic(map);
 			page = list_entry(page->lru.next, struct page, lru);
-			if (page == head)
-				return false;	/* add count continuation */
+			if (page == head) {
+				ret = false;	/* add count continuation */
+				goto out;
+			}
 			map = kmap_atomic(page) + offset;
 init_map:		*map = 0;		/* we didn't zero the page */
 		}
@@ -3645,7 +3653,7 @@ init_map:		*map = 0;		/* we didn't zero the page */
 			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
-		return true;			/* incremented */
+		ret = true;			/* incremented */
 
 	} else {				/* decrementing */
 		/*
@@ -3671,8 +3679,11 @@ init_map:		*map = 0;		/* we didn't zero the page */
 			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
-		return count == COUNT_CONTINUED;
+		ret = count == COUNT_CONTINUED;
 	}
+out:
+	spin_unlock(&si->cont_lock);
+	return ret;
 }
 
 /*