aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
authorHugh Dickins <hugh.dickins@tiscali.co.uk>2009-12-14 20:58:46 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2009-12-15 11:53:15 -0500
commit570a335b8e22579e2a51a68136d2b1f907a20eec (patch)
treec5312383e948d2e7ac60c2fa410fee98e8b38a70 /mm/memory.c
parent8d69aaee80c123b460918816cbfa2e83224c3646 (diff)
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same swap page is inserted into another mm (when forking finds a swap entry in place of a pte, or when reclaim unmaps a pte to insert the swap entry). swap_info_struct's vmalloc'ed swap_map is the array of these reference counts: but what happens when the unsigned short (or unsigned char since the preceding patch) is full? (and its high bit is kept for a cache flag) We then lose track of it, never freeing, leaving it in use until swapoff: at which point we _hope_ that a single pass will have found all instances, assume there are no more, and will lose user data if we're wrong. Swapping of KSM pages has not yet been enabled; but it is implemented, and makes it very easy for a user to overflow the maximum swap count: possible with ordinary process pages, but unlikely, even when pid_max has been raised from PID_MAX_DEFAULT. This patch implements swap count continuations: when the count overflows, a continuation page is allocated and linked to the original vmalloc'ed map page, and this used to hold the continuation counts for that entry and its neighbours. These continuation pages are seldom referenced: the common paths all work on the original swap_map, only referring to a continuation page when the low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c19
1 files changed, 16 insertions, 3 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd4a199..543c446bf4ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,7 +572,7 @@ out:
572 * covered by this vma. 572 * covered by this vma.
573 */ 573 */
574 574
575static inline void 575static inline unsigned long
576copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 576copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
577 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 577 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
578 unsigned long addr, int *rss) 578 unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
586 if (!pte_file(pte)) { 586 if (!pte_file(pte)) {
587 swp_entry_t entry = pte_to_swp_entry(pte); 587 swp_entry_t entry = pte_to_swp_entry(pte);
588 588
589 swap_duplicate(entry); 589 if (swap_duplicate(entry) < 0)
590 return entry.val;
591
590 /* make sure dst_mm is on swapoff's mmlist. */ 592 /* make sure dst_mm is on swapoff's mmlist. */
591 if (unlikely(list_empty(&dst_mm->mmlist))) { 593 if (unlikely(list_empty(&dst_mm->mmlist))) {
592 spin_lock(&mmlist_lock); 594 spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
635 637
636out_set_pte: 638out_set_pte:
637 set_pte_at(dst_mm, addr, dst_pte, pte); 639 set_pte_at(dst_mm, addr, dst_pte, pte);
640 return 0;
638} 641}
639 642
640static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 643static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
646 spinlock_t *src_ptl, *dst_ptl; 649 spinlock_t *src_ptl, *dst_ptl;
647 int progress = 0; 650 int progress = 0;
648 int rss[2]; 651 int rss[2];
652 swp_entry_t entry = (swp_entry_t){0};
649 653
650again: 654again:
651 rss[1] = rss[0] = 0; 655 rss[1] = rss[0] = 0;
@@ -674,7 +678,10 @@ again:
674 progress++; 678 progress++;
675 continue; 679 continue;
676 } 680 }
677 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); 681 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
682 vma, addr, rss);
683 if (entry.val)
684 break;
678 progress += 8; 685 progress += 8;
679 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 686 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
680 687
@@ -684,6 +691,12 @@ again:
684 add_mm_rss(dst_mm, rss[0], rss[1]); 691 add_mm_rss(dst_mm, rss[0], rss[1]);
685 pte_unmap_unlock(orig_dst_pte, dst_ptl); 692 pte_unmap_unlock(orig_dst_pte, dst_ptl);
686 cond_resched(); 693 cond_resched();
694
695 if (entry.val) {
696 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
697 return -ENOMEM;
698 progress = 0;
699 }
687 if (addr != end) 700 if (addr != end)
688 goto again; 701 goto again;
689 return 0; 702 return 0;