path: root/mm
author	Hugh Dickins <hugh.dickins@tiscali.co.uk>	2009-12-14 20:58:46 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-15 11:53:15 -0500
commit	570a335b8e22579e2a51a68136d2b1f907a20eec (patch)
tree	c5312383e948d2e7ac60c2fa410fee98e8b38a70 /mm
parent	8d69aaee80c123b460918816cbfa2e83224c3646 (diff)
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).

swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch, whose high bit is kept for a cache flag) is full?
We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.

Swapping of KSM pages has not yet been enabled; but it is implemented, and
makes it very easy for a user to overflow the maximum swap count: possible
with ordinary process pages, but unlikely, even when pid_max has been
raised from PID_MAX_DEFAULT.

This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed map
page, and this is used to hold the continuation counts for that entry and
its neighbours.  These continuation pages are seldom referenced: the common
paths all work on the original swap_map, only referring to a continuation
page when the low "digit" of a count is incremented or decremented through
SWAP_MAP_MAX.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
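As an illustration of the carry/borrow arithmetic described above, here is a
minimal stand-alone user-space sketch.  It is not kernel code: MAP_MAX,
CONTINUED, struct entry and the single "high" word are invented for the
example (loosely standing in for SWAP_MAP_MAX, COUNT_CONTINUED and the
continuation bytes), and it only shows how a count larger than the low
"digit" carries into, and later borrows back from, overflow storage.

/* Illustrative only: a split counter with a low "digit" plus overflow storage. */
#include <stdio.h>

#define MAP_MAX		9	/* low digit saturates here (cf. SWAP_MAP_MAX) */
#define CONTINUED	0x80	/* "more held elsewhere" flag (cf. COUNT_CONTINUED) */

struct entry {
	unsigned char map;	/* low digit, possibly ORed with CONTINUED */
	unsigned int high;	/* higher digits, one word here for simplicity */
};

static void count_inc(struct entry *e)
{
	if ((e->map & ~CONTINUED) < MAP_MAX) {
		e->map++;		/* common case: overflow storage untouched */
	} else {
		e->high++;		/* carry into the continuation "digit" */
		e->map = CONTINUED;	/* low digit wraps to 0, flag kept */
	}
}

static void count_dec(struct entry *e)
{
	if (e->map == CONTINUED) {	/* low digit 0, but more held above */
		e->high--;		/* borrow from the continuation "digit" */
		e->map = MAP_MAX | (e->high ? CONTINUED : 0);
	} else {
		e->map--;
	}
}

int main(void)
{
	struct entry e = { 0, 0 };
	int i;

	for (i = 0; i < 25; i++)
		count_inc(&e);
	printf("after 25 increments: low=%u high=%u\n",
	       (unsigned)(e.map & ~CONTINUED), e.high);
	for (i = 0; i < 25; i++)
		count_dec(&e);
	printf("after 25 decrements: low=%u high=%u\n",
	       (unsigned)(e.map & ~CONTINUED), e.high);
	return 0;
}

In the patch itself the higher digits are kept one byte per entry on separate
continuation pages linked through the swap_map page's lru list, so the common
increment and decrement paths never need to touch them.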
Diffstat (limited to 'mm')
-rw-r--r--	mm/memory.c	19
-rw-r--r--	mm/rmap.c	6
-rw-r--r--	mm/swapfile.c	304
3 files changed, 271 insertions(+), 58 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd4a199..543c446bf4ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,7 +572,7 @@ out:
  * covered by this vma.
  */
 
-static inline void
+static inline unsigned long
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		if (!pte_file(pte)) {
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
-			swap_duplicate(entry);
+			if (swap_duplicate(entry) < 0)
+				return entry.val;
+
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
 				spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
 	int rss[2];
+	swp_entry_t entry = (swp_entry_t){0};
 
 again:
 	rss[1] = rss[0] = 0;
@@ -674,7 +678,10 @@ again:
 			progress++;
 			continue;
 		}
-		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+							vma, addr, rss);
+		if (entry.val)
+			break;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
@@ -684,6 +691,12 @@ again:
 	add_mm_rss(dst_mm, rss[0], rss[1]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
+
+	if (entry.val) {
+		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+			return -ENOMEM;
+		progress = 0;
+	}
 	if (addr != end)
 		goto again;
 	return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index dd43373a483f..710bb4b2adf1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -822,7 +822,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * Store the swap location in the pte.
 			 * See handle_pte_fault() ...
 			 */
-			swap_duplicate(entry);
+			if (swap_duplicate(entry) < 0) {
+				set_pte_at(mm, address, pte, pteval);
+				ret = SWAP_FAIL;
+				goto out_unmap;
+			}
 			if (list_empty(&mm->mmlist)) {
 				spin_lock(&mmlist_lock);
 				if (list_empty(&mm->mmlist))
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c0d7b9ed0c16..cc5e7ebf2d2c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -35,11 +35,14 @@
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
 
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+				 unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
-static int swap_overflow;
 static int least_priority;
 
 static const char Bad_file[] = "Bad swap file entry ";
@@ -55,7 +58,7 @@ static DEFINE_MUTEX(swapon_mutex);
 
 static inline unsigned char swap_count(unsigned char ent)
 {
-	return ent & ~SWAP_HAS_CACHE;
+	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
 }
 
 /* returns 1 if swap entry is freed */
@@ -545,8 +548,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 	if (usage == SWAP_HAS_CACHE) {
 		VM_BUG_ON(!has_cache);
 		has_cache = 0;
-	} else if (count < SWAP_MAP_MAX)
-		count--;
+	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+		if (count == COUNT_CONTINUED) {
+			if (swap_count_continued(p, offset, count))
+				count = SWAP_MAP_MAX | COUNT_CONTINUED;
+			else
+				count = SWAP_MAP_MAX;
+		} else
+			count--;
+	}
 
 	if (!count)
 		mem_cgroup_uncharge_swap(entry);
@@ -604,6 +614,8 @@ void swapcache_free(swp_entry_t entry, struct page *page)
 
 /*
  * How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
  */
 static inline int page_swapcount(struct page *page)
 {
@@ -1019,7 +1031,6 @@ static int try_to_unuse(unsigned int type)
 	swp_entry_t entry;
 	unsigned int i = 0;
 	int retval = 0;
-	int reset_overflow = 0;
 	int shmem;
 
 	/*
@@ -1034,8 +1045,7 @@ static int try_to_unuse(unsigned int type)
 	 * together, child after parent.  If we race with dup_mmap(), we
 	 * prefer to resolve parent before child, lest we miss entries
 	 * duplicated after we scanned child: using last mm would invert
-	 * that.  Though it's only a serious concern when an overflowed
-	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+	 * that.
 	 */
 	start_mm = &init_mm;
 	atomic_inc(&init_mm.mm_users);
@@ -1165,36 +1175,6 @@ static int try_to_unuse(unsigned int type)
 		}
 
 		/*
-		 * How could swap count reach 0x7ffe ?
-		 * There's no way to repeat a swap page within an mm
-		 * (except in shmem, where it's the shared object which takes
-		 * the reference count)?
-		 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
-		 * short is too small....)
-		 * If that's wrong, then we should worry more about
-		 * exit_mmap() and do_munmap() cases described above:
-		 * we might be resetting SWAP_MAP_MAX too early here.
-		 *
-		 * Yes, that's wrong: though very unlikely, swap count 0x7ffe
-		 * could surely occur if pid_max raised from PID_MAX_DEFAULT;
-		 * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
-		 * much easier to reach.  But the next patch will fix that.
-		 *
-		 * We know "Undead"s can happen, they're okay, so don't
-		 * report them; but do report if we reset SWAP_MAP_MAX.
-		 */
-		/* We might release the lock_page() in unuse_mm(). */
-		if (!PageSwapCache(page) || page_private(page) != entry.val)
-			goto retry;
-
-		if (swap_count(*swap_map) == SWAP_MAP_MAX) {
-			spin_lock(&swap_lock);
-			*swap_map = SWAP_HAS_CACHE;
-			spin_unlock(&swap_lock);
-			reset_overflow = 1;
-		}
-
-		/*
 		 * If a reference remains (rare), we would like to leave
 		 * the page in the swap cache; but try_to_unmap could
 		 * then re-duplicate the entry once we drop page lock,
@@ -1235,7 +1215,6 @@ static int try_to_unuse(unsigned int type)
 		 * mark page dirty so shrink_page_list will preserve it.
 		 */
 		SetPageDirty(page);
-retry:
 		unlock_page(page);
 		page_cache_release(page);
 
@@ -1247,10 +1226,6 @@ retry:
 	}
 
 	mmput(start_mm);
-	if (reset_overflow) {
-		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-		swap_overflow = 0;
-	}
 	return retval;
 }
 
@@ -1593,6 +1568,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	up_write(&swap_unplug_sem);
 
 	destroy_swap_extents(p);
+	if (p->flags & SWP_CONTINUED)
+		free_swap_count_continuations(p);
+
 	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	drain_mmlist();
@@ -2079,14 +2057,13 @@ void si_swapinfo(struct sysinfo *val)
 /*
  * Verify that a swap entry is valid and increment its swap map count.
  *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
  * Returns error code in following case.
  * - success -> 0
  * - swp_entry is invalid -> EINVAL
  * - swp_entry is migration entry -> EINVAL
  * - swap-cache reference is requested but there is already one. -> EEXIST
  * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
@@ -2126,15 +2103,14 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 
 	} else if (count || has_cache) {
 
-		if (count < SWAP_MAP_MAX - 1)
-			count++;
-		else if (count <= SWAP_MAP_MAX) {
-			if (swap_overflow++ < 5)
-				printk(KERN_WARNING
-					"swap_dup: swap entry overflow\n");
-			count = SWAP_MAP_MAX;
-		} else
+		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+			count += usage;
+		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
 			err = -EINVAL;
+		else if (swap_count_continued(p, offset, count))
+			count = COUNT_CONTINUED;
+		else
+			err = -ENOMEM;
 	} else
 		err = -ENOENT;			/* unused swap entry */
 
@@ -2153,9 +2129,13 @@ bad_file:
 /*
  * increase reference count of swap entry by 1.
  */
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
 {
-	__swap_duplicate(entry, 1);
+	int err = 0;
+
+	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+		err = add_swap_count_continuation(entry, GFP_ATOMIC);
+	return err;
 }
 
 /*
@@ -2222,3 +2202,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 	*offset = ++toff;
 	return nr_pages? ++nr_pages: 0;
 }
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+	struct swap_info_struct *si;
+	struct page *head;
+	struct page *page;
+	struct page *list_page;
+	pgoff_t offset;
+	unsigned char count;
+
+	/*
+	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
+	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
+	 */
+	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+	si = swap_info_get(entry);
+	if (!si) {
+		/*
+		 * An acceptable race has occurred since the failing
+		 * __swap_duplicate(): the swap entry has been freed,
+		 * perhaps even the whole swap_map cleared for swapoff.
+		 */
+		goto outer;
+	}
+
+	offset = swp_offset(entry);
+	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+		/*
+		 * The higher the swap count, the more likely it is that tasks
+		 * will race to add swap count continuation: we need to avoid
+		 * over-provisioning.
+		 */
+		goto out;
+	}
+
+	if (!page) {
+		spin_unlock(&swap_lock);
+		return -ENOMEM;
+	}
+
+	/*
+	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+	 * no architecture is using highmem pages for kernel pagetables: so it
+	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+	 */
+	head = vmalloc_to_page(si->swap_map + offset);
+	offset &= ~PAGE_MASK;
+
+	/*
+	 * Page allocation does not initialize the page's lru field,
+	 * but it does always reset its private field.
+	 */
+	if (!page_private(head)) {
+		BUG_ON(count & COUNT_CONTINUED);
+		INIT_LIST_HEAD(&head->lru);
+		set_page_private(head, SWP_CONTINUED);
+		si->flags |= SWP_CONTINUED;
+	}
+
+	list_for_each_entry(list_page, &head->lru, lru) {
+		unsigned char *map;
+
+		/*
+		 * If the previous map said no continuation, but we've found
+		 * a continuation page, free our allocation and use this one.
+		 */
+		if (!(count & COUNT_CONTINUED))
+			goto out;
+
+		map = kmap_atomic(list_page, KM_USER0) + offset;
+		count = *map;
+		kunmap_atomic(map, KM_USER0);
+
+		/*
+		 * If this continuation count now has some space in it,
+		 * free our allocation and use this one.
+		 */
+		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+			goto out;
+	}
+
+	list_add_tail(&page->lru, &head->lru);
+	page = NULL;			/* now it's attached, don't free it */
+out:
+	spin_unlock(&swap_lock);
+outer:
+	if (page)
+		__free_page(page);
+	return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+				 pgoff_t offset, unsigned char count)
+{
+	struct page *head;
+	struct page *page;
+	unsigned char *map;
+
+	head = vmalloc_to_page(si->swap_map + offset);
+	if (page_private(head) != SWP_CONTINUED) {
+		BUG_ON(count & COUNT_CONTINUED);
+		return false;		/* need to add count continuation */
+	}
+
+	offset &= ~PAGE_MASK;
+	page = list_entry(head->lru.next, struct page, lru);
+	map = kmap_atomic(page, KM_USER0) + offset;
+
+	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
+		goto init_map;		/* jump over SWAP_CONT_MAX checks */
+
+	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+		/*
+		 * Think of how you add 1 to 999
+		 */
+		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		if (*map == SWAP_CONT_MAX) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			if (page == head)
+				return false;	/* add count continuation */
+			map = kmap_atomic(page, KM_USER0) + offset;
+init_map:		*map = 0;		/* we didn't zero the page */
+		}
+		*map += 1;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return true;			/* incremented */
+
+	} else {				/* decrementing */
+		/*
+		 * Think of how you subtract 1 from 1000
+		 */
+		BUG_ON(count != COUNT_CONTINUED);
+		while (*map == COUNT_CONTINUED) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		BUG_ON(*map == 0);
+		*map -= 1;
+		if (*map == 0)
+			count = 0;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = SWAP_CONT_MAX | count;
+			count = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return count == COUNT_CONTINUED;
+	}
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+	pgoff_t offset;
+
+	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+		struct page *head;
+		head = vmalloc_to_page(si->swap_map + offset);
+		if (page_private(head)) {
+			struct list_head *this, *next;
+			list_for_each_safe(this, next, &head->lru) {
+				struct page *page;
+				page = list_entry(this, struct page, lru);
+				list_del(this);
+				__free_page(page);
+			}
+		}
+	}
+}