Diffstat (limited to 'mm')
-rw-r--r--	mm/memory.c	19
-rw-r--r--	mm/rmap.c	6
-rw-r--r--	mm/swapfile.c	304
3 files changed, 271 insertions, 58 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd4a199..543c446bf4ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,7 +572,7 @@ out:
  * covered by this vma.
  */
 
-static inline void
+static inline unsigned long
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		if (!pte_file(pte)) {
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
-			swap_duplicate(entry);
+			if (swap_duplicate(entry) < 0)
+				return entry.val;
+
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
 				spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
 	int rss[2];
+	swp_entry_t entry = (swp_entry_t){0};
 
 again:
 	rss[1] = rss[0] = 0;
@@ -674,7 +678,10 @@ again:
 			progress++;
 			continue;
 		}
-		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+							vma, addr, rss);
+		if (entry.val)
+			break;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
@@ -684,6 +691,12 @@ again:
 	add_mm_rss(dst_mm, rss[0], rss[1]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
+
+	if (entry.val) {
+		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+			return -ENOMEM;
+		progress = 0;
+	}
 	if (addr != end)
 		goto again;
 	return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -822,7 +822,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * Store the swap location in the pte.
 			 * See handle_pte_fault() ...
 			 */
-			swap_duplicate(entry);
+			if (swap_duplicate(entry) < 0) {
+				set_pte_at(mm, address, pte, pteval);
+				ret = SWAP_FAIL;
+				goto out_unmap;
+			}
 			if (list_empty(&mm->mmlist)) {
 				spin_lock(&mmlist_lock);
 				if (list_empty(&mm->mmlist))
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c0d7b9ed0c16..cc5e7ebf2d2c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -35,11 +35,14 @@
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
 
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+				 unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
-static int swap_overflow;
 static int least_priority;
 
 static const char Bad_file[] = "Bad swap file entry ";
@@ -55,7 +58,7 @@ static DEFINE_MUTEX(swapon_mutex);
 
 static inline unsigned char swap_count(unsigned char ent)
 {
-	return ent & ~SWAP_HAS_CACHE;
+	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
 }
 
 /* returns 1 if swap entry is freed */
@@ -545,8 +548,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 	if (usage == SWAP_HAS_CACHE) {
 		VM_BUG_ON(!has_cache);
 		has_cache = 0;
-	} else if (count < SWAP_MAP_MAX)
-		count--;
+	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+		if (count == COUNT_CONTINUED) {
+			if (swap_count_continued(p, offset, count))
+				count = SWAP_MAP_MAX | COUNT_CONTINUED;
+			else
+				count = SWAP_MAP_MAX;
+		} else
+			count--;
+	}
 
 	if (!count)
 		mem_cgroup_uncharge_swap(entry);
@@ -604,6 +614,8 @@ void swapcache_free(swp_entry_t entry, struct page *page)
 
 /*
  * How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
  */
 static inline int page_swapcount(struct page *page)
 {
@@ -1019,7 +1031,6 @@ static int try_to_unuse(unsigned int type)
 	swp_entry_t entry;
 	unsigned int i = 0;
 	int retval = 0;
-	int reset_overflow = 0;
 	int shmem;
 
 	/*
@@ -1034,8 +1045,7 @@ static int try_to_unuse(unsigned int type)
 	 * together, child after parent.  If we race with dup_mmap(), we
 	 * prefer to resolve parent before child, lest we miss entries
 	 * duplicated after we scanned child: using last mm would invert
-	 * that.  Though it's only a serious concern when an overflowed
-	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+	 * that.
 	 */
 	start_mm = &init_mm;
 	atomic_inc(&init_mm.mm_users);
@@ -1165,36 +1175,6 @@ static int try_to_unuse(unsigned int type)
 		}
 
 		/*
-		 * How could swap count reach 0x7ffe ?
-		 * There's no way to repeat a swap page within an mm
-		 * (except in shmem, where it's the shared object which takes
-		 * the reference count)?
-		 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
-		 * short is too small....)
-		 * If that's wrong, then we should worry more about
-		 * exit_mmap() and do_munmap() cases described above:
-		 * we might be resetting SWAP_MAP_MAX too early here.
-		 *
-		 * Yes, that's wrong: though very unlikely, swap count 0x7ffe
-		 * could surely occur if pid_max raised from PID_MAX_DEFAULT;
-		 * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
-		 * much easier to reach.  But the next patch will fix that.
-		 *
-		 * We know "Undead"s can happen, they're okay, so don't
-		 * report them; but do report if we reset SWAP_MAP_MAX.
-		 */
-		/* We might release the lock_page() in unuse_mm(). */
-		if (!PageSwapCache(page) || page_private(page) != entry.val)
-			goto retry;
-
-		if (swap_count(*swap_map) == SWAP_MAP_MAX) {
-			spin_lock(&swap_lock);
-			*swap_map = SWAP_HAS_CACHE;
-			spin_unlock(&swap_lock);
-			reset_overflow = 1;
-		}
-
-		/*
 		 * If a reference remains (rare), we would like to leave
 		 * the page in the swap cache; but try_to_unmap could
 		 * then re-duplicate the entry once we drop page lock,
@@ -1235,7 +1215,6 @@ static int try_to_unuse(unsigned int type)
 		 * mark page dirty so shrink_page_list will preserve it.
 		 */
 		SetPageDirty(page);
-retry:
 		unlock_page(page);
 		page_cache_release(page);
 
@@ -1247,10 +1226,6 @@ retry:
 	}
 
 	mmput(start_mm);
-	if (reset_overflow) {
-		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-		swap_overflow = 0;
-	}
 	return retval;
 }
 
@@ -1593,6 +1568,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	up_write(&swap_unplug_sem);
 
 	destroy_swap_extents(p);
+	if (p->flags & SWP_CONTINUED)
+		free_swap_count_continuations(p);
+
 	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	drain_mmlist();
@@ -2079,14 +2057,13 @@ void si_swapinfo(struct sysinfo *val)
 /*
  * Verify that a swap entry is valid and increment its swap map count.
  *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
  * Returns error code in following case.
  * - success -> 0
  * - swp_entry is invalid -> EINVAL
  * - swp_entry is migration entry -> EINVAL
  * - swap-cache reference is requested but there is already one. -> EEXIST
  * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
@@ -2126,15 +2103,14 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 
 	} else if (count || has_cache) {
 
-		if (count < SWAP_MAP_MAX - 1)
-			count++;
-		else if (count <= SWAP_MAP_MAX) {
-			if (swap_overflow++ < 5)
-				printk(KERN_WARNING
-				       "swap_dup: swap entry overflow\n");
-			count = SWAP_MAP_MAX;
-		} else
+		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+			count += usage;
+		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
 			err = -EINVAL;
+		else if (swap_count_continued(p, offset, count))
+			count = COUNT_CONTINUED;
+		else
+			err = -ENOMEM;
 	} else
 		err = -ENOENT;			/* unused swap entry */
 
@@ -2153,9 +2129,13 @@ bad_file:
 /*
  * increase reference count of swap entry by 1.
  */
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
 {
-	__swap_duplicate(entry, 1);
+	int err = 0;
+
+	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+		err = add_swap_count_continuation(entry, GFP_ATOMIC);
+	return err;
 }
 
 /*
@@ -2222,3 +2202,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 		*offset = ++toff;
 	return nr_pages? ++nr_pages: 0;
 }
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+	struct swap_info_struct *si;
+	struct page *head;
+	struct page *page;
+	struct page *list_page;
+	pgoff_t offset;
+	unsigned char count;
+
+	/*
+	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
+	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
+	 */
+	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+	si = swap_info_get(entry);
+	if (!si) {
+		/*
+		 * An acceptable race has occurred since the failing
+		 * __swap_duplicate(): the swap entry has been freed,
+		 * perhaps even the whole swap_map cleared for swapoff.
+		 */
+		goto outer;
+	}
+
+	offset = swp_offset(entry);
+	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+		/*
+		 * The higher the swap count, the more likely it is that tasks
+		 * will race to add swap count continuation: we need to avoid
+		 * over-provisioning.
+		 */
+		goto out;
+	}
+
+	if (!page) {
+		spin_unlock(&swap_lock);
+		return -ENOMEM;
+	}
+
+	/*
+	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+	 * no architecture is using highmem pages for kernel pagetables: so it
+	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+	 */
+	head = vmalloc_to_page(si->swap_map + offset);
+	offset &= ~PAGE_MASK;
+
+	/*
+	 * Page allocation does not initialize the page's lru field,
+	 * but it does always reset its private field.
+	 */
+	if (!page_private(head)) {
+		BUG_ON(count & COUNT_CONTINUED);
+		INIT_LIST_HEAD(&head->lru);
+		set_page_private(head, SWP_CONTINUED);
+		si->flags |= SWP_CONTINUED;
+	}
+
+	list_for_each_entry(list_page, &head->lru, lru) {
+		unsigned char *map;
+
+		/*
+		 * If the previous map said no continuation, but we've found
+		 * a continuation page, free our allocation and use this one.
+		 */
+		if (!(count & COUNT_CONTINUED))
+			goto out;
+
+		map = kmap_atomic(list_page, KM_USER0) + offset;
+		count = *map;
+		kunmap_atomic(map, KM_USER0);
+
+		/*
+		 * If this continuation count now has some space in it,
+		 * free our allocation and use this one.
+		 */
+		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+			goto out;
+	}
+
+	list_add_tail(&page->lru, &head->lru);
+	page = NULL;			/* now it's attached, don't free it */
+out:
+	spin_unlock(&swap_lock);
+outer:
+	if (page)
+		__free_page(page);
+	return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+				 pgoff_t offset, unsigned char count)
+{
+	struct page *head;
+	struct page *page;
+	unsigned char *map;
+
+	head = vmalloc_to_page(si->swap_map + offset);
+	if (page_private(head) != SWP_CONTINUED) {
+		BUG_ON(count & COUNT_CONTINUED);
+		return false;		/* need to add count continuation */
+	}
+
+	offset &= ~PAGE_MASK;
+	page = list_entry(head->lru.next, struct page, lru);
+	map = kmap_atomic(page, KM_USER0) + offset;
+
+	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
+		goto init_map;		/* jump over SWAP_CONT_MAX checks */
+
+	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+		/*
+		 * Think of how you add 1 to 999
+		 */
+		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		if (*map == SWAP_CONT_MAX) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			if (page == head)
+				return false;	/* add count continuation */
+			map = kmap_atomic(page, KM_USER0) + offset;
+init_map:		*map = 0;		/* we didn't zero the page */
+		}
+		*map += 1;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return true;			/* incremented */
+
+	} else {				/* decrementing */
+		/*
+		 * Think of how you subtract 1 from 1000
+		 */
+		BUG_ON(count != COUNT_CONTINUED);
+		while (*map == COUNT_CONTINUED) {
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.next, struct page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page, KM_USER0) + offset;
+		}
+		BUG_ON(*map == 0);
+		*map -= 1;
+		if (*map == 0)
+			count = 0;
+		kunmap_atomic(map, KM_USER0);
+		page = list_entry(page->lru.prev, struct page, lru);
+		while (page != head) {
+			map = kmap_atomic(page, KM_USER0) + offset;
+			*map = SWAP_CONT_MAX | count;
+			count = COUNT_CONTINUED;
+			kunmap_atomic(map, KM_USER0);
+			page = list_entry(page->lru.prev, struct page, lru);
+		}
+		return count == COUNT_CONTINUED;
+	}
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+	pgoff_t offset;
+
+	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+		struct page *head;
+		head = vmalloc_to_page(si->swap_map + offset);
+		if (page_private(head)) {
+			struct list_head *this, *next;
+			list_for_each_safe(this, next, &head->lru) {
+				struct page *page;
+				page = list_entry(this, struct page, lru);
+				list_del(this);
+				__free_page(page);
+			}
+		}
+	}
+}