[PATCH] can_share_swap_page: use page_mapcount

Remember that ironic get_user_pages race? when the raised page_count on a page swapped out led do_wp_page to decide that it had to copy on write, so substituted a different page into userspace. 2.6.7 onwards have Andrea's solution, where try_to_unmap_one backs out if it finds page_count raised. Which works, but is unsatisfying (rmap.c has no other page_count heuristics), and was found a few months ago to hang an intensive page migration test. A year ago I was hesitant to engage page_mapcount, now it seems the right fix. So remove the page_count hack from try_to_unmap_one; and use activate_page in unuse_mm when dropping lock, to replace its secondary effect of helping swapoff to make progress in that case. Simplify can_share_swap_page (now called only on anonymous pages) to check page_mapcount + page_swapcount == 1: still needs the page lock to stabilize their (pessimistic) sum, but does not need swapper_space.tree_lock for that. In do_swap_page, move swap_free and unlock_page below page_add_anon_rmap, to keep sum on the high side, and correct when can_share_swap_page called. Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Hugh Dickins <hugh@veritas.com> 2005-06-21 20:15:12 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-06-21 21:46:21 -0400
commit: c475a8ab625d567eacf5e30ec35d6d8704558062 (patch)
tree: 0971bef7b876f1b3eb160621fc2b61cb5313827b /mm
parent: d296e9cd02c92e576ecce5344026a4df4353cdb2 (diff)
3 files changed, 21 insertions, 65 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 1c0a3db78a05..da91b7bf9986 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1686,10 +1686,6 @@ static int do_swap_page(struct mm_struct * mm,
        }
        /* The page isn't present yet, go ahead with the fault. */
-                
-        swap_free(entry);
-        if (vm_swap_full())
-                remove_exclusive_swap_page(page);
        inc_mm_counter(mm, rss);
        pte = mk_pte(page, vma->vm_page_prot);
@@ -1697,12 +1693,16 @@ static int do_swap_page(struct mm_struct * mm,
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                write_access = 0;
        }
-        unlock_page(page);
        flush_icache_page(vma, page);
        set_pte_at(mm, address, page_table, pte);
        page_add_anon_rmap(page, vma, address);
+        swap_free(entry);
+        if (vm_swap_full())
+                remove_exclusive_swap_page(page);
+        unlock_page(page);
        if (write_access) {
                if (do_wp_page(mm, vma, address,
                                page_table, pmd, pte) == VM_FAULT_OOM)
diff --git a/mm/rmap.c b/mm/rmap.c
index 9827409eb7c7..89770bd25f31 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -539,27 +539,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
                goto out_unmap;
        }
-        /*
-         * Don't pull an anonymous page out from under get_user_pages.
-         * GUP carefully breaks COW and raises page count (while holding
-         * page_table_lock, as we have here) to make sure that the page
-         * cannot be freed.  If we unmap that page here, a user write
-         * access to the virtual address will bring back the page, but
-         * its raised count will (ironically) be taken to mean it's not
-         * an exclusive swap page, do_wp_page will replace it by a copy
-         * page, and the user never get to see the data GUP was holding
-         * the original page for.
-         *
-         * This test is also useful for when swapoff (unuse_process) has
-         * to drop page lock: its reference to the page stops existing
-         * ptes from being unmapped, so swapoff can make progress.
-         */
-        if (PageSwapCache(page) &&
-            page_count(page) != page_mapcount(page) + 2) {
-                ret = SWAP_FAIL;
-                goto out_unmap;
-        }
        /* Nuke the page table entry. */
        flush_cache_page(vma, address, page_to_pfn(page));
        pteval = ptep_clear_flush(vma, address, pte);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index da48405cd9a3..60cd24a55204 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -276,61 +276,37 @@ void swap_free(swp_entry_t entry)
 }
 /*
- * Check if we're the only user of a swap page,
+ * How many references to page are currently swapped out?
- * when the page is locked.
 */
-static int exclusive_swap_page(struct page *page)
+static inline int page_swapcount(struct page *page)
 {
-        int retval = 0;
+        int count = 0;
-        struct swap_info_struct * p;
+        struct swap_info_struct *p;
        swp_entry_t entry;
        entry.val = page->private;
        p = swap_info_get(entry);
        if (p) {
-                /* Is the only swap cache user the cache itself? */
+                /* Subtract the 1 for the swap cache itself */
-                if (p->swap_map[swp_offset(entry)] == 1) {
+                count = p->swap_map[swp_offset(entry)] - 1;
-                        /* Recheck the page count with the swapcache lock held.. */
-                        write_lock_irq(&swapper_space.tree_lock);
-                        if (page_count(page) == 2)
-                                retval = 1;
-                        write_unlock_irq(&swapper_space.tree_lock);
-                }
                swap_info_put(p);
        }
-        return retval;
+        return count;
 }
 /*
 * We can use this swap cache entry directly
 * if there are no other references to it.
- *
- * Here "exclusive_swap_page()" does the real
- * work, but we opportunistically check whether
- * we need to get all the locks first..
 */
 int can_share_swap_page(struct page *page)
 {
-        int retval = 0;
+        int count;
-        if (!PageLocked(page))
+        BUG_ON(!PageLocked(page));
-                BUG();
+        count = page_mapcount(page);
-        switch (page_count(page)) {
+        if (count <= 1 && PageSwapCache(page))
-        case 3:
+                count += page_swapcount(page);
-                if (!PagePrivate(page))
+        return count == 1;
-                        break;
-                /* Fallthrough */
-        case 2:
-                if (!PageSwapCache(page))
-                        break;
-                retval = exclusive_swap_page(page);
-                break;
-        case 1:
-                if (PageReserved(page))
-                        break;
-                retval = 1;
-        }
-        return retval;
 }
 /*
@@ -529,9 +505,10 @@ static int unuse_mm(struct mm_struct *mm,
        if (!down_read_trylock(&mm->mmap_sem)) {
                /*
-                 * Our reference to the page stops try_to_unmap_one from
+                 * Activate page so shrink_cache is unlikely to unmap its
-                 * unmapping its ptes, so swapoff can make progress.
+                 * ptes while lock is dropped, so swapoff can make progress.
                 */
+                activate_page(page);
                unlock_page(page);
                down_read(&mm->mmap_sem);
                lock_page(page);
author	Hugh Dickins <hugh@veritas.com>	2005-06-21 20:15:12 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-06-21 21:46:21 -0400
commit	c475a8ab625d567eacf5e30ec35d6d8704558062 (patch)
tree	0971bef7b876f1b3eb160621fc2b61cb5313827b /mm
parent	d296e9cd02c92e576ecce5344026a4df4353cdb2 (diff)