aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Hugh Dickins <hugh@veritas.com> 2005-06-21 20:15:12 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org> 2005-06-21 21:46:21 -0400
commit c475a8ab625d567eacf5e30ec35d6d8704558062 (patch)
tree 0971bef7b876f1b3eb160621fc2b61cb5313827b
parent d296e9cd02c92e576ecce5344026a4df4353cdb2 (diff)
[PATCH] can_share_swap_page: use page_mapcount
Remember that ironic get_user_pages race? when the raised page_count on a page swapped out led do_wp_page to decide that it had to copy on write, so substituted a different page into userspace. 2.6.7 onwards have Andrea's solution, where try_to_unmap_one backs out if it finds page_count raised.

Which works, but is unsatisfying (rmap.c has no other page_count heuristics), and was found a few months ago to hang an intensive page migration test. A year ago I was hesitant to engage page_mapcount, now it seems the right fix.

So remove the page_count hack from try_to_unmap_one; and use activate_page in unuse_mm when dropping lock, to replace its secondary effect of helping swapoff to make progress in that case.

Simplify can_share_swap_page (now called only on anonymous pages) to check page_mapcount + page_swapcount == 1: still needs the page lock to stabilize their (pessimistic) sum, but does not need swapper_space.tree_lock for that.

In do_swap_page, move swap_free and unlock_page below page_add_anon_rmap, to keep sum on the high side, and correct when can_share_swap_page called.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- mm/memory.c 10
-rw-r--r-- mm/rmap.c 21
-rw-r--r-- mm/swapfile.c 55
3 files changed, 21 insertions, 65 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 1c0a3db78a05..da91b7bf9986 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1686,10 +1686,6 @@ static int do_swap_page(struct mm_struct * mm,
1686 } 1686 }
1687 1687
1688 /* The page isn't present yet, go ahead with the fault. */ 1688 /* The page isn't present yet, go ahead with the fault. */
1689
1690 swap_free(entry);
1691 if (vm_swap_full())
1692 remove_exclusive_swap_page(page);
1693 1689
1694 inc_mm_counter(mm, rss); 1690 inc_mm_counter(mm, rss);
1695 pte = mk_pte(page, vma->vm_page_prot); 1691 pte = mk_pte(page, vma->vm_page_prot);
@@ -1697,12 +1693,16 @@ static int do_swap_page(struct mm_struct * mm,
1697 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 1693 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
1698 write_access = 0; 1694 write_access = 0;
1699 } 1695 }
1700 unlock_page(page);
1701 1696
1702 flush_icache_page(vma, page); 1697 flush_icache_page(vma, page);
1703 set_pte_at(mm, address, page_table, pte); 1698 set_pte_at(mm, address, page_table, pte);
1704 page_add_anon_rmap(page, vma, address); 1699 page_add_anon_rmap(page, vma, address);
1705 1700
1701 swap_free(entry);
1702 if (vm_swap_full())
1703 remove_exclusive_swap_page(page);
1704 unlock_page(page);
1705
1706 if (write_access) { 1706 if (write_access) {
1707 if (do_wp_page(mm, vma, address, 1707 if (do_wp_page(mm, vma, address,
1708 page_table, pmd, pte) == VM_FAULT_OOM) 1708 page_table, pmd, pte) == VM_FAULT_OOM)
diff --git a/mm/rmap.c b/mm/rmap.c
index 9827409eb7c7..89770bd25f31 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -539,27 +539,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
539 goto out_unmap; 539 goto out_unmap;
540 } 540 }
541 541
542 /*
543 * Don't pull an anonymous page out from under get_user_pages.
544 * GUP carefully breaks COW and raises page count (while holding
545 * page_table_lock, as we have here) to make sure that the page
546 * cannot be freed. If we unmap that page here, a user write
547 * access to the virtual address will bring back the page, but
548 * its raised count will (ironically) be taken to mean it's not
549 * an exclusive swap page, do_wp_page will replace it by a copy
550 * page, and the user never get to see the data GUP was holding
551 * the original page for.
552 *
553 * This test is also useful for when swapoff (unuse_process) has
554 * to drop page lock: its reference to the page stops existing
555 * ptes from being unmapped, so swapoff can make progress.
556 */
557 if (PageSwapCache(page) &&
558 page_count(page) != page_mapcount(page) + 2) {
559 ret = SWAP_FAIL;
560 goto out_unmap;
561 }
562
563 /* Nuke the page table entry. */ 542 /* Nuke the page table entry. */
564 flush_cache_page(vma, address, page_to_pfn(page)); 543 flush_cache_page(vma, address, page_to_pfn(page));
565 pteval = ptep_clear_flush(vma, address, pte); 544 pteval = ptep_clear_flush(vma, address, pte);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index da48405cd9a3..60cd24a55204 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -276,61 +276,37 @@ void swap_free(swp_entry_t entry)
276} 276}
277 277
278/* 278/*
279 * Check if we're the only user of a swap page, 279 * How many references to page are currently swapped out?
280 * when the page is locked.
281 */ 280 */
282static int exclusive_swap_page(struct page *page) 281static inline int page_swapcount(struct page *page)
283{ 282{
284 int retval = 0; 283 int count = 0;
285 struct swap_info_struct * p; 284 struct swap_info_struct *p;
286 swp_entry_t entry; 285 swp_entry_t entry;
287 286
288 entry.val = page->private; 287 entry.val = page->private;
289 p = swap_info_get(entry); 288 p = swap_info_get(entry);
290 if (p) { 289 if (p) {
291 /* Is the only swap cache user the cache itself? */ 290 /* Subtract the 1 for the swap cache itself */
292 if (p->swap_map[swp_offset(entry)] == 1) { 291 count = p->swap_map[swp_offset(entry)] - 1;
293 /* Recheck the page count with the swapcache lock held.. */
294 write_lock_irq(&swapper_space.tree_lock);
295 if (page_count(page) == 2)
296 retval = 1;
297 write_unlock_irq(&swapper_space.tree_lock);
298 }
299 swap_info_put(p); 292 swap_info_put(p);
300 } 293 }
301 return retval; 294 return count;
302} 295}
303 296
304/* 297/*
305 * We can use this swap cache entry directly 298 * We can use this swap cache entry directly
306 * if there are no other references to it. 299 * if there are no other references to it.
307 *
308 * Here "exclusive_swap_page()" does the real
309 * work, but we opportunistically check whether
310 * we need to get all the locks first..
311 */ 300 */
312int can_share_swap_page(struct page *page) 301int can_share_swap_page(struct page *page)
313{ 302{
314 int retval = 0; 303 int count;
315 304
316 if (!PageLocked(page)) 305 BUG_ON(!PageLocked(page));
317 BUG(); 306 count = page_mapcount(page);
318 switch (page_count(page)) { 307 if (count <= 1 && PageSwapCache(page))
319 case 3: 308 count += page_swapcount(page);
320 if (!PagePrivate(page)) 309 return count == 1;
321 break;
322 /* Fallthrough */
323 case 2:
324 if (!PageSwapCache(page))
325 break;
326 retval = exclusive_swap_page(page);
327 break;
328 case 1:
329 if (PageReserved(page))
330 break;
331 retval = 1;
332 }
333 return retval;
334} 310}
335 311
336/* 312/*
@@ -529,9 +505,10 @@ static int unuse_mm(struct mm_struct *mm,
529 505
530 if (!down_read_trylock(&mm->mmap_sem)) { 506 if (!down_read_trylock(&mm->mmap_sem)) {
531 /* 507 /*
532 * Our reference to the page stops try_to_unmap_one from 508 * Activate page so shrink_cache is unlikely to unmap its
533 * unmapping its ptes, so swapoff can make progress. 509 * ptes while lock is dropped, so swapoff can make progress.
534 */ 510 */
511 activate_page(page);
535 unlock_page(page); 512 unlock_page(page);
536 down_read(&mm->mmap_sem); 513 down_read(&mm->mmap_sem);
537 lock_page(page); 514 lock_page(page);