aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2008-07-25 22:45:30 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-07-26 15:00:06 -0400
commite286781d5f2e9c846e012a39653a166e9d31777d (patch)
tree14958fe6d8f3e0459c96c68b3034ea2433ab85ac /mm
parent47feff2c8eefe85099f87c43d3096855f0085ca0 (diff)
mm: speculative page references
If we can be sure that elevating the page_count on a pagecache page will pin it, we can speculatively run this operation, and subsequently check to see if we hit the right page rather than relying on holding a lock or otherwise pinning a reference to the page. This can be done if get_page/put_page behaves consistently throughout the whole tree (ie. if we "get" the page after it has been used for something else, we must be able to free it with a put_page). Actually, there is a period where the count behaves differently: when the page is free or if it is a constituent page of a compound page. We need an atomic_inc_not_zero operation to ensure we don't try to grab the page in either case. This patch introduces the core locking protocol to the pagecache (ie. adds page_cache_get_speculative, and tweaks some update-side code to make it work). Thanks to Hugh for pointing out an improvement to the algorithm setting page_count to zero when we have control of all references, in order to hold off speculative getters. [kamezawa.hiroyu@jp.fujitsu.com: fix migration_entry_wait()] [hugh@veritas.com: fix add_to_page_cache] [akpm@linux-foundation.org: repair a comment] Signed-off-by: Nick Piggin <npiggin@suse.de> Cc: Jeff Garzik <jeff@garzik.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Hugh Dickins <hugh@veritas.com> Cc: "Paul E. McKenney" <paulmck@us.ibm.com> Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Hugh Dickins <hugh@veritas.com> Acked-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c32
-rw-r--r--mm/migrate.c20
-rw-r--r--mm/shmem.c6
-rw-r--r--mm/swap_state.c17
-rw-r--r--mm/vmscan.c74
5 files changed, 105 insertions, 44 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d3ec1ffc66e..4e182a9a14c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -442,39 +442,43 @@ int filemap_write_and_wait_range(struct address_space *mapping,
442} 442}
443 443
444/** 444/**
445 * add_to_page_cache - add newly allocated pagecache pages 445 * add_to_page_cache_locked - add a locked page to the pagecache
446 * @page: page to add 446 * @page: page to add
447 * @mapping: the page's address_space 447 * @mapping: the page's address_space
448 * @offset: page index 448 * @offset: page index
449 * @gfp_mask: page allocation mode 449 * @gfp_mask: page allocation mode
450 * 450 *
451 * This function is used to add newly allocated pagecache pages; 451 * This function is used to add a page to the pagecache. It must be locked.
452 * the page is new, so we can just run SetPageLocked() against it.
453 * The other page state flags were set by rmqueue().
454 *
455 * This function does not add the page to the LRU. The caller must do that. 452 * This function does not add the page to the LRU. The caller must do that.
456 */ 453 */
457int add_to_page_cache(struct page *page, struct address_space *mapping, 454int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
458 pgoff_t offset, gfp_t gfp_mask) 455 pgoff_t offset, gfp_t gfp_mask)
459{ 456{
460 int error = mem_cgroup_cache_charge(page, current->mm, 457 int error;
458
459 VM_BUG_ON(!PageLocked(page));
460
461 error = mem_cgroup_cache_charge(page, current->mm,
461 gfp_mask & ~__GFP_HIGHMEM); 462 gfp_mask & ~__GFP_HIGHMEM);
462 if (error) 463 if (error)
463 goto out; 464 goto out;
464 465
465 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 466 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
466 if (error == 0) { 467 if (error == 0) {
468 page_cache_get(page);
469 page->mapping = mapping;
470 page->index = offset;
471
467 write_lock_irq(&mapping->tree_lock); 472 write_lock_irq(&mapping->tree_lock);
468 error = radix_tree_insert(&mapping->page_tree, offset, page); 473 error = radix_tree_insert(&mapping->page_tree, offset, page);
469 if (!error) { 474 if (likely(!error)) {
470 page_cache_get(page);
471 SetPageLocked(page);
472 page->mapping = mapping;
473 page->index = offset;
474 mapping->nrpages++; 475 mapping->nrpages++;
475 __inc_zone_page_state(page, NR_FILE_PAGES); 476 __inc_zone_page_state(page, NR_FILE_PAGES);
476 } else 477 } else {
478 page->mapping = NULL;
477 mem_cgroup_uncharge_cache_page(page); 479 mem_cgroup_uncharge_cache_page(page);
480 page_cache_release(page);
481 }
478 482
479 write_unlock_irq(&mapping->tree_lock); 483 write_unlock_irq(&mapping->tree_lock);
480 radix_tree_preload_end(); 484 radix_tree_preload_end();
@@ -483,7 +487,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
483out: 487out:
484 return error; 488 return error;
485} 489}
486EXPORT_SYMBOL(add_to_page_cache); 490EXPORT_SYMBOL(add_to_page_cache_locked);
487 491
488int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 492int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
489 pgoff_t offset, gfp_t gfp_mask) 493 pgoff_t offset, gfp_t gfp_mask)
diff --git a/mm/migrate.c b/mm/migrate.c
index d8c65a65c61d..3ca6392e82cc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
285 285
286 page = migration_entry_to_page(entry); 286 page = migration_entry_to_page(entry);
287 287
288 get_page(page); 288 /*
289 * Once radix-tree replacement of page migration started, page_count
290 * *must* be zero. And, we don't want to call wait_on_page_locked()
291 * against a page without get_page().
292 * So, we use get_page_unless_zero(), here. Even failed, page fault
293 * will occur again.
294 */
295 if (!get_page_unless_zero(page))
296 goto out;
289 pte_unmap_unlock(ptep, ptl); 297 pte_unmap_unlock(ptep, ptl);
290 wait_on_page_locked(page); 298 wait_on_page_locked(page);
291 put_page(page); 299 put_page(page);
@@ -305,6 +313,7 @@ out:
305static int migrate_page_move_mapping(struct address_space *mapping, 313static int migrate_page_move_mapping(struct address_space *mapping,
306 struct page *newpage, struct page *page) 314 struct page *newpage, struct page *page)
307{ 315{
316 int expected_count;
308 void **pslot; 317 void **pslot;
309 318
310 if (!mapping) { 319 if (!mapping) {
@@ -319,12 +328,18 @@ static int migrate_page_move_mapping(struct address_space *mapping,
319 pslot = radix_tree_lookup_slot(&mapping->page_tree, 328 pslot = radix_tree_lookup_slot(&mapping->page_tree,
320 page_index(page)); 329 page_index(page));
321 330
322 if (page_count(page) != 2 + !!PagePrivate(page) || 331 expected_count = 2 + !!PagePrivate(page);
332 if (page_count(page) != expected_count ||
323 (struct page *)radix_tree_deref_slot(pslot) != page) { 333 (struct page *)radix_tree_deref_slot(pslot) != page) {
324 write_unlock_irq(&mapping->tree_lock); 334 write_unlock_irq(&mapping->tree_lock);
325 return -EAGAIN; 335 return -EAGAIN;
326 } 336 }
327 337
338 if (!page_freeze_refs(page, expected_count)) {
339 write_unlock_irq(&mapping->tree_lock);
340 return -EAGAIN;
341 }
342
328 /* 343 /*
329 * Now we know that no one else is looking at the page. 344 * Now we know that no one else is looking at the page.
330 */ 345 */
@@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
338 353
339 radix_tree_replace_slot(pslot, newpage); 354 radix_tree_replace_slot(pslot, newpage);
340 355
356 page_unfreeze_refs(page, expected_count);
341 /* 357 /*
342 * Drop cache reference from old page. 358 * Drop cache reference from old page.
343 * We know this isn't the last reference. 359 * We know this isn't the last reference.
diff --git a/mm/shmem.c b/mm/shmem.c
index f92fea94d037..1089092aecaf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -936,7 +936,7 @@ found:
936 spin_lock(&info->lock); 936 spin_lock(&info->lock);
937 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
938 if (ptr && ptr->val == entry.val) { 938 if (ptr && ptr->val == entry.val) {
939 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache_locked(page, inode->i_mapping,
940 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */ 941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */ 942 } else /* we must compensate for our precharge above */
@@ -1301,8 +1301,8 @@ repeat:
1301 SetPageUptodate(filepage); 1301 SetPageUptodate(filepage);
1302 set_page_dirty(filepage); 1302 set_page_dirty(filepage);
1303 swap_free(swap); 1303 swap_free(swap);
1304 } else if (!(error = add_to_page_cache( 1304 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1305 swappage, mapping, idx, GFP_NOWAIT))) { 1305 idx, GFP_NOWAIT))) {
1306 info->flags |= SHMEM_PAGEIN; 1306 info->flags |= SHMEM_PAGEIN;
1307 shmem_swp_set(info, entry, 0); 1307 shmem_swp_set(info, entry, 0);
1308 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..3e3381d6c7ee 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -64,7 +64,7 @@ void show_swap_cache_info(void)
64} 64}
65 65
66/* 66/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 67 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 68 * but sets SwapCache flag and private instead of mapping and index.
69 */ 69 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +76,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
76 BUG_ON(PagePrivate(page)); 76 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask); 77 error = radix_tree_preload(gfp_mask);
78 if (!error) { 78 if (!error) {
79 page_cache_get(page);
80 SetPageSwapCache(page);
81 set_page_private(page, entry.val);
82
79 write_lock_irq(&swapper_space.tree_lock); 83 write_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 84 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 85 entry.val, page);
82 if (!error) { 86 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 87 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 88 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 89 INC_CACHE_INFO(add_total);
89 } 90 }
90 write_unlock_irq(&swapper_space.tree_lock); 91 write_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 92 radix_tree_preload_end();
93
94 if (unlikely(error)) {
95 set_page_private(page, 0UL);
96 ClearPageSwapCache(page);
97 page_cache_release(page);
98 }
92 } 99 }
93 return error; 100 return error;
94} 101}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26672c6cd3ce..0075eac1cd04 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -391,12 +391,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
391} 391}
392 392
393/* 393/*
394 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 394 * Same as remove_mapping, but if the page is removed from the mapping, it
395 * someone else has a ref on the page, abort and return 0. If it was 395 * gets returned with a refcount of 0.
396 * successfully detached, return 1. Assumes the caller has a single ref on
397 * this page.
398 */ 396 */
399int remove_mapping(struct address_space *mapping, struct page *page) 397static int __remove_mapping(struct address_space *mapping, struct page *page)
400{ 398{
401 BUG_ON(!PageLocked(page)); 399 BUG_ON(!PageLocked(page));
402 BUG_ON(mapping != page_mapping(page)); 400 BUG_ON(mapping != page_mapping(page));
@@ -427,24 +425,24 @@ int remove_mapping(struct address_space *mapping, struct page *page)
427 * Note that if SetPageDirty is always performed via set_page_dirty, 425 * Note that if SetPageDirty is always performed via set_page_dirty,
428 * and thus under tree_lock, then this ordering is not required. 426 * and thus under tree_lock, then this ordering is not required.
429 */ 427 */
430 if (unlikely(page_count(page) != 2)) 428 if (!page_freeze_refs(page, 2))
431 goto cannot_free; 429 goto cannot_free;
432 smp_rmb(); 430 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
433 if (unlikely(PageDirty(page))) 431 if (unlikely(PageDirty(page))) {
432 page_unfreeze_refs(page, 2);
434 goto cannot_free; 433 goto cannot_free;
434 }
435 435
436 if (PageSwapCache(page)) { 436 if (PageSwapCache(page)) {
437 swp_entry_t swap = { .val = page_private(page) }; 437 swp_entry_t swap = { .val = page_private(page) };
438 __delete_from_swap_cache(page); 438 __delete_from_swap_cache(page);
439 write_unlock_irq(&mapping->tree_lock); 439 write_unlock_irq(&mapping->tree_lock);
440 swap_free(swap); 440 swap_free(swap);
441 __put_page(page); /* The pagecache ref */ 441 } else {
442 return 1; 442 __remove_from_page_cache(page);
443 write_unlock_irq(&mapping->tree_lock);
443 } 444 }
444 445
445 __remove_from_page_cache(page);
446 write_unlock_irq(&mapping->tree_lock);
447 __put_page(page);
448 return 1; 446 return 1;
449 447
450cannot_free: 448cannot_free:
@@ -453,6 +451,26 @@ cannot_free:
453} 451}
454 452
455/* 453/*
454 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
455 * someone else has a ref on the page, abort and return 0. If it was
456 * successfully detached, return 1. Assumes the caller has a single ref on
457 * this page.
458 */
459int remove_mapping(struct address_space *mapping, struct page *page)
460{
461 if (__remove_mapping(mapping, page)) {
462 /*
463 * Unfreezing the refcount with 1 rather than 2 effectively
464 * drops the pagecache ref for us without requiring another
465 * atomic operation.
466 */
467 page_unfreeze_refs(page, 1);
468 return 1;
469 }
470 return 0;
471}
472
473/*
456 * shrink_page_list() returns the number of reclaimed pages 474 * shrink_page_list() returns the number of reclaimed pages
457 */ 475 */
458static unsigned long shrink_page_list(struct list_head *page_list, 476static unsigned long shrink_page_list(struct list_head *page_list,
@@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
598 if (PagePrivate(page)) { 616 if (PagePrivate(page)) {
599 if (!try_to_release_page(page, sc->gfp_mask)) 617 if (!try_to_release_page(page, sc->gfp_mask))
600 goto activate_locked; 618 goto activate_locked;
601 if (!mapping && page_count(page) == 1) 619 if (!mapping && page_count(page) == 1) {
602 goto free_it; 620 unlock_page(page);
621 if (put_page_testzero(page))
622 goto free_it;
623 else {
624 /*
625 * rare race with speculative reference.
626 * the speculative reference will free
627 * this page shortly, so we may
628 * increment nr_reclaimed here (and
629 * leave it off the LRU).
630 */
631 nr_reclaimed++;
632 continue;
633 }
634 }
603 } 635 }
604 636
605 if (!mapping || !remove_mapping(mapping, page)) 637 if (!mapping || !__remove_mapping(mapping, page))
606 goto keep_locked; 638 goto keep_locked;
607 639
608free_it:
609 unlock_page(page); 640 unlock_page(page);
641free_it:
610 nr_reclaimed++; 642 nr_reclaimed++;
611 if (!pagevec_add(&freed_pvec, page)) 643 if (!pagevec_add(&freed_pvec, page)) {
612 __pagevec_release_nonlru(&freed_pvec); 644 __pagevec_free(&freed_pvec);
645 pagevec_reinit(&freed_pvec);
646 }
613 continue; 647 continue;
614 648
615activate_locked: 649activate_locked:
@@ -623,7 +657,7 @@ keep:
623 } 657 }
624 list_splice(&ret_pages, page_list); 658 list_splice(&ret_pages, page_list);
625 if (pagevec_count(&freed_pvec)) 659 if (pagevec_count(&freed_pvec))
626 __pagevec_release_nonlru(&freed_pvec); 660 __pagevec_free(&freed_pvec);
627 count_vm_events(PGACTIVATE, pgactivate); 661 count_vm_events(PGACTIVATE, pgactivate);
628 return nr_reclaimed; 662 return nr_reclaimed;
629} 663}