author     Nick Piggin <npiggin@suse.de>                    2008-07-25 22:45:30 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2008-07-26 15:00:06 -0400
commit     e286781d5f2e9c846e012a39653a166e9d31777d (patch)
tree       14958fe6d8f3e0459c96c68b3034ea2433ab85ac /mm
parent     47feff2c8eefe85099f87c43d3096855f0085ca0 (diff)
mm: speculative page references
If we can be sure that elevating the page_count on a pagecache page will
pin it, we can speculatively run this operation, and subsequently check to
see if we hit the right page rather than relying on holding a lock or
otherwise pinning a reference to the page.
This can be done if get_page/put_page behaves consistently throughout the
whole tree (ie. if we "get" the page after it has been used for something
else, we must be able to free it with a put_page).
Actually, there is a period where the count behaves differently: when the
page is free or if it is a constituent page of a compound page. We need
an atomic_inc_not_zero operation to ensure we don't try to grab the page
in either case.
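
(As a rough illustration, and not part of the patch text itself: the primitive in question is get_page_unless_zero(). A minimal sketch of the idea, assuming the struct page of this era with an atomic _count field; the real helper lives in include/linux/mm.h and may carry extra debug checks.)

        /*
         * Illustrative sketch only (cf. get_page_unless_zero() in
         * include/linux/mm.h): take a reference only if the page still
         * has one, so a free page -- or a page whose count has been
         * frozen to zero -- can never be pinned by accident.
         */
        static inline int speculative_get(struct page *page)
        {
                return atomic_inc_not_zero(&page->_count);
        }

The caller must then re-check that the page it pinned is still the one it
looked up, and drop the reference and retry if it is not.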
This patch introduces the core locking protocol to the pagecache (ie.
adds page_cache_get_speculative, and tweaks some update-side code to make
it work).
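
For context, the read side that this update-side protocol enables looks roughly
like the sketch below. This is a simplified, hypothetical lookup, not code from
this patch: page_cache_get_speculative() itself is added in
include/linux/pagemap.h (outside the mm/ diffstat shown here), and the actual
lockless find_get_page() conversion arrives in a follow-up patch once the radix
tree is safe to walk under RCU.

        #include <linux/pagemap.h>      /* page_cache_get_speculative(), page_cache_release() */
        #include <linux/radix-tree.h>   /* radix_tree_lookup_slot(), radix_tree_deref_slot() */
        #include <linux/rcupdate.h>     /* rcu_read_lock(), rcu_read_unlock() */

        /* Hypothetical lockless lookup built on the speculative-reference protocol. */
        static struct page *speculative_lookup(struct address_space *mapping,
                                               pgoff_t offset)
        {
                struct page *page;
                void **pagep;

                rcu_read_lock();
        repeat:
                page = NULL;
                pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
                if (pagep) {
                        page = radix_tree_deref_slot(pagep);
                        if (page && !page_cache_get_speculative(page))
                                goto repeat;    /* lost a race with free */
                        /*
                         * Recheck that the page is still in this slot: it may
                         * have been removed or replaced between the lookup and
                         * the speculative get.  If so, drop the reference we
                         * just took and retry.
                         */
                        if (page && unlikely(page != *pagep)) {
                                page_cache_release(page);
                                goto repeat;
                        }
                }
                rcu_read_unlock();

                return page;    /* NULL, or a page pinned by our reference */
        }

In the common case the cost is a single atomic increment and a re-check, with
no acquisition of mapping->tree_lock on the read side.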
Thanks to Hugh for pointing out an improvement to the algorithm: set
page_count to zero when we have control of all references, in order to
hold off speculative getters.
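
This works because the count can be "frozen" to zero atomically only when the
caller already owns every outstanding reference; once it is zero, speculative
getters (atomic_inc_not_zero) fail and back off. The patch adds this as
page_freeze_refs()/page_unfreeze_refs(); a hedged sketch of roughly their
shape, again assuming the atomic _count field:

        /*
         * Sketch: freezing succeeds only if exactly 'count' references exist
         * and we own all of them; the count becomes 0, which blocks
         * speculative getters until the page is unfrozen.
         */
        static inline int page_freeze_refs(struct page *page, int count)
        {
                return atomic_cmpxchg(&page->_count, count, 0) == count;
        }

        static inline void page_unfreeze_refs(struct page *page, int count)
        {
                VM_BUG_ON(page_count(page) != 0);       /* must still be frozen */
                atomic_set(&page->_count, count);
        }

Note how remove_mapping() in the mm/vmscan.c hunk below unfreezes with 1 rather
than 2: that drops the pagecache reference as a side effect of thawing, without
a separate atomic operation.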
[kamezawa.hiroyu@jp.fujitsu.com: fix migration_entry_wait()]
[hugh@veritas.com: fix add_to_page_cache]
[akpm@linux-foundation.org: repair a comment]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c     32
-rw-r--r--  mm/migrate.c     20
-rw-r--r--  mm/shmem.c        6
-rw-r--r--  mm/swap_state.c  17
-rw-r--r--  mm/vmscan.c      74
5 files changed, 105 insertions, 44 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d3ec1ffc66e..4e182a9a14c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -442,39 +442,43 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 }
 
 /**
- * add_to_page_cache - add newly allocated pagecache pages
+ * add_to_page_cache_locked - add a locked page to the pagecache
  * @page: page to add
  * @mapping: the page's address_space
  * @offset: page index
  * @gfp_mask: page allocation mode
  *
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
+ * This function is used to add a page to the pagecache. It must be locked.
  * This function does not add the page to the LRU. The caller must do that.
  */
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                 pgoff_t offset, gfp_t gfp_mask)
 {
-        int error = mem_cgroup_cache_charge(page, current->mm,
+        int error;
+
+        VM_BUG_ON(!PageLocked(page));
+
+        error = mem_cgroup_cache_charge(page, current->mm,
                                         gfp_mask & ~__GFP_HIGHMEM);
         if (error)
                 goto out;
 
         error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
         if (error == 0) {
+                page_cache_get(page);
+                page->mapping = mapping;
+                page->index = offset;
+
                 write_lock_irq(&mapping->tree_lock);
                 error = radix_tree_insert(&mapping->page_tree, offset, page);
-                if (!error) {
-                        page_cache_get(page);
-                        SetPageLocked(page);
-                        page->mapping = mapping;
-                        page->index = offset;
+                if (likely(!error)) {
                         mapping->nrpages++;
                         __inc_zone_page_state(page, NR_FILE_PAGES);
-                } else
+                } else {
+                        page->mapping = NULL;
                         mem_cgroup_uncharge_cache_page(page);
+                        page_cache_release(page);
+                }
 
                 write_unlock_irq(&mapping->tree_lock);
                 radix_tree_preload_end();
@@ -483,7 +487,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 out:
         return error;
 }
-EXPORT_SYMBOL(add_to_page_cache);
+EXPORT_SYMBOL(add_to_page_cache_locked);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                 pgoff_t offset, gfp_t gfp_mask)
diff --git a/mm/migrate.c b/mm/migrate.c
index d8c65a65c61d..3ca6392e82cc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 
         page = migration_entry_to_page(entry);
 
-        get_page(page);
+        /*
+         * Once radix-tree replacement of page migration started, page_count
+         * *must* be zero. And, we don't want to call wait_on_page_locked()
+         * against a page without get_page().
+         * So, we use get_page_unless_zero(), here. Even failed, page fault
+         * will occur again.
+         */
+        if (!get_page_unless_zero(page))
+                goto out;
         pte_unmap_unlock(ptep, ptl);
         wait_on_page_locked(page);
         put_page(page);
@@ -305,6 +313,7 @@ out:
 static int migrate_page_move_mapping(struct address_space *mapping,
                 struct page *newpage, struct page *page)
 {
+        int expected_count;
         void **pslot;
 
         if (!mapping) {
@@ -319,12 +328,18 @@ static int migrate_page_move_mapping(struct address_space *mapping,
         pslot = radix_tree_lookup_slot(&mapping->page_tree,
                                         page_index(page));
 
-        if (page_count(page) != 2 + !!PagePrivate(page) ||
+        expected_count = 2 + !!PagePrivate(page);
+        if (page_count(page) != expected_count ||
                 (struct page *)radix_tree_deref_slot(pslot) != page) {
                 write_unlock_irq(&mapping->tree_lock);
                 return -EAGAIN;
         }
 
+        if (!page_freeze_refs(page, expected_count)) {
+                write_unlock_irq(&mapping->tree_lock);
+                return -EAGAIN;
+        }
+
         /*
          * Now we know that no one else is looking at the page.
          */
@@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
         radix_tree_replace_slot(pslot, newpage);
 
+        page_unfreeze_refs(page, expected_count);
         /*
          * Drop cache reference from old page.
          * We know this isn't the last reference.
diff --git a/mm/shmem.c b/mm/shmem.c
index f92fea94d037..1089092aecaf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -936,7 +936,7 @@ found:
         spin_lock(&info->lock);
         ptr = shmem_swp_entry(info, idx, NULL);
         if (ptr && ptr->val == entry.val) {
-                error = add_to_page_cache(page, inode->i_mapping,
+                error = add_to_page_cache_locked(page, inode->i_mapping,
                                                 idx, GFP_NOWAIT);
                 /* does mem_cgroup_uncharge_cache_page on error */
         } else  /* we must compensate for our precharge above */
@@ -1301,8 +1301,8 @@ repeat:
                         SetPageUptodate(filepage);
                         set_page_dirty(filepage);
                         swap_free(swap);
-                } else if (!(error = add_to_page_cache(
-                                        swappage, mapping, idx, GFP_NOWAIT))) {
+                } else if (!(error = add_to_page_cache_locked(swappage, mapping,
+                                        idx, GFP_NOWAIT))) {
                         info->flags |= SHMEM_PAGEIN;
                         shmem_swp_set(info, entry, 0);
                         shmem_swp_unmap(entry);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..3e3381d6c7ee 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -64,7 +64,7 @@ void show_swap_cache_info(void)
 }
 
 /*
- * add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
  */
 int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +76,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
         BUG_ON(PagePrivate(page));
         error = radix_tree_preload(gfp_mask);
         if (!error) {
+                page_cache_get(page);
+                SetPageSwapCache(page);
+                set_page_private(page, entry.val);
+
                 write_lock_irq(&swapper_space.tree_lock);
                 error = radix_tree_insert(&swapper_space.page_tree,
                                                 entry.val, page);
-                if (!error) {
-                        page_cache_get(page);
-                        SetPageSwapCache(page);
-                        set_page_private(page, entry.val);
+                if (likely(!error)) {
                         total_swapcache_pages++;
                         __inc_zone_page_state(page, NR_FILE_PAGES);
                         INC_CACHE_INFO(add_total);
                 }
                 write_unlock_irq(&swapper_space.tree_lock);
                 radix_tree_preload_end();
+
+                if (unlikely(error)) {
+                        set_page_private(page, 0UL);
+                        ClearPageSwapCache(page);
+                        page_cache_release(page);
+                }
         }
         return error;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26672c6cd3ce..0075eac1cd04 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -391,12 +391,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 }
 
 /*
- * Attempt to detach a locked page from its ->mapping. If it is dirty or if
- * someone else has a ref on the page, abort and return 0. If it was
- * successfully detached, return 1. Assumes the caller has a single ref on
- * this page.
+ * Same as remove_mapping, but if the page is removed from the mapping, it
+ * gets returned with a refcount of 0.
  */
-int remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page)
 {
         BUG_ON(!PageLocked(page));
         BUG_ON(mapping != page_mapping(page));
@@ -427,24 +425,24 @@ int remove_mapping(struct address_space *mapping, struct page *page)
          * Note that if SetPageDirty is always performed via set_page_dirty,
          * and thus under tree_lock, then this ordering is not required.
          */
-        if (unlikely(page_count(page) != 2))
+        if (!page_freeze_refs(page, 2))
                 goto cannot_free;
-        smp_rmb();
-        if (unlikely(PageDirty(page)))
+        /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+        if (unlikely(PageDirty(page))) {
+                page_unfreeze_refs(page, 2);
                 goto cannot_free;
+        }
 
         if (PageSwapCache(page)) {
                 swp_entry_t swap = { .val = page_private(page) };
                 __delete_from_swap_cache(page);
                 write_unlock_irq(&mapping->tree_lock);
                 swap_free(swap);
-                __put_page(page);       /* The pagecache ref */
-                return 1;
+        } else {
+                __remove_from_page_cache(page);
+                write_unlock_irq(&mapping->tree_lock);
         }
 
-        __remove_from_page_cache(page);
-        write_unlock_irq(&mapping->tree_lock);
-        __put_page(page);
         return 1;
 
 cannot_free:
@@ -453,6 +451,26 @@ cannot_free:
 }
 
 /*
+ * Attempt to detach a locked page from its ->mapping. If it is dirty or if
+ * someone else has a ref on the page, abort and return 0. If it was
+ * successfully detached, return 1. Assumes the caller has a single ref on
+ * this page.
+ */
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+        if (__remove_mapping(mapping, page)) {
+                /*
+                 * Unfreezing the refcount with 1 rather than 2 effectively
+                 * drops the pagecache ref for us without requiring another
+                 * atomic operation.
+                 */
+                page_unfreeze_refs(page, 1);
+                return 1;
+        }
+        return 0;
+}
+
+/*
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
@@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 if (PagePrivate(page)) {
                         if (!try_to_release_page(page, sc->gfp_mask))
                                 goto activate_locked;
-                        if (!mapping && page_count(page) == 1)
-                                goto free_it;
+                        if (!mapping && page_count(page) == 1) {
+                                unlock_page(page);
+                                if (put_page_testzero(page))
+                                        goto free_it;
+                                else {
+                                        /*
+                                         * rare race with speculative reference.
+                                         * the speculative reference will free
+                                         * this page shortly, so we may
+                                         * increment nr_reclaimed here (and
+                                         * leave it off the LRU).
+                                         */
+                                        nr_reclaimed++;
+                                        continue;
+                                }
+                        }
                 }
 
-                if (!mapping || !remove_mapping(mapping, page))
+                if (!mapping || !__remove_mapping(mapping, page))
                         goto keep_locked;
 
-free_it:
                 unlock_page(page);
+free_it:
                 nr_reclaimed++;
-                if (!pagevec_add(&freed_pvec, page))
-                        __pagevec_release_nonlru(&freed_pvec);
+                if (!pagevec_add(&freed_pvec, page)) {
+                        __pagevec_free(&freed_pvec);
+                        pagevec_reinit(&freed_pvec);
+                }
                 continue;
 
 activate_locked:
@@ -623,7 +657,7 @@ keep:
         }
         list_splice(&ret_pages, page_list);
         if (pagevec_count(&freed_pvec))
-                __pagevec_release_nonlru(&freed_pvec);
+                __pagevec_free(&freed_pvec);
         count_vm_events(PGACTIVATE, pgactivate);
         return nr_reclaimed;
 }