author    Johannes Weiner <hannes@cmpxchg.org>          2014-04-03 17:47:51 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>        2014-04-03 19:21:01 -0400
commit    a528910e12ec7ee203095eb1711468a66b9b60b0 (patch)
tree      c9ceed84994f015e991161c91b87d47d93cd6491 /mm
parent    91b0abe36a7b2b3b02d7500925a5f8455334f0e5 (diff)
mm: thrash detection-based file cache sizing
The VM maintains cached filesystem pages on two types of lists. One list holds the pages recently faulted into the cache, the other list holds pages that have been referenced repeatedly on that first list. The idea is to prefer reclaiming young pages over those that have been shown to benefit from caching in the past. We call the recently used list the "inactive list" and the frequently used list the "active list".

The historical approach of trickling a few pages from the tail of the active list back to the inactive list on every reclaim scan ultimately was not significantly better than a FIFO policy and still thrashed the cache based on eviction speed, rather than actual demand for cache.

This patch solves one half of the problem by decoupling the ability to detect working set changes from the inactive list size. By maintaining a history of recently evicted file pages it can detect frequently used pages with an arbitrarily small inactive list size, and subsequently apply pressure on the active list based on actual demand for cache, not just overall eviction speed.

Every zone maintains a counter that tracks inactive list aging speed. When a page is evicted, a snapshot of this counter is stored in the now-empty page cache radix tree slot. On refault, the minimum access distance of the page can be assessed, to evaluate whether the page should be part of the active list or not.

This fixes the VM's blindness towards working set changes in excess of the inactive list. And it is the foundation for further improving the protection ability and for reducing the 50% minimum inactive list size.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Bob Liu <bob.liu@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Metin Doslu <metin@citusdata.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ozgun Erdogan <ozgun@citusdata.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin <klamm@yandex-team.ru>
Cc: Ryan Mallon <rmallon@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
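The rule described above can be summarized as: take a snapshot of a per-zone aging counter when a page is evicted, and on refault activate the page if the counter has advanced by no more than the number of active file pages. Below is a minimal userspace sketch of that rule, not the kernel implementation; it assumes a single zone and ignores the bit-packing of the real shadow entries, and the helpers record_eviction() and should_activate() are illustrative names that do not exist in the kernel.

/*
 * Minimal model of the refault-distance rule: shadow entries are
 * counter snapshots, and a refault is "close enough" to activate
 * when the counter advanced by at most the active list size.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long inactive_age;   /* evictions + activations, per zone */
static unsigned long nr_active_file; /* size of the active file list */

/* On eviction, snapshot the aging counter as a "shadow" of the page. */
static unsigned long record_eviction(void)
{
        return ++inactive_age;
}

/* On refault, compare the distance against the active list size. */
static bool should_activate(unsigned long shadow)
{
        unsigned long refault_distance = inactive_age - shadow;

        return refault_distance <= nr_active_file;
}

int main(void)
{
        unsigned long shadow;

        nr_active_file = 100;

        shadow = record_eviction();  /* page evicted at age N */
        inactive_age += 50;          /* 50 more evictions/activations pass */

        /* distance 50 <= 100, so the refaulting page gets activated */
        printf("activate on refault: %s\n",
               should_activate(shadow) ? "yes" : "no");
        return 0;
}

Run as-is, the sketch reports "yes" because the refault distance (50) does not exceed the assumed active list size (100).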
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile        2
-rw-r--r--  mm/filemap.c      61
-rw-r--r--  mm/swap.c          2
-rw-r--r--  mm/vmscan.c       24
-rw-r--r--  mm/vmstat.c        2
-rw-r--r--  mm/workingset.c  253
6 files changed, 321 insertions(+), 23 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 310c90a09264..cdd741519ee0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o balloon_compaction.o \
-			   interval_tree.o list_lru.o $(mmu-y)
+			   interval_tree.o list_lru.o workingset.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 05c44aa44188..a603c4d7d3c9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -469,7 +469,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
 static int page_cache_tree_insert(struct address_space *mapping,
-                                  struct page *page)
+                                  struct page *page, void **shadowp)
 {
         void **slot;
         int error;
@@ -484,6 +484,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
                 radix_tree_replace_slot(slot, page);
                 mapping->nrshadows--;
                 mapping->nrpages++;
+                if (shadowp)
+                        *shadowp = p;
                 return 0;
         }
         error = radix_tree_insert(&mapping->page_tree, page->index, page);
@@ -492,18 +494,10 @@ static int page_cache_tree_insert(struct address_space *mapping,
         return error;
 }
 
-/**
- * add_to_page_cache_locked - add a locked page to the pagecache
- * @page:       page to add
- * @mapping:    the page's address_space
- * @offset:     page index
- * @gfp_mask:   page allocation mode
- *
- * This function is used to add a page to the pagecache. It must be locked.
- * This function does not add the page to the LRU. The caller must do that.
- */
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-                pgoff_t offset, gfp_t gfp_mask)
+static int __add_to_page_cache_locked(struct page *page,
+                                      struct address_space *mapping,
+                                      pgoff_t offset, gfp_t gfp_mask,
+                                      void **shadowp)
 {
         int error;
 
@@ -526,7 +520,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
         page->index = offset;
 
         spin_lock_irq(&mapping->tree_lock);
-        error = page_cache_tree_insert(mapping, page);
+        error = page_cache_tree_insert(mapping, page, shadowp);
         radix_tree_preload_end();
         if (unlikely(error))
                 goto err_insert;
@@ -542,16 +536,49 @@ err_insert:
         page_cache_release(page);
         return error;
 }
+
+/**
+ * add_to_page_cache_locked - add a locked page to the pagecache
+ * @page:       page to add
+ * @mapping:    the page's address_space
+ * @offset:     page index
+ * @gfp_mask:   page allocation mode
+ *
+ * This function is used to add a page to the pagecache. It must be locked.
+ * This function does not add the page to the LRU. The caller must do that.
+ */
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+                             pgoff_t offset, gfp_t gfp_mask)
+{
+        return __add_to_page_cache_locked(page, mapping, offset,
+                                          gfp_mask, NULL);
+}
 EXPORT_SYMBOL(add_to_page_cache_locked);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                           pgoff_t offset, gfp_t gfp_mask)
 {
+        void *shadow = NULL;
         int ret;
 
-        ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-        if (ret == 0)
-                lru_cache_add_file(page);
+        __set_page_locked(page);
+        ret = __add_to_page_cache_locked(page, mapping, offset,
+                                         gfp_mask, &shadow);
+        if (unlikely(ret))
+                __clear_page_locked(page);
+        else {
+                /*
+                 * The page might have been evicted from cache only
+                 * recently, in which case it should be activated like
+                 * any other repeatedly accessed page.
+                 */
+                if (shadow && workingset_refault(shadow)) {
+                        SetPageActive(page);
+                        workingset_activation(page);
+                } else
+                        ClearPageActive(page);
+                lru_cache_add(page);
+        }
         return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
diff --git a/mm/swap.c b/mm/swap.c
index c8048d71c642..9ce43ba4498b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -574,6 +574,8 @@ void mark_page_accessed(struct page *page)
                 else
                         __lru_cache_activate_page(page);
                 ClearPageReferenced(page);
+                if (page_is_file_cache(page))
+                        workingset_activation(page);
         } else if (!PageReferenced(page)) {
                 SetPageReferenced(page);
         }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2a0bb8fdb259..1f56a80a7c41 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -523,7 +523,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
  * Same as remove_mapping, but if the page is removed from the mapping, it
  * gets returned with a refcount of 0.
  */
-static int __remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page,
+                            bool reclaimed)
 {
         BUG_ON(!PageLocked(page));
         BUG_ON(mapping != page_mapping(page));
@@ -569,10 +570,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
                 swapcache_free(swap, page);
         } else {
                 void (*freepage)(struct page *);
+                void *shadow = NULL;
 
                 freepage = mapping->a_ops->freepage;
-
-                __delete_from_page_cache(page, NULL);
+                /*
+                 * Remember a shadow entry for reclaimed file cache in
+                 * order to detect refaults, thus thrashing, later on.
+                 *
+                 * But don't store shadows in an address space that is
+                 * already exiting.  This is not just an optimization,
+                 * inode reclaim needs to empty out the radix tree or
+                 * the nodes are lost.  Don't plant shadows behind its
+                 * back.
+                 */
+                if (reclaimed && page_is_file_cache(page) &&
+                    !mapping_exiting(mapping))
+                        shadow = workingset_eviction(mapping, page);
+                __delete_from_page_cache(page, shadow);
                 spin_unlock_irq(&mapping->tree_lock);
                 mem_cgroup_uncharge_cache_page(page);
 
@@ -595,7 +609,7 @@ cannot_free:
  */
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-        if (__remove_mapping(mapping, page)) {
+        if (__remove_mapping(mapping, page, false)) {
                 /*
                  * Unfreezing the refcount with 1 rather than 2 effectively
                  * drops the pagecache ref for us without requiring another
@@ -1065,7 +1079,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 }
         }
 
-        if (!mapping || !__remove_mapping(mapping, page))
+        if (!mapping || !__remove_mapping(mapping, page, true))
                 goto keep_locked;
 
         /*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index def5dd2fbe61..843125a0e255 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -770,6 +770,8 @@ const char * const vmstat_text[] = {
770 "numa_local", 770 "numa_local",
771 "numa_other", 771 "numa_other",
772#endif 772#endif
773 "workingset_refault",
774 "workingset_activate",
773 "nr_anon_transparent_hugepages", 775 "nr_anon_transparent_hugepages",
774 "nr_free_cma", 776 "nr_free_cma",
775 "nr_dirty_threshold", 777 "nr_dirty_threshold",
diff --git a/mm/workingset.c b/mm/workingset.c
new file mode 100644
index 000000000000..8a6c7cff4923
--- /dev/null
+++ b/mm/workingset.c
@@ -0,0 +1,253 @@
+/*
+ * Workingset detection
+ *
+ * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
+ */
+
+#include <linux/memcontrol.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+/*
+ *              Double CLOCK lists
+ *
+ * Per zone, two clock lists are maintained for file pages: the
+ * inactive and the active list.  Freshly faulted pages start out at
+ * the head of the inactive list and page reclaim scans pages from the
+ * tail.  Pages that are accessed multiple times on the inactive list
+ * are promoted to the active list, to protect them from reclaim,
+ * whereas active pages are demoted to the inactive list when the
+ * active list grows too big.
+ *
+ *   fault ------------------------+
+ *                                 |
+ *              +--------------+   |            +-------------+
+ *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
+ *              +--------------+                +-------------+    |
+ *                     |                                           |
+ *                     +-------------- promotion ------------------+
+ *
+ *
+ *              Access frequency and refault distance
+ *
+ * A workload is thrashing when its pages are frequently used but they
+ * are evicted from the inactive list every time before another access
+ * would have promoted them to the active list.
+ *
+ * In cases where the average access distance between thrashing pages
+ * is bigger than the size of memory there is nothing that can be
+ * done - the thrashing set could never fit into memory under any
+ * circumstance.
+ *
+ * However, the average access distance could be bigger than the
+ * inactive list, yet smaller than the size of memory.  In this case,
+ * the set could fit into memory if it weren't for the currently
+ * active pages - which may be used more, hopefully less frequently:
+ *
+ *      +-memory available to cache-+
+ *      |                           |
+ *      +-inactive------+-active----+
+ *  a b | c d e f g h i | J K L M N |
+ *      +---------------+-----------+
+ *
+ * It is prohibitively expensive to accurately track access frequency
+ * of pages.  But a reasonable approximation can be made to measure
+ * thrashing on the inactive list, after which refaulting pages can be
+ * activated optimistically to compete with the existing active pages.
+ *
+ * Approximating inactive page access frequency - Observations:
+ *
+ * 1. When a page is accessed for the first time, it is added to the
+ *    head of the inactive list, slides every existing inactive page
+ *    towards the tail by one slot, and pushes the current tail page
+ *    out of memory.
+ *
+ * 2. When a page is accessed for the second time, it is promoted to
+ *    the active list, shrinking the inactive list by one slot.  This
+ *    also slides all inactive pages that were faulted into the cache
+ *    more recently than the activated page towards the tail of the
+ *    inactive list.
+ *
+ * Thus:
+ *
+ * 1. The sum of evictions and activations between any two points in
+ *    time indicates the minimum number of inactive pages accessed in
+ *    between.
+ *
+ * 2. Moving one inactive page N page slots towards the tail of the
+ *    list requires at least N inactive page accesses.
+ *
+ * Combining these:
+ *
+ * 1. When a page is finally evicted from memory, the number of
+ *    inactive pages accessed while the page was in cache is at least
+ *    the number of page slots on the inactive list.
+ *
+ * 2. In addition, measuring the sum of evictions and activations (E)
+ *    at the time of a page's eviction, and comparing it to another
+ *    reading (R) at the time the page faults back into memory tells
+ *    the minimum number of accesses while the page was not cached.
+ *    This is called the refault distance.
+ *
+ * Because the first access of the page was the fault and the second
+ * access the refault, we combine the in-cache distance with the
+ * out-of-cache distance to get the complete minimum access distance
+ * of this page:
+ *
+ *      NR_inactive + (R - E)
+ *
+ * And knowing the minimum access distance of a page, we can easily
+ * tell if the page would be able to stay in cache assuming all page
+ * slots in the cache were available:
+ *
+ *   NR_inactive + (R - E) <= NR_inactive + NR_active
+ *
+ * which can be further simplified to
+ *
+ *   (R - E) <= NR_active
+ *
+ * Put into words, the refault distance (out-of-cache) can be seen as
+ * a deficit in inactive list space (in-cache).  If the inactive list
+ * had (R - E) more page slots, the page would not have been evicted
+ * in between accesses, but activated instead.  And on a full system,
+ * the only thing eating into inactive list space is active pages.
+ *
+ *
+ *              Activating refaulting pages
+ *
+ * All that is known about the active list is that the pages have been
+ * accessed more than once in the past.  This means that at any given
+ * time there is actually a good chance that pages on the active list
+ * are no longer in active use.
+ *
+ * So when a refault distance of (R - E) is observed and there are at
+ * least (R - E) active pages, the refaulting page is activated
+ * optimistically in the hope that (R - E) active pages are actually
+ * used less frequently than the refaulting page - or even not used at
+ * all anymore.
+ *
+ * If this is wrong and demotion kicks in, the pages which are truly
+ * used more frequently will be reactivated while the less frequently
+ * used ones will be evicted from memory.
+ *
+ * But if this is right, the stale pages will be pushed out of memory
+ * and the used pages get to stay in cache.
+ *
+ *
+ *              Implementation
+ *
+ * For each zone's file LRU lists, a counter for inactive evictions
+ * and activations is maintained (zone->inactive_age).
+ *
+ * On eviction, a snapshot of this counter (along with some bits to
+ * identify the zone) is stored in the now empty page cache radix tree
+ * slot of the evicted page.  This is called a shadow entry.
+ *
+ * On cache misses for which there are shadow entries, an eligible
+ * refault distance will immediately activate the refaulting page.
+ */
+
+static void *pack_shadow(unsigned long eviction, struct zone *zone)
+{
+        eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
+        eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
+        eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
+
+        return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
+}
+
+static void unpack_shadow(void *shadow,
+                          struct zone **zone,
+                          unsigned long *distance)
+{
+        unsigned long entry = (unsigned long)shadow;
+        unsigned long eviction;
+        unsigned long refault;
+        unsigned long mask;
+        int zid, nid;
+
+        entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+        zid = entry & ((1UL << ZONES_SHIFT) - 1);
+        entry >>= ZONES_SHIFT;
+        nid = entry & ((1UL << NODES_SHIFT) - 1);
+        entry >>= NODES_SHIFT;
+        eviction = entry;
+
+        *zone = NODE_DATA(nid)->node_zones + zid;
+
+        refault = atomic_long_read(&(*zone)->inactive_age);
+        mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
+                        RADIX_TREE_EXCEPTIONAL_SHIFT);
+        /*
+         * The unsigned subtraction here gives an accurate distance
+         * across inactive_age overflows in most cases.
+         *
+         * There is a special case: usually, shadow entries have a
+         * short lifetime and are either refaulted or reclaimed along
+         * with the inode before they get too old.  But it is not
+         * impossible for the inactive_age to lap a shadow entry in
+         * the field, which can then result in a false small
+         * refault distance, leading to a false activation should this
+         * old entry actually refault again.  However, earlier kernels
+         * used to deactivate unconditionally with *every* reclaim
+         * invocation for the longest time, so the occasional
+         * inappropriate activation leading to pressure on the active
+         * list is not a problem.
+         */
+        *distance = (refault - eviction) & mask;
+}
+
+/**
+ * workingset_eviction - note the eviction of a page from memory
+ * @mapping: address space the page was backing
+ * @page: the page being evicted
+ *
+ * Returns a shadow entry to be stored in @mapping->page_tree in place
+ * of the evicted @page so that a later refault can be detected.
+ */
+void *workingset_eviction(struct address_space *mapping, struct page *page)
+{
+        struct zone *zone = page_zone(page);
+        unsigned long eviction;
+
+        eviction = atomic_long_inc_return(&zone->inactive_age);
+        return pack_shadow(eviction, zone);
+}
+
+/**
+ * workingset_refault - evaluate the refault of a previously evicted page
+ * @shadow: shadow entry of the evicted page
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted page in the context of the zone it was allocated in.
+ *
+ * Returns %true if the page should be activated, %false otherwise.
+ */
+bool workingset_refault(void *shadow)
+{
+        unsigned long refault_distance;
+        struct zone *zone;
+
+        unpack_shadow(shadow, &zone, &refault_distance);
+        inc_zone_state(zone, WORKINGSET_REFAULT);
+
+        if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+                inc_zone_state(zone, WORKINGSET_ACTIVATE);
+                return true;
+        }
+        return false;
+}
+
+/**
+ * workingset_activation - note a page activation
+ * @page: page that is being activated
+ */
+void workingset_activation(struct page *page)
+{
+        atomic_long_inc(&page_zone(page)->inactive_age);
+}
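One subtlety worth spelling out in unpack_shadow() above: only the counter bits that fit into the shadow entry are stored, so the refault distance is computed with an unsigned subtraction and then masked, which stays accurate across counter wraparound in most cases. The following standalone sketch demonstrates that arithmetic; it is not kernel code, and the 8-bit mask is an arbitrary assumption chosen to keep the numbers readable (the real width depends on NODES_SHIFT, ZONES_SHIFT and RADIX_TREE_EXCEPTIONAL_SHIFT).

/*
 * Demonstrate the masked unsigned subtraction used for the refault
 * distance: it gives the right answer even after the counter wraps
 * around the number of bits kept in the shadow entry.
 */
#include <stdio.h>

int main(void)
{
        unsigned long mask = 0xff;      /* pretend only 8 bits fit in a shadow */
        unsigned long eviction = 0xfe;  /* counter snapshot stored at eviction */
        unsigned long refault  = 0x03;  /* counter has since wrapped past 0xff */

        /* (refault - eviction) & mask == 5: five evictions/activations in between */
        printf("refault distance: %lu\n", (refault - eviction) & mask);
        return 0;
}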