author    Johannes Weiner <hannes@cmpxchg.org>          2014-04-03 17:47:51 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>        2014-04-03 19:21:01 -0400
commit    a528910e12ec7ee203095eb1711468a66b9b60b0 (patch)
tree      c9ceed84994f015e991161c91b87d47d93cd6491 /mm
parent    91b0abe36a7b2b3b02d7500925a5f8455334f0e5 (diff)
mm: thrash detection-based file cache sizing
The VM maintains cached filesystem pages on two types of lists. One list holds the pages recently faulted into the cache, the other list holds pages that have been referenced repeatedly on that first list. The idea is to prefer reclaiming young pages over those that have been shown to benefit from caching in the past. We call the recently used list the "inactive list" and the frequently used list the "active list".

The historical approach of trickling a few pages from the tail of the active list back to the inactive list on every reclaim scan ultimately was not significantly better than a FIFO policy and still thrashed the cache based on eviction speed, rather than actual demand for cache.

This patch solves one half of the problem by decoupling the ability to detect working set changes from the inactive list size. By maintaining a history of recently evicted file pages it can detect frequently used pages with an arbitrarily small inactive list size, and subsequently apply pressure on the active list based on actual demand for cache, not just overall eviction speed.

Every zone maintains a counter that tracks inactive list aging speed. When a page is evicted, a snapshot of this counter is stored in the now-empty page cache radix tree slot. On refault, the minimum access distance of the page can be assessed, to evaluate whether the page should be part of the active list or not.

This fixes the VM's blindness towards working set changes in excess of the inactive list. And it is the foundation for further improving the protection ability and for reducing the 50% minimum inactive list size.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Bob Liu <bob.liu@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Metin Doslu <metin@citusdata.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ozgun Erdogan <ozgun@citusdata.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin <klamm@yandex-team.ru>
Cc: Ryan Mallon <rmallon@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
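The rule described above can be summarized as: take a snapshot of a per-zone aging counter when a page is evicted, and on refault activate the page if the counter has advanced by no more than the number of active file pages. Below is a minimal userspace sketch of that rule, not the kernel implementation; it assumes a single zone and ignores the bit-packing of the real shadow entries, and the helpers record_eviction() and should_activate() are illustrative names that do not exist in the kernel.

/*
 * Minimal model of the refault-distance rule: shadow entries are
 * counter snapshots, and a refault is "close enough" to activate
 * when the counter advanced by at most the active list size.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long inactive_age;   /* evictions + activations, per zone */
static unsigned long nr_active_file; /* size of the active file list */

/* On eviction, snapshot the aging counter as a "shadow" of the page. */
static unsigned long record_eviction(void)
{
        return ++inactive_age;
}

/* On refault, compare the distance against the active list size. */
static bool should_activate(unsigned long shadow)
{
        unsigned long refault_distance = inactive_age - shadow;

        return refault_distance <= nr_active_file;
}

int main(void)
{
        unsigned long shadow;

        nr_active_file = 100;

        shadow = record_eviction();  /* page evicted at age N */
        inactive_age += 50;          /* 50 more evictions/activations pass */

        /* distance 50 <= 100, so the refaulting page gets activated */
        printf("activate on refault: %s\n",
               should_activate(shadow) ? "yes" : "no");
        return 0;
}

Run as-is, the sketch reports "yes" because the refault distance (50) does not exceed the assumed active list size (100).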
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile        2
-rw-r--r--  mm/filemap.c      61
-rw-r--r--  mm/swap.c          2
-rw-r--r--  mm/vmscan.c       24
-rw-r--r--  mm/vmstat.c        2
-rw-r--r--  mm/workingset.c  253
6 files changed, 321 insertions(+), 23 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 310c90a09264..cdd741519ee0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o balloon_compaction.o \
-			   interval_tree.o list_lru.o $(mmu-y)
+			   interval_tree.o list_lru.o workingset.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 05c44aa44188..a603c4d7d3c9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -469,7 +469,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
 static int page_cache_tree_insert(struct address_space *mapping,
-                                  struct page *page)
+                                  struct page *page, void **shadowp)
 {
         void **slot;
         int error;
@@ -484,6 +484,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
                 radix_tree_replace_slot(slot, page);
                 mapping->nrshadows--;
                 mapping->nrpages++;
+                if (shadowp)
+                        *shadowp = p;
                 return 0;
         }
         error = radix_tree_insert(&mapping->page_tree, page->index, page);
@@ -492,18 +494,10 @@ static int page_cache_tree_insert(struct address_space *mapping,
         return error;
 }
 
-/**
- * add_to_page_cache_locked - add a locked page to the pagecache
- * @page:       page to add
- * @mapping:    the page's address_space
- * @offset:     page index
- * @gfp_mask:   page allocation mode
- *
- * This function is used to add a page to the pagecache. It must be locked.
- * This function does not add the page to the LRU. The caller must do that.
- */
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-                pgoff_t offset, gfp_t gfp_mask)
+static int __add_to_page_cache_locked(struct page *page,
+                                      struct address_space *mapping,
+                                      pgoff_t offset, gfp_t gfp_mask,
+                                      void **shadowp)
 {
         int error;
 
@@ -526,7 +520,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
         page->index = offset;
 
         spin_lock_irq(&mapping->tree_lock);
-        error = page_cache_tree_insert(mapping, page);
+        error = page_cache_tree_insert(mapping, page, shadowp);
         radix_tree_preload_end();
         if (unlikely(error))
                 goto err_insert;
@@ -542,16 +536,49 @@ err_insert:
         page_cache_release(page);
         return error;
 }
+
+/**
+ * add_to_page_cache_locked - add a locked page to the pagecache
+ * @page:       page to add
+ * @mapping:    the page's address_space
+ * @offset:     page index
+ * @gfp_mask:   page allocation mode
+ *
+ * This function is used to add a page to the pagecache. It must be locked.
+ * This function does not add the page to the LRU. The caller must do that.
+ */
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+                             pgoff_t offset, gfp_t gfp_mask)
+{
+        return __add_to_page_cache_locked(page, mapping, offset,
+                                          gfp_mask, NULL);
+}
 EXPORT_SYMBOL(add_to_page_cache_locked);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                           pgoff_t offset, gfp_t gfp_mask)
 {
+        void *shadow = NULL;
         int ret;
 
-        ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-        if (ret == 0)
-                lru_cache_add_file(page);
+        __set_page_locked(page);
+        ret = __add_to_page_cache_locked(page, mapping, offset,
+                                         gfp_mask, &shadow);
+        if (unlikely(ret))
+                __clear_page_locked(page);
+        else {
+                /*
+                 * The page might have been evicted from cache only
+                 * recently, in which case it should be activated like
+                 * any other repeatedly accessed page.
+                 */
+                if (shadow && workingset_refault(shadow)) {
+                        SetPageActive(page);
+                        workingset_activation(page);
+                } else
+                        ClearPageActive(page);
+                lru_cache_add(page);
+        }
         return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
diff --git a/mm/swap.c b/mm/swap.c
index c8048d71c642..9ce43ba4498b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -574,6 +574,8 @@ void mark_page_accessed(struct page *page)
                 else
                         __lru_cache_activate_page(page);
                 ClearPageReferenced(page);
+                if (page_is_file_cache(page))
+                        workingset_activation(page);
         } else if (!PageReferenced(page)) {
                 SetPageReferenced(page);
         }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2a0bb8fdb259..1f56a80a7c41 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -523,7 +523,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
  * Same as remove_mapping, but if the page is removed from the mapping, it
  * gets returned with a refcount of 0.
  */
-static int __remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page,
+                            bool reclaimed)
 {
         BUG_ON(!PageLocked(page));
         BUG_ON(mapping != page_mapping(page));
@@ -569,10 +570,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
                 swapcache_free(swap, page);
         } else {
                 void (*freepage)(struct page *);
+                void *shadow = NULL;
 
                 freepage = mapping->a_ops->freepage;
-
-                __delete_from_page_cache(page, NULL);
+                /*
+                 * Remember a shadow entry for reclaimed file cache in
+                 * order to detect refaults, thus thrashing, later on.
+                 *
+                 * But don't store shadows in an address space that is
+                 * already exiting.  This is not just an optimization,
+                 * inode reclaim needs to empty out the radix tree or
+                 * the nodes are lost.  Don't plant shadows behind its
+                 * back.
+                 */
+                if (reclaimed && page_is_file_cache(page) &&
+                    !mapping_exiting(mapping))
+                        shadow = workingset_eviction(mapping, page);
+                __delete_from_page_cache(page, shadow);
                 spin_unlock_irq(&mapping->tree_lock);
                 mem_cgroup_uncharge_cache_page(page);
 
@@ -595,7 +609,7 @@ cannot_free:
  */
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-        if (__remove_mapping(mapping, page)) {
+        if (__remove_mapping(mapping, page, false)) {
                 /*
                  * Unfreezing the refcount with 1 rather than 2 effectively
                  * drops the pagecache ref for us without requiring another
@@ -1065,7 +1079,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 }
         }
 
-        if (!mapping || !__remove_mapping(mapping, page))
+        if (!mapping || !__remove_mapping(mapping, page, true))
                 goto keep_locked;
 
         /*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index def5dd2fbe61..843125a0e255 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -770,6 +770,8 @@ const char * const vmstat_text[] = {
770 "numa_local", 770 "numa_local",
771 "numa_other", 771 "numa_other",
772#endif 772#endif
773 "workingset_refault",
774 "workingset_activate",
773 "nr_anon_transparent_hugepages", 775 "nr_anon_transparent_hugepages",
774 "nr_free_cma", 776 "nr_free_cma",
775 "nr_dirty_threshold", 777 "nr_dirty_threshold",
diff --git a/mm/workingset.c b/mm/workingset.c
new file mode 100644
index 000000000000..8a6c7cff4923
--- /dev/null
+++ b/mm/workingset.c
@@ -0,0 +1,253 @@
+/*
+ * Workingset detection
+ *
+ * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
+ */
+
+#include <linux/memcontrol.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+/*
+ *              Double CLOCK lists
+ *
+ * Per zone, two clock lists are maintained for file pages: the
+ * inactive and the active list.  Freshly faulted pages start out at
+ * the head of the inactive list and page reclaim scans pages from the
+ * tail.  Pages that are accessed multiple times on the inactive list
+ * are promoted to the active list, to protect them from reclaim,
+ * whereas active pages are demoted to the inactive list when the
+ * active list grows too big.
+ *
+ *   fault ------------------------+
+ *                                 |
+ *              +--------------+   |            +-------------+
+ *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
+ *              +--------------+                +-------------+    |
+ *                     |                                           |
+ *                     +-------------- promotion ------------------+
+ *
+ *
+ *              Access frequency and refault distance
+ *
+ * A workload is thrashing when its pages are frequently used but they
+ * are evicted from the inactive list every time before another access
+ * would have promoted them to the active list.
+ *
+ * In cases where the average access distance between thrashing pages
+ * is bigger than the size of memory there is nothing that can be
+ * done - the thrashing set could never fit into memory under any
+ * circumstance.
+ *
+ * However, the average access distance could be bigger than the
+ * inactive list, yet smaller than the size of memory.  In this case,
+ * the set could fit into memory if it weren't for the currently
+ * active pages - which may be used more, hopefully less frequently:
+ *
+ *      +-memory available to cache-+
+ *      |                           |
+ *      +-inactive------+-active----+
+ *  a b | c d e f g h i | J K L M N |
+ *      +---------------+-----------+
+ *
+ * It is prohibitively expensive to accurately track access frequency
+ * of pages.  But a reasonable approximation can be made to measure
+ * thrashing on the inactive list, after which refaulting pages can be
+ * activated optimistically to compete with the existing active pages.
+ *
+ * Approximating inactive page access frequency - Observations:
+ *
+ * 1. When a page is accessed for the first time, it is added to the
+ *    head of the inactive list, slides every existing inactive page
+ *    towards the tail by one slot, and pushes the current tail page
+ *    out of memory.
+ *
+ * 2. When a page is accessed for the second time, it is promoted to
+ *    the active list, shrinking the inactive list by one slot.  This
+ *    also slides all inactive pages that were faulted into the cache
+ *    more recently than the activated page towards the tail of the
+ *    inactive list.
+ *
+ * Thus:
+ *
+ * 1. The sum of evictions and activations between any two points in
+ *    time indicates the minimum number of inactive pages accessed in
+ *    between.
+ *
+ * 2. Moving one inactive page N page slots towards the tail of the
+ *    list requires at least N inactive page accesses.
+ *
+ * Combining these:
+ *
+ * 1. When a page is finally evicted from memory, the number of
+ *    inactive pages accessed while the page was in cache is at least
+ *    the number of page slots on the inactive list.
+ *
+ * 2. In addition, measuring the sum of evictions and activations (E)
+ *    at the time of a page's eviction, and comparing it to another
+ *    reading (R) at the time the page faults back into memory tells
+ *    the minimum number of accesses while the page was not cached.
+ *    This is called the refault distance.
+ *
+ * Because the first access of the page was the fault and the second
+ * access the refault, we combine the in-cache distance with the
+ * out-of-cache distance to get the complete minimum access distance
+ * of this page:
+ *
+ *      NR_inactive + (R - E)
+ *
+ * And knowing the minimum access distance of a page, we can easily
+ * tell if the page would be able to stay in cache assuming all page
+ * slots in the cache were available:
+ *
+ *   NR_inactive + (R - E) <= NR_inactive + NR_active
+ *
+ * which can be further simplified to
+ *
+ *   (R - E) <= NR_active
+ *
+ * Put into words, the refault distance (out-of-cache) can be seen as
+ * a deficit in inactive list space (in-cache).  If the inactive list
+ * had (R - E) more page slots, the page would not have been evicted
+ * in between accesses, but activated instead.  And on a full system,
+ * the only thing eating into inactive list space is active pages.
+ *
+ *
+ *              Activating refaulting pages
+ *
+ * All that is known about the active list is that the pages have been
+ * accessed more than once in the past.  This means that at any given
+ * time there is actually a good chance that pages on the active list
+ * are no longer in active use.
+ *
+ * So when a refault distance of (R - E) is observed and there are at
+ * least (R - E) active pages, the refaulting page is activated
+ * optimistically in the hope that (R - E) active pages are actually
+ * used less frequently than the refaulting page - or even not used at
+ * all anymore.
+ *
+ * If this is wrong and demotion kicks in, the pages which are truly
+ * used more frequently will be reactivated while the less frequently
+ * used ones will be evicted from memory.
+ *
+ * But if this is right, the stale pages will be pushed out of memory
+ * and the used pages get to stay in cache.
+ *
+ *
+ *              Implementation
+ *
+ * For each zone's file LRU lists, a counter for inactive evictions
+ * and activations is maintained (zone->inactive_age).
+ *
+ * On eviction, a snapshot of this counter (along with some bits to
+ * identify the zone) is stored in the now empty page cache radix tree
+ * slot of the evicted page.  This is called a shadow entry.
+ *
+ * On cache misses for which there are shadow entries, an eligible
+ * refault distance will immediately activate the refaulting page.
+ */
+
+static void *pack_shadow(unsigned long eviction, struct zone *zone)
+{
+        eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
+        eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
+        eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
+
+        return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
+}
+
+static void unpack_shadow(void *shadow,
+                          struct zone **zone,
+                          unsigned long *distance)
+{
+        unsigned long entry = (unsigned long)shadow;
+        unsigned long eviction;
+        unsigned long refault;
+        unsigned long mask;
+        int zid, nid;
+
+        entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+        zid = entry & ((1UL << ZONES_SHIFT) - 1);
+        entry >>= ZONES_SHIFT;
+        nid = entry & ((1UL << NODES_SHIFT) - 1);
+        entry >>= NODES_SHIFT;
+        eviction = entry;
+
+        *zone = NODE_DATA(nid)->node_zones + zid;
+
+        refault = atomic_long_read(&(*zone)->inactive_age);
+        mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
+                        RADIX_TREE_EXCEPTIONAL_SHIFT);
+        /*
+         * The unsigned subtraction here gives an accurate distance
+         * across inactive_age overflows in most cases.
+         *
+         * There is a special case: usually, shadow entries have a
+         * short lifetime and are either refaulted or reclaimed along
+         * with the inode before they get too old.  But it is not
+         * impossible for the inactive_age to lap a shadow entry in
+         * the field, which can then result in a false small
+         * refault distance, leading to a false activation should this
+         * old entry actually refault again.  However, earlier kernels
+         * used to deactivate unconditionally with *every* reclaim
+         * invocation for the longest time, so the occasional
+         * inappropriate activation leading to pressure on the active
+         * list is not a problem.
+         */
+        *distance = (refault - eviction) & mask;
+}
+
+/**
+ * workingset_eviction - note the eviction of a page from memory
+ * @mapping: address space the page was backing
+ * @page: the page being evicted
+ *
+ * Returns a shadow entry to be stored in @mapping->page_tree in place
+ * of the evicted @page so that a later refault can be detected.
+ */
+void *workingset_eviction(struct address_space *mapping, struct page *page)
+{
+        struct zone *zone = page_zone(page);
+        unsigned long eviction;
+
+        eviction = atomic_long_inc_return(&zone->inactive_age);
+        return pack_shadow(eviction, zone);
+}
+
+/**
+ * workingset_refault - evaluate the refault of a previously evicted page
+ * @shadow: shadow entry of the evicted page
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted page in the context of the zone it was allocated in.
+ *
+ * Returns %true if the page should be activated, %false otherwise.
+ */
+bool workingset_refault(void *shadow)
+{
+        unsigned long refault_distance;
+        struct zone *zone;
+
+        unpack_shadow(shadow, &zone, &refault_distance);
+        inc_zone_state(zone, WORKINGSET_REFAULT);
+
+        if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+                inc_zone_state(zone, WORKINGSET_ACTIVATE);
+                return true;
+        }
+        return false;
+}
+
+/**
+ * workingset_activation - note a page activation
+ * @page: page that is being activated
+ */
+void workingset_activation(struct page *page)
+{
+        atomic_long_inc(&page_zone(page)->inactive_age);
+}
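One subtlety worth spelling out in unpack_shadow() above: only the counter bits that fit into the shadow entry are stored, so the refault distance is computed with an unsigned subtraction and then masked, which stays accurate across counter wraparound in most cases. The following standalone sketch demonstrates that arithmetic; it is not kernel code, and the 8-bit mask is an arbitrary assumption chosen to keep the numbers readable (the real width depends on NODES_SHIFT, ZONES_SHIFT and RADIX_TREE_EXCEPTIONAL_SHIFT).

/*
 * Demonstrate the masked unsigned subtraction used for the refault
 * distance: it gives the right answer even after the counter wraps
 * around the number of bits kept in the shadow entry.
 */
#include <stdio.h>

int main(void)
{
        unsigned long mask = 0xff;      /* pretend only 8 bits fit in a shadow */
        unsigned long eviction = 0xfe;  /* counter snapshot stored at eviction */
        unsigned long refault  = 0x03;  /* counter has since wrapped past 0xff */

        /* (refault - eviction) & mask == 5: five evictions/activations in between */
        printf("refault distance: %lu\n", (refault - eviction) & mask);
        return 0;
}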