author	Johannes Weiner <hannes@cmpxchg.org>	2014-04-03 17:47:56 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-04-03 19:21:01 -0400
commit	449dd6984d0e47643c04c807f609dd56d48d5bcc (patch)
tree	69f4a0a90290b048e63effc617c8ec907e8d6696
parent	139e561660fe11e0fc35e142a800df3dd7d03e9d (diff)
mm: keep page cache radix tree nodes in check
Previously, page cache radix tree nodes were freed after reclaim emptied out
their page pointers.  But now reclaim stores shadow entries in their place,
which are only reclaimed when the inodes themselves are reclaimed.  This is
problematic for bigger files that are still in use after they have a
significant amount of their cache reclaimed, without any of those pages
actually refaulting.  The shadow entries will just sit there and waste
memory.  In the worst case, the shadow entries will accumulate until the
machine runs out of memory.

To get this under control, the VM will track radix tree nodes exclusively
containing shadow entries on a per-NUMA node list.  Per-NUMA rather than
global because we expect the radix tree nodes themselves to be allocated
node-locally and we want to reduce cross-node references of otherwise
independent cache workloads.  A simple shrinker will then reclaim these
nodes on memory pressure.

A few things need to be stored in the radix tree node to implement the
shadow node LRU and allow tree deletions coming from the list:

1. There is no index available that would describe the reverse path from
   the node up to the tree root, which is needed to perform a deletion.  To
   solve this, encode in each node its offset inside the parent.  This can
   be stored in the unused upper bits of the same member that stores the
   node's height at no extra space cost.

2. The number of shadow entries needs to be counted in addition to the
   regular entries, to quickly detect when the node is ready to go to the
   shadow node LRU list.  The current entry count is an unsigned int but
   the maximum number of entries is 64, so a shadow counter can easily be
   stored in the unused upper bits.

3. Tree modification needs the tree lock and tree root, which are located
   in the address space, so store an address_space backpointer in the node.
   The parent pointer of the node is in a union with the 2-word rcu_head,
   so the backpointer comes at no extra cost as well.

4. The node needs to be linked to an LRU list, which requires a list head
   inside the node.  This does increase the size of the node, but it does
   not change the number of objects that fit into a slab page.

[akpm@linux-foundation.org: export the right function]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Metin Doslu <metin@citusdata.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ozgun Erdogan <ozgun@citusdata.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin <klamm@yandex-team.ru>
Cc: Ryan Mallon <rmallon@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
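[Editorial illustration of points 1 and 2 above: the following stand-alone sketch packs and unpacks a node's height/parent-offset and page/shadow counters the same way the new RADIX_TREE_HEIGHT_* and RADIX_TREE_COUNT_* macros in this patch do.  The constant values assume the common RADIX_TREE_MAP_SHIFT of 6 (64 slots per node) on a 64-bit machine; the macro names here are illustrative stand-ins, not the kernel's definitions.]

#include <assert.h>
#include <stdio.h>

/* Assumed layout: 64 slots per node (RADIX_TREE_MAP_SHIFT == 6), 64-bit longs. */
#define MAP_SHIFT	6
#define INDEX_BITS	64
#define MAX_PATH	((INDEX_BITS + MAP_SHIFT - 1) / MAP_SHIFT)	/* 11 */

/* Height lives in the low bits of ->path, the offset within the parent above it. */
#define HEIGHT_SHIFT	(MAX_PATH + 1)					/* 12 */
#define HEIGHT_MASK	((1U << HEIGHT_SHIFT) - 1)

/* Page entries live in the low bits of ->count, shadow entries above them. */
#define COUNT_SHIFT	(MAP_SHIFT + 1)					/* 7 */
#define COUNT_MASK	((1U << COUNT_SHIFT) - 1)

int main(void)
{
	/* A node at height 2 that sits in slot 37 of its parent. */
	unsigned int path = 2 | (37U << HEIGHT_SHIFT);
	/* The same node holding 3 page entries and 5 shadow entries. */
	unsigned int count = 3 | (5U << COUNT_SHIFT);

	assert((path & HEIGHT_MASK) == 2);	/* height from the bottom */
	assert((path >> HEIGHT_SHIFT) == 37);	/* offset inside the parent */
	assert((count & COUNT_MASK) == 3);	/* regular page entries */
	assert((count >> COUNT_SHIFT) == 5);	/* shadow entries */

	printf("height=%u offset=%u pages=%u shadows=%u\n",
	       path & HEIGHT_MASK, path >> HEIGHT_SHIFT,
	       count & COUNT_MASK, count >> COUNT_SHIFT);
	return 0;
}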
-rw-r--r--	include/linux/list_lru.h	8
-rw-r--r--	include/linux/mmzone.h	1
-rw-r--r--	include/linux/radix-tree.h	32
-rw-r--r--	include/linux/swap.h	31
-rw-r--r--	lib/radix-tree.c	36
-rw-r--r--	mm/filemap.c	90
-rw-r--r--	mm/list_lru.c	16
-rw-r--r--	mm/truncate.c	26
-rw-r--r--	mm/vmstat.c	1
-rw-r--r--	mm/workingset.c	161
10 files changed, 359 insertions, 43 deletions
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 3ce541753c88..f3434533fbf8 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -13,6 +13,8 @@
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
 	LRU_REMOVED,		/* item removed from list */
+	LRU_REMOVED_RETRY,	/* item removed, but lock has been
+				   dropped and reacquired */
 	LRU_ROTATE,		/* item referenced, give another pass */
 	LRU_SKIP,		/* item cannot be locked, skip */
 	LRU_RETRY,		/* item not freeable. May drop the lock
@@ -32,7 +34,11 @@ struct list_lru {
 };
 
 void list_lru_destroy(struct list_lru *lru);
-int list_lru_init(struct list_lru *lru);
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
+static inline int list_lru_init(struct list_lru *lru)
+{
+	return list_lru_init_key(lru, NULL);
+}
 
 /**
  * list_lru_add: add an element to the lru list's tail
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f25db1d74a21..fac5509c18f0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -144,6 +144,7 @@ enum zone_stat_item {
 #endif
 	WORKINGSET_REFAULT,
 	WORKINGSET_ACTIVATE,
+	WORKINGSET_NODERECLAIM,
 	NR_ANON_TRANSPARENT_HUGEPAGES,
 	NR_FREE_CMA_PAGES,
 	NR_VM_ZONE_STAT_ITEMS };
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 13636c40bc42..33170dbd9db4 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 #define RADIX_TREE_TAG_LONGS	\
 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
+					  RADIX_TREE_MAP_SHIFT))
+
+/* Height component in node->path */
+#define RADIX_TREE_HEIGHT_SHIFT	(RADIX_TREE_MAX_PATH + 1)
+#define RADIX_TREE_HEIGHT_MASK	((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)
+
+/* Internally used bits of node->count */
+#define RADIX_TREE_COUNT_SHIFT	(RADIX_TREE_MAP_SHIFT + 1)
+#define RADIX_TREE_COUNT_MASK	((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
+
 struct radix_tree_node {
-	unsigned int	height;		/* Height from the bottom */
+	unsigned int	path;	/* Offset in parent & height from the bottom */
 	unsigned int	count;
 	union {
-		struct radix_tree_node *parent;	/* Used when ascending tree */
-		struct rcu_head	rcu_head;	/* Used when freeing node */
+		struct {
+			/* Used when ascending tree */
+			struct radix_tree_node *parent;
+			/* For tree user */
+			void *private_data;
+		};
+		/* Used when freeing node */
+		struct rcu_head	rcu_head;
 	};
+	/* For tree user */
+	struct list_head private_list;
 	void __rcu	*slots[RADIX_TREE_MAP_SIZE];
 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
-#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
-					  RADIX_TREE_MAP_SHIFT))
-
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
 	unsigned int	height;
@@ -251,7 +267,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 			  struct radix_tree_node **nodep, void ***slotp);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b83cf61403ed..350711560753 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -264,6 +264,37 @@ struct swap_list_t {
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
+extern struct list_lru workingset_shadow_nodes;
+
+static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
+{
+	return node->count & RADIX_TREE_COUNT_MASK;
+}
+
+static inline void workingset_node_pages_inc(struct radix_tree_node *node)
+{
+	node->count++;
+}
+
+static inline void workingset_node_pages_dec(struct radix_tree_node *node)
+{
+	node->count--;
+}
+
+static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
+{
+	return node->count >> RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
+{
+	node->count += 1U << RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
+{
+	node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+}
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index d60be40c111b..9599aa72d7a0 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -342,7 +342,8 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 
 		/* Increase the height.  */
 		newheight = root->height+1;
-		node->height = newheight;
+		BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
+		node->path = newheight;
 		node->count = 1;
 		node->parent = NULL;
 		slot = root->rnode;
@@ -400,11 +401,12 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			/* Have to add a child node.  */
 			if (!(slot = radix_tree_node_alloc(root)))
 				return -ENOMEM;
-			slot->height = height;
+			slot->path = height;
 			slot->parent = node;
 			if (node) {
 				rcu_assign_pointer(node->slots[offset], slot);
 				node->count++;
+				slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
 			} else
 				rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
 		}
@@ -498,7 +500,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 	}
 	node = indirect_to_ptr(node);
 
-	height = node->height;
+	height = node->path & RADIX_TREE_HEIGHT_MASK;
 	if (index > radix_tree_maxindex(height))
 		return NULL;
 
@@ -704,7 +706,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 		return (index == 0);
 	node = indirect_to_ptr(node);
 
-	height = node->height;
+	height = node->path & RADIX_TREE_HEIGHT_MASK;
 	if (index > radix_tree_maxindex(height))
 		return 0;
 
@@ -741,7 +743,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 {
 	unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
 	struct radix_tree_node *rnode, *node;
-	unsigned long index, offset;
+	unsigned long index, offset, height;
 
 	if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
 		return NULL;
@@ -772,7 +774,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 		return NULL;
 
 restart:
-	shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT;
+	height = rnode->path & RADIX_TREE_HEIGHT_MASK;
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	offset = index >> shift;
 
 	/* Index outside of the tree */
@@ -1142,7 +1145,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
 	unsigned int shift, height;
 	unsigned long i;
 
-	height = slot->height;
+	height = slot->path & RADIX_TREE_HEIGHT_MASK;
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
 	for ( ; height > 1; height--) {
@@ -1205,7 +1208,8 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 		}
 
 		node = indirect_to_ptr(node);
-		max_index = radix_tree_maxindex(node->height);
+		max_index = radix_tree_maxindex(node->path &
+						RADIX_TREE_HEIGHT_MASK);
 		if (cur_index > max_index) {
 			rcu_read_unlock();
 			break;
@@ -1301,7 +1305,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
  *
  * Returns %true if @node was freed, %false otherwise.
  */
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node)
 {
 	bool deleted = false;
@@ -1320,9 +1324,10 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
 
 		parent = node->parent;
 		if (parent) {
-			index >>= RADIX_TREE_MAP_SHIFT;
+			unsigned int offset;
 
-			parent->slots[index & RADIX_TREE_MAP_MASK] = NULL;
+			offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
+			parent->slots[offset] = NULL;
 			parent->count--;
 		} else {
 			root_tag_clear_all(root);
@@ -1386,7 +1391,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 	node->slots[offset] = NULL;
 	node->count--;
 
-	__radix_tree_delete_node(root, index, node);
+	__radix_tree_delete_node(root, node);
 
 	return entry;
 }
@@ -1419,9 +1424,12 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 EXPORT_SYMBOL(radix_tree_tagged);
 
 static void
-radix_tree_node_ctor(void *node)
+radix_tree_node_ctor(void *arg)
 {
-	memset(node, 0, sizeof(struct radix_tree_node));
+	struct radix_tree_node *node = arg;
+
+	memset(node, 0, sizeof(*node));
+	INIT_LIST_HEAD(&node->private_list);
 }
 
 static __init unsigned long __maxindex(unsigned int height)
diff --git a/mm/filemap.c b/mm/filemap.c
index a603c4d7d3c9..d6df3bacb0fb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -110,11 +110,17 @@
 static void page_cache_tree_delete(struct address_space *mapping,
 				   struct page *page, void *shadow)
 {
-	if (shadow) {
-		void **slot;
+	struct radix_tree_node *node;
+	unsigned long index;
+	unsigned int offset;
+	unsigned int tag;
+	void **slot;
 
-		slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-		radix_tree_replace_slot(slot, shadow);
+	VM_BUG_ON(!PageLocked(page));
+
+	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+	if (shadow) {
 		mapping->nrshadows++;
 		/*
 		 * Make sure the nrshadows update is committed before
@@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping,
 		 * same time and miss a shadow entry.
 		 */
 		smp_wmb();
-	} else
-		radix_tree_delete(&mapping->page_tree, page->index);
+	}
 	mapping->nrpages--;
+
+	if (!node) {
+		/* Clear direct pointer tags in root node */
+		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+		radix_tree_replace_slot(slot, shadow);
+		return;
+	}
+
+	/* Clear tree tags for the removed page */
+	index = page->index;
+	offset = index & RADIX_TREE_MAP_MASK;
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		if (test_bit(offset, node->tags[tag]))
+			radix_tree_tag_clear(&mapping->page_tree, index, tag);
+	}
+
+	/* Delete page, swap shadow entry */
+	radix_tree_replace_slot(slot, shadow);
+	workingset_node_pages_dec(node);
+	if (shadow)
+		workingset_node_shadows_inc(node);
+	else
+		if (__radix_tree_delete_node(&mapping->page_tree, node))
+			return;
+
+	/*
+	 * Track node that only contains shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already tracked.  The
+	 * list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_pages(node) &&
+	    list_empty(&node->private_list)) {
+		node->private_data = mapping;
+		list_lru_add(&workingset_shadow_nodes, &node->private_list);
+	}
 }
 
 /*
@@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
 static int page_cache_tree_insert(struct address_space *mapping,
 				  struct page *page, void **shadowp)
 {
+	struct radix_tree_node *node;
 	void **slot;
 	int error;
 
-	slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-	if (slot) {
+	error = __radix_tree_create(&mapping->page_tree, page->index,
+				    &node, &slot);
+	if (error)
+		return error;
+	if (*slot) {
 		void *p;
 
 		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
-		radix_tree_replace_slot(slot, page);
-		mapping->nrshadows--;
-		mapping->nrpages++;
 		if (shadowp)
 			*shadowp = p;
-		return 0;
+		mapping->nrshadows--;
+		if (node)
+			workingset_node_shadows_dec(node);
 	}
-	error = radix_tree_insert(&mapping->page_tree, page->index, page);
-	if (!error)
-		mapping->nrpages++;
-	return error;
+	radix_tree_replace_slot(slot, page);
+	mapping->nrpages++;
+	if (node) {
+		workingset_node_pages_inc(node);
+		/*
+		 * Don't track node that contains actual pages.
+		 *
+		 * Avoid acquiring the list_lru lock if already
+		 * untracked.  The list_empty() test is safe as
+		 * node->private_list is protected by
+		 * mapping->tree_lock.
+		 */
+		if (!list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+				     &node->private_list);
+	}
+	return 0;
 }
 
 static int __add_to_page_cache_locked(struct page *page,
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 72f9decb0104..f1a0db194173 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -87,11 +87,20 @@ restart:
 
 		ret = isolate(item, &nlru->lock, cb_arg);
 		switch (ret) {
+		case LRU_REMOVED_RETRY:
+			assert_spin_locked(&nlru->lock);
 		case LRU_REMOVED:
 			if (--nlru->nr_items == 0)
 				node_clear(nid, lru->active_nodes);
 			WARN_ON_ONCE(nlru->nr_items < 0);
 			isolated++;
+			/*
+			 * If the lru lock has been dropped, our list
+			 * traversal is now invalid and so we have to
+			 * restart from scratch.
+			 */
+			if (ret == LRU_REMOVED_RETRY)
+				goto restart;
 			break;
 		case LRU_ROTATE:
 			list_move_tail(item, &nlru->list);
@@ -103,6 +112,7 @@ restart:
 			 * The lru lock has been dropped, our list traversal is
 			 * now invalid and so we have to restart from scratch.
 			 */
+			assert_spin_locked(&nlru->lock);
 			goto restart;
 		default:
 			BUG();
@@ -114,7 +124,7 @@ restart:
 }
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
 
-int list_lru_init(struct list_lru *lru)
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
 {
 	int i;
 	size_t size = sizeof(*lru->node) * nr_node_ids;
@@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru)
 	nodes_clear(lru->active_nodes);
 	for (i = 0; i < nr_node_ids; i++) {
 		spin_lock_init(&lru->node[i].lock);
+		if (key)
+			lockdep_set_class(&lru->node[i].lock, key);
 		INIT_LIST_HEAD(&lru->node[i].list);
 		lru->node[i].nr_items = 0;
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(list_lru_init);
+EXPORT_SYMBOL_GPL(list_lru_init_key);
 
 void list_lru_destroy(struct list_lru *lru)
 {
diff --git a/mm/truncate.c b/mm/truncate.c
index 0db9258319f0..e5cc39ab0751 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -25,6 +25,9 @@
 static void clear_exceptional_entry(struct address_space *mapping,
 				    pgoff_t index, void *entry)
 {
+	struct radix_tree_node *node;
+	void **slot;
+
 	/* Handled by shmem itself */
 	if (shmem_mapping(mapping))
 		return;
@@ -35,8 +38,27 @@ static void clear_exceptional_entry(struct address_space *mapping,
 	 * without the tree itself locked.  These unlocked entries
 	 * need verification under the tree lock.
 	 */
-	if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry)
-		mapping->nrshadows--;
+	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+	radix_tree_replace_slot(slot, NULL);
+	mapping->nrshadows--;
+	if (!node)
+		goto unlock;
+	workingset_node_shadows_dec(node);
+	/*
+	 * Don't track node without shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already untracked.
+	 * The list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_shadows(node) &&
+	    !list_empty(&node->private_list))
+		list_lru_del(&workingset_shadow_nodes, &node->private_list);
+	__radix_tree_delete_node(&mapping->page_tree, node);
+unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 843125a0e255..f3155d51acfd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -772,6 +772,7 @@ const char * const vmstat_text[] = {
 #endif
 	"workingset_refault",
 	"workingset_activate",
+	"workingset_nodereclaim",
 	"nr_anon_transparent_hugepages",
 	"nr_free_cma",
 	"nr_dirty_threshold",
diff --git a/mm/workingset.c b/mm/workingset.c
index 8a6c7cff4923..f7216fa7da27 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -251,3 +251,164 @@ void workingset_activation(struct page *page)
 {
 	atomic_long_inc(&page_zone(page)->inactive_age);
 }
+
+/*
+ * Shadow entries reflect the share of the working set that does not
+ * fit into memory, so their number depends on the access pattern of
+ * the workload.  In most cases, they will refault or get reclaimed
+ * along with the inode, but a (malicious) workload that streams
+ * through files with a total size several times that of available
+ * memory, while preventing the inodes from being reclaimed, can
+ * create excessive amounts of shadow nodes.  To keep a lid on this,
+ * track shadow nodes and reclaim them when they grow way past the
+ * point where they would still be useful.
+ */
+
+struct list_lru workingset_shadow_nodes;
+
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+					struct shrink_control *sc)
+{
+	unsigned long shadow_nodes;
+	unsigned long max_nodes;
+	unsigned long pages;
+
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+	local_irq_enable();
+
+	pages = node_present_pages(sc->nid);
+	/*
+	 * Active cache pages are limited to 50% of memory, and shadow
+	 * entries that represent a refault distance bigger than that
+	 * do not have any effect.  Limit the number of shadow nodes
+	 * such that shadow entries do not exceed the number of active
+	 * cache pages, assuming a worst-case node population density
+	 * of 1/8th on average.
+	 *
+	 * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+	 * each, this will reclaim shadow entries when they consume
+	 * ~2% of available memory:
+	 *
+	 * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+	 */
+	max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+
+	if (shadow_nodes <= max_nodes)
+		return 0;
+
+	return shadow_nodes - max_nodes;
+}
+
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+					  spinlock_t *lru_lock,
+					  void *arg)
+{
+	struct address_space *mapping;
+	struct radix_tree_node *node;
+	unsigned int i;
+	int ret;
+
+	/*
+	 * Page cache insertions and deletions synchroneously maintain
+	 * the shadow node LRU under the mapping->tree_lock and the
+	 * lru_lock.  Because the page cache tree is emptied before
+	 * the inode can be destroyed, holding the lru_lock pins any
+	 * address_space that has radix tree nodes on the LRU.
+	 *
+	 * We can then safely transition to the mapping->tree_lock to
+	 * pin only the address_space of the particular node we want
+	 * to reclaim, take the node off-LRU, and drop the lru_lock.
+	 */
+
+	node = container_of(item, struct radix_tree_node, private_list);
+	mapping = node->private_data;
+
+	/* Coming from the list, invert the lock order */
+	if (!spin_trylock(&mapping->tree_lock)) {
+		spin_unlock(lru_lock);
+		ret = LRU_RETRY;
+		goto out;
+	}
+
+	list_del_init(item);
+	spin_unlock(lru_lock);
+
+	/*
+	 * The nodes should only contain one or more shadow entries,
+	 * no pages, so we expect to be able to remove them all and
+	 * delete and free the empty node afterwards.
+	 */
+
+	BUG_ON(!node->count);
+	BUG_ON(node->count & RADIX_TREE_COUNT_MASK);
+
+	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+		if (node->slots[i]) {
+			BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+			node->slots[i] = NULL;
+			BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
+			node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+			BUG_ON(!mapping->nrshadows);
+			mapping->nrshadows--;
+		}
+	}
+	BUG_ON(node->count);
+	inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+	if (!__radix_tree_delete_node(&mapping->page_tree, node))
+		BUG();
+
+	spin_unlock(&mapping->tree_lock);
+	ret = LRU_REMOVED_RETRY;
+out:
+	local_irq_enable();
+	cond_resched();
+	local_irq_disable();
+	spin_lock(lru_lock);
+	return ret;
+}
+
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+				       struct shrink_control *sc)
+{
+	unsigned long ret;
+
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
+				 shadow_lru_isolate, NULL, &sc->nr_to_scan);
+	local_irq_enable();
+	return ret;
+}
+
+static struct shrinker workingset_shadow_shrinker = {
+	.count_objects = count_shadow_nodes,
+	.scan_objects = scan_shadow_nodes,
+	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
+};
+
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * mapping->tree_lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
+static int __init workingset_init(void)
+{
+	int ret;
+
+	ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+	if (ret)
+		goto err;
+	ret = register_shrinker(&workingset_shadow_shrinker);
+	if (ret)
+		goto err_list_lru;
+	return 0;
+err_list_lru:
+	list_lru_destroy(&workingset_shadow_nodes);
+err:
+	return ret;
+}
+module_init(workingset_init);
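
[Editorial note: to make the limit computed in count_shadow_nodes() above concrete, here is a minimal illustrative sketch of the same arithmetic, not kernel code, assuming RADIX_TREE_MAP_SHIFT of 6 (64 slots per radix tree node); the helper name is hypothetical.]

/*
 * max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3) = pages >> 4,
 * i.e. one tracked shadow node allowed per 16 pages of present memory.
 * With 64 slots per node and the assumed worst-case population density
 * of 1/8th, that caps useful shadow entries at
 * (pages / 16) * 64 / 8 = pages / 2, matching the 50% of memory that
 * active cache pages may occupy.
 */
static unsigned long max_shadow_nodes(unsigned long present_pages)
{
	return present_pages >> 4;	/* assumes RADIX_TREE_MAP_SHIFT == 6 */
}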