 include/linux/list_lru.h   |   8
 include/linux/mmzone.h     |   1
 include/linux/radix-tree.h |  32
 include/linux/swap.h       |  31
 lib/radix-tree.c           |  36
 mm/filemap.c               |  90
 mm/list_lru.c              |  16
 mm/truncate.c              |  26
 mm/vmstat.c                |   1
 mm/workingset.c            | 161
 10 files changed, 359 insertions(+), 43 deletions(-)
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 3ce541753c88..f3434533fbf8 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -13,6 +13,8 @@
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
 	LRU_REMOVED,		/* item removed from list */
+	LRU_REMOVED_RETRY,	/* item removed, but lock has been
+				   dropped and reacquired */
 	LRU_ROTATE,		/* item referenced, give another pass */
 	LRU_SKIP,		/* item cannot be locked, skip */
 	LRU_RETRY,		/* item not freeable. May drop the lock
@@ -32,7 +34,11 @@ struct list_lru {
 };
 
 void list_lru_destroy(struct list_lru *lru);
-int list_lru_init(struct list_lru *lru);
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
+static inline int list_lru_init(struct list_lru *lru)
+{
+	return list_lru_init_key(lru, NULL);
+}
 
 /**
  * list_lru_add: add an element to the lru list's tail
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f25db1d74a21..fac5509c18f0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -144,6 +144,7 @@ enum zone_stat_item {
 #endif
 	WORKINGSET_REFAULT,
 	WORKINGSET_ACTIVATE,
+	WORKINGSET_NODERECLAIM,
 	NR_ANON_TRANSPARENT_HUGEPAGES,
 	NR_FREE_CMA_PAGES,
 	NR_VM_ZONE_STAT_ITEMS };
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 13636c40bc42..33170dbd9db4 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 #define RADIX_TREE_TAG_LONGS	\
 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
+					  RADIX_TREE_MAP_SHIFT))
+
+/* Height component in node->path */
+#define RADIX_TREE_HEIGHT_SHIFT	(RADIX_TREE_MAX_PATH + 1)
+#define RADIX_TREE_HEIGHT_MASK	((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)
+
+/* Internally used bits of node->count */
+#define RADIX_TREE_COUNT_SHIFT	(RADIX_TREE_MAP_SHIFT + 1)
+#define RADIX_TREE_COUNT_MASK	((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
+
 struct radix_tree_node {
-	unsigned int	height;		/* Height from the bottom */
+	unsigned int	path;	/* Offset in parent & height from the bottom */
 	unsigned int	count;
 	union {
-		struct radix_tree_node *parent;	/* Used when ascending tree */
-		struct rcu_head	rcu_head;	/* Used when freeing node */
+		struct {
+			/* Used when ascending tree */
+			struct radix_tree_node *parent;
+			/* For tree user */
+			void *private_data;
+		};
+		/* Used when freeing node */
+		struct rcu_head	rcu_head;
 	};
+	/* For tree user */
+	struct list_head private_list;
 	void __rcu	*slots[RADIX_TREE_MAP_SIZE];
 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
-#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
-					  RADIX_TREE_MAP_SHIFT))
-
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
 	unsigned int		height;
@@ -251,7 +267,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 			  struct radix_tree_node **nodep, void ***slotp);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
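
For orientation, a minimal sketch of how the repacked node->path field is read; the helper names are illustrative only and not part of the patch. The low RADIX_TREE_HEIGHT_SHIFT bits hold the node's height from the bottom, and the bits above hold the node's offset in its parent's slots[] array, which is why __radix_tree_delete_node() no longer needs an index argument.

	/* Illustrative decoding of node->path (helpers not part of the patch) */
	static inline unsigned int node_height(struct radix_tree_node *node)
	{
		return node->path & RADIX_TREE_HEIGHT_MASK;	/* height from the bottom */
	}

	static inline unsigned int node_offset_in_parent(struct radix_tree_node *node)
	{
		return node->path >> RADIX_TREE_HEIGHT_SHIFT;	/* slot index in the parent */
	}
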
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b83cf61403ed..350711560753 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -264,6 +264,37 @@ struct swap_list_t {
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
+extern struct list_lru workingset_shadow_nodes;
+
+static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
+{
+	return node->count & RADIX_TREE_COUNT_MASK;
+}
+
+static inline void workingset_node_pages_inc(struct radix_tree_node *node)
+{
+	node->count++;
+}
+
+static inline void workingset_node_pages_dec(struct radix_tree_node *node)
+{
+	node->count--;
+}
+
+static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
+{
+	return node->count >> RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
+{
+	node->count += 1U << RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
+{
+	node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+}
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
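
As a quick usage sketch of the count packing introduced above (hypothetical values, not from the patch): with RADIX_TREE_MAP_SHIFT == 6, RADIX_TREE_COUNT_SHIFT is 7, so the low seven bits of node->count track resident page entries and the bits above track shadow entries.

	/* Hypothetical example: a node holding 3 pages and 2 shadow entries */
	static void count_packing_example(void)
	{
		struct radix_tree_node node = { 0 };

		node.count = (2U << RADIX_TREE_COUNT_SHIFT) | 3;
		BUG_ON(workingset_node_pages(&node) != 3);	/* bits below COUNT_SHIFT */
		BUG_ON(workingset_node_shadows(&node) != 2);	/* bits above COUNT_SHIFT */
	}
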
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index d60be40c111b..9599aa72d7a0 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -342,7 +342,8 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 
 		/* Increase the height.  */
 		newheight = root->height+1;
-		node->height = newheight;
+		BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
+		node->path = newheight;
 		node->count = 1;
 		node->parent = NULL;
 		slot = root->rnode;
@@ -400,11 +401,12 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 		/* Have to add a child node.  */
 		if (!(slot = radix_tree_node_alloc(root)))
 			return -ENOMEM;
-		slot->height = height;
+		slot->path = height;
 		slot->parent = node;
 		if (node) {
 			rcu_assign_pointer(node->slots[offset], slot);
 			node->count++;
+			slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
 		} else
 			rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
 	}
@@ -498,7 +500,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 	}
 	node = indirect_to_ptr(node);
 
-	height = node->height;
+	height = node->path & RADIX_TREE_HEIGHT_MASK;
 	if (index > radix_tree_maxindex(height))
 		return NULL;
 
@@ -704,7 +706,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 		return (index == 0);
 	node = indirect_to_ptr(node);
 
-	height = node->height;
+	height = node->path & RADIX_TREE_HEIGHT_MASK;
 	if (index > radix_tree_maxindex(height))
 		return 0;
 
@@ -741,7 +743,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 {
 	unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
 	struct radix_tree_node *rnode, *node;
-	unsigned long index, offset;
+	unsigned long index, offset, height;
 
 	if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
 		return NULL;
@@ -772,7 +774,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 		return NULL;
 
 restart:
-	shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT;
+	height = rnode->path & RADIX_TREE_HEIGHT_MASK;
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	offset = index >> shift;
 
 	/* Index outside of the tree */
@@ -1142,7 +1145,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
 	unsigned int shift, height;
 	unsigned long i;
 
-	height = slot->height;
+	height = slot->path & RADIX_TREE_HEIGHT_MASK;
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
 	for ( ; height > 1; height--) {
@@ -1205,7 +1208,8 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 		}
 
 		node = indirect_to_ptr(node);
-		max_index = radix_tree_maxindex(node->height);
+		max_index = radix_tree_maxindex(node->path &
+						RADIX_TREE_HEIGHT_MASK);
 		if (cur_index > max_index) {
 			rcu_read_unlock();
 			break;
@@ -1301,7 +1305,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
  *
  *	Returns %true if @node was freed, %false otherwise.
  */
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node)
 {
 	bool deleted = false;
@@ -1320,9 +1324,10 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
 
 		parent = node->parent;
 		if (parent) {
-			index >>= RADIX_TREE_MAP_SHIFT;
+			unsigned int offset;
 
-			parent->slots[index & RADIX_TREE_MAP_MASK] = NULL;
+			offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
+			parent->slots[offset] = NULL;
 			parent->count--;
 		} else {
 			root_tag_clear_all(root);
@@ -1386,7 +1391,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 	node->slots[offset] = NULL;
 	node->count--;
 
-	__radix_tree_delete_node(root, index, node);
+	__radix_tree_delete_node(root, node);
 
 	return entry;
 }
@@ -1419,9 +1424,12 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 EXPORT_SYMBOL(radix_tree_tagged);
 
 static void
-radix_tree_node_ctor(void *node)
+radix_tree_node_ctor(void *arg)
 {
-	memset(node, 0, sizeof(struct radix_tree_node));
+	struct radix_tree_node *node = arg;
+
+	memset(node, 0, sizeof(*node));
+	INIT_LIST_HEAD(&node->private_list);
 }
 
 static __init unsigned long __maxindex(unsigned int height)
diff --git a/mm/filemap.c b/mm/filemap.c
index a603c4d7d3c9..d6df3bacb0fb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -110,11 +110,17 @@
 static void page_cache_tree_delete(struct address_space *mapping,
 				   struct page *page, void *shadow)
 {
-	if (shadow) {
-		void **slot;
+	struct radix_tree_node *node;
+	unsigned long index;
+	unsigned int offset;
+	unsigned int tag;
+	void **slot;
 
-		slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-		radix_tree_replace_slot(slot, shadow);
+	VM_BUG_ON(!PageLocked(page));
+
+	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+	if (shadow) {
 		mapping->nrshadows++;
 		/*
 		 * Make sure the nrshadows update is committed before
@@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	 * same time and miss a shadow entry.
 	 */
 	smp_wmb();
-	} else
-		radix_tree_delete(&mapping->page_tree, page->index);
+	}
 	mapping->nrpages--;
+
+	if (!node) {
+		/* Clear direct pointer tags in root node */
+		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+		radix_tree_replace_slot(slot, shadow);
+		return;
+	}
+
+	/* Clear tree tags for the removed page */
+	index = page->index;
+	offset = index & RADIX_TREE_MAP_MASK;
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		if (test_bit(offset, node->tags[tag]))
+			radix_tree_tag_clear(&mapping->page_tree, index, tag);
+	}
+
+	/* Delete page, swap shadow entry */
+	radix_tree_replace_slot(slot, shadow);
+	workingset_node_pages_dec(node);
+	if (shadow)
+		workingset_node_shadows_inc(node);
+	else
+		if (__radix_tree_delete_node(&mapping->page_tree, node))
+			return;
+
+	/*
+	 * Track node that only contains shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already tracked. The
+	 * list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_pages(node) &&
+	    list_empty(&node->private_list)) {
+		node->private_data = mapping;
+		list_lru_add(&workingset_shadow_nodes, &node->private_list);
+	}
 }
 
 /*
@@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
 static int page_cache_tree_insert(struct address_space *mapping,
 				  struct page *page, void **shadowp)
 {
+	struct radix_tree_node *node;
 	void **slot;
 	int error;
 
-	slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-	if (slot) {
+	error = __radix_tree_create(&mapping->page_tree, page->index,
+				    &node, &slot);
+	if (error)
+		return error;
+	if (*slot) {
 		void *p;
 
 		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
-		radix_tree_replace_slot(slot, page);
-		mapping->nrshadows--;
-		mapping->nrpages++;
 		if (shadowp)
 			*shadowp = p;
-		return 0;
+		mapping->nrshadows--;
+		if (node)
+			workingset_node_shadows_dec(node);
 	}
-	error = radix_tree_insert(&mapping->page_tree, page->index, page);
-	if (!error)
-		mapping->nrpages++;
-	return error;
+	radix_tree_replace_slot(slot, page);
+	mapping->nrpages++;
+	if (node) {
+		workingset_node_pages_inc(node);
+		/*
+		 * Don't track node that contains actual pages.
+		 *
+		 * Avoid acquiring the list_lru lock if already
+		 * untracked. The list_empty() test is safe as
+		 * node->private_list is protected by
+		 * mapping->tree_lock.
+		 */
+		if (!list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+				     &node->private_list);
+	}
+	return 0;
 }
 
 static int __add_to_page_cache_locked(struct page *page,
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 72f9decb0104..f1a0db194173 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -87,11 +87,20 @@ restart:
 
 		ret = isolate(item, &nlru->lock, cb_arg);
 		switch (ret) {
+		case LRU_REMOVED_RETRY:
+			assert_spin_locked(&nlru->lock);
 		case LRU_REMOVED:
 			if (--nlru->nr_items == 0)
 				node_clear(nid, lru->active_nodes);
 			WARN_ON_ONCE(nlru->nr_items < 0);
 			isolated++;
+			/*
+			 * If the lru lock has been dropped, our list
+			 * traversal is now invalid and so we have to
+			 * restart from scratch.
+			 */
+			if (ret == LRU_REMOVED_RETRY)
+				goto restart;
 			break;
 		case LRU_ROTATE:
 			list_move_tail(item, &nlru->list);
@@ -103,6 +112,7 @@ restart:
 			 * The lru lock has been dropped, our list traversal is
 			 * now invalid and so we have to restart from scratch.
 			 */
+			assert_spin_locked(&nlru->lock);
 			goto restart;
 		default:
 			BUG();
@@ -114,7 +124,7 @@ restart:
 }
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
 
-int list_lru_init(struct list_lru *lru)
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
 {
 	int i;
 	size_t size = sizeof(*lru->node) * nr_node_ids;
@@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru)
 	nodes_clear(lru->active_nodes);
 	for (i = 0; i < nr_node_ids; i++) {
 		spin_lock_init(&lru->node[i].lock);
+		if (key)
+			lockdep_set_class(&lru->node[i].lock, key);
 		INIT_LIST_HEAD(&lru->node[i].list);
 		lru->node[i].nr_items = 0;
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(list_lru_init);
+EXPORT_SYMBOL_GPL(list_lru_init_key);
 
 void list_lru_destroy(struct list_lru *lru)
 {
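
For context, a minimal sketch of an isolate callback that uses the new LRU_REMOVED_RETRY status (hypothetical callback, not part of the patch): the callback may drop the lru lock to do its work, but must return with the lock reacquired so the assert_spin_locked() checks above hold, and list_lru_walk_node() then restarts its traversal.

	/* Hypothetical isolate callback illustrating LRU_REMOVED_RETRY */
	static enum lru_status example_isolate(struct list_head *item,
					       spinlock_t *lru_lock, void *arg)
	{
		list_del_init(item);		/* unlink the object from the LRU */
		spin_unlock(lru_lock);		/* drop the lock for expensive work */

		/* ... reclaim or free the object here ... */

		spin_lock(lru_lock);		/* must hold the lock again on return */
		return LRU_REMOVED_RETRY;	/* removed; walker restarts traversal */
	}
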
diff --git a/mm/truncate.c b/mm/truncate.c
index 0db9258319f0..e5cc39ab0751 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -25,6 +25,9 @@
 static void clear_exceptional_entry(struct address_space *mapping,
 				    pgoff_t index, void *entry)
 {
+	struct radix_tree_node *node;
+	void **slot;
+
 	/* Handled by shmem itself */
 	if (shmem_mapping(mapping))
 		return;
@@ -35,8 +38,27 @@ static void clear_exceptional_entry(struct address_space *mapping,
 	 * without the tree itself locked. These unlocked entries
 	 * need verification under the tree lock.
 	 */
-	if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry)
-		mapping->nrshadows--;
+	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+	radix_tree_replace_slot(slot, NULL);
+	mapping->nrshadows--;
+	if (!node)
+		goto unlock;
+	workingset_node_shadows_dec(node);
+	/*
+	 * Don't track node without shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already untracked.
+	 * The list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_shadows(node) &&
+	    !list_empty(&node->private_list))
+		list_lru_del(&workingset_shadow_nodes, &node->private_list);
+	__radix_tree_delete_node(&mapping->page_tree, node);
+unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
 
42 64
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 843125a0e255..f3155d51acfd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -772,6 +772,7 @@ const char * const vmstat_text[] = {
 #endif
 	"workingset_refault",
 	"workingset_activate",
+	"workingset_nodereclaim",
 	"nr_anon_transparent_hugepages",
 	"nr_free_cma",
 	"nr_dirty_threshold",
diff --git a/mm/workingset.c b/mm/workingset.c
index 8a6c7cff4923..f7216fa7da27 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -251,3 +251,164 @@ void workingset_activation(struct page *page)
 {
 	atomic_long_inc(&page_zone(page)->inactive_age);
 }
+
+/*
+ * Shadow entries reflect the share of the working set that does not
+ * fit into memory, so their number depends on the access pattern of
+ * the workload. In most cases, they will refault or get reclaimed
+ * along with the inode, but a (malicious) workload that streams
+ * through files with a total size several times that of available
+ * memory, while preventing the inodes from being reclaimed, can
+ * create excessive amounts of shadow nodes. To keep a lid on this,
+ * track shadow nodes and reclaim them when they grow way past the
+ * point where they would still be useful.
+ */
+
+struct list_lru workingset_shadow_nodes;
+
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+					struct shrink_control *sc)
+{
+	unsigned long shadow_nodes;
+	unsigned long max_nodes;
+	unsigned long pages;
+
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+	local_irq_enable();
+
+	pages = node_present_pages(sc->nid);
+	/*
+	 * Active cache pages are limited to 50% of memory, and shadow
+	 * entries that represent a refault distance bigger than that
+	 * do not have any effect. Limit the number of shadow nodes
+	 * such that shadow entries do not exceed the number of active
+	 * cache pages, assuming a worst-case node population density
+	 * of 1/8th on average.
+	 *
+	 * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+	 * each, this will reclaim shadow entries when they consume
+	 * ~2% of available memory:
+	 *
+	 * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+	 */
+	max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+
+	if (shadow_nodes <= max_nodes)
+		return 0;
+
+	return shadow_nodes - max_nodes;
+}
+
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+					  spinlock_t *lru_lock,
+					  void *arg)
+{
+	struct address_space *mapping;
+	struct radix_tree_node *node;
+	unsigned int i;
+	int ret;
+
+	/*
+	 * Page cache insertions and deletions synchronously maintain
+	 * the shadow node LRU under the mapping->tree_lock and the
+	 * lru_lock. Because the page cache tree is emptied before
+	 * the inode can be destroyed, holding the lru_lock pins any
+	 * address_space that has radix tree nodes on the LRU.
+	 *
+	 * We can then safely transition to the mapping->tree_lock to
+	 * pin only the address_space of the particular node we want
+	 * to reclaim, take the node off-LRU, and drop the lru_lock.
+	 */
+
+	node = container_of(item, struct radix_tree_node, private_list);
+	mapping = node->private_data;
+
+	/* Coming from the list, invert the lock order */
+	if (!spin_trylock(&mapping->tree_lock)) {
+		spin_unlock(lru_lock);
+		ret = LRU_RETRY;
+		goto out;
+	}
+
+	list_del_init(item);
+	spin_unlock(lru_lock);
+
+	/*
+	 * The nodes should only contain one or more shadow entries,
+	 * no pages, so we expect to be able to remove them all and
+	 * delete and free the empty node afterwards.
+	 */
+
+	BUG_ON(!node->count);
+	BUG_ON(node->count & RADIX_TREE_COUNT_MASK);
+
+	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+		if (node->slots[i]) {
+			BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+			node->slots[i] = NULL;
+			BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
+			node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+			BUG_ON(!mapping->nrshadows);
+			mapping->nrshadows--;
+		}
+	}
+	BUG_ON(node->count);
+	inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+	if (!__radix_tree_delete_node(&mapping->page_tree, node))
+		BUG();
+
+	spin_unlock(&mapping->tree_lock);
+	ret = LRU_REMOVED_RETRY;
+out:
+	local_irq_enable();
+	cond_resched();
+	local_irq_disable();
+	spin_lock(lru_lock);
+	return ret;
+}
+
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+				       struct shrink_control *sc)
+{
+	unsigned long ret;
+
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
+				 shadow_lru_isolate, NULL, &sc->nr_to_scan);
+	local_irq_enable();
+	return ret;
+}
+
+static struct shrinker workingset_shadow_shrinker = {
+	.count_objects = count_shadow_nodes,
+	.scan_objects = scan_shadow_nodes,
+	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
+};
+
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * mapping->tree_lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
+static int __init workingset_init(void)
+{
+	int ret;
+
+	ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+	if (ret)
+		goto err;
+	ret = register_shrinker(&workingset_shadow_shrinker);
+	if (ret)
+		goto err_list_lru;
+	return 0;
+err_list_lru:
+	list_lru_destroy(&workingset_shadow_nodes);
+err:
+	return ret;
+}
+module_init(workingset_init);
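
For concreteness, a worked instance of the max_nodes cap computed in count_shadow_nodes(), using illustrative numbers that are not part of the patch (RADIX_TREE_MAP_SHIFT == 6, a node with 1,048,576 pages):

	shadow entries allowed = pages / 2       = 524288    (active cache capped at 50%)
	entries per node       = (1 << 6) / 8    = 8         (1/8 worst-case population density)
	max_nodes              = 524288 / 8      = 65536     = pages >> (1 + 6 - 3)

Only when the number of tracked shadow nodes exceeds this cap does count_shadow_nodes() report the excess for the shrinker to reclaim via scan_shadow_nodes().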