path: root/mm/filemap.c
author:    Johannes Weiner <hannes@cmpxchg.org>  2014-04-03 17:47:56 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org>  2014-04-03 19:21:01 -0400
commit:    449dd6984d0e47643c04c807f609dd56d48d5bcc (patch)
tree:      69f4a0a90290b048e63effc617c8ec907e8d6696 /mm/filemap.c
parent:    139e561660fe11e0fc35e142a800df3dd7d03e9d (diff)
mm: keep page cache radix tree nodes in check
Previously, page cache radix tree nodes were freed after reclaim emptied out their page pointers. But now reclaim stores shadow entries in their place, which are only reclaimed when the inodes themselves are reclaimed. This is problematic for bigger files that are still in use after they have a significant amount of their cache reclaimed, without any of those pages actually refaulting. The shadow entries will just sit there and waste memory. In the worst case, the shadow entries will accumulate until the machine runs out of memory.

To get this under control, the VM will track radix tree nodes exclusively containing shadow entries on a per-NUMA node list. Per-NUMA rather than global because we expect the radix tree nodes themselves to be allocated node-locally and we want to reduce cross-node references of otherwise independent cache workloads. A simple shrinker will then reclaim these nodes on memory pressure.

A few things need to be stored in the radix tree node to implement the shadow node LRU and allow tree deletions coming from the list:

1. There is no index available that would describe the reverse path from the node up to the tree root, which is needed to perform a deletion. To solve this, encode in each node its offset inside the parent. This can be stored in the unused upper bits of the same member that stores the node's height at no extra space cost.

2. The number of shadow entries needs to be counted in addition to the regular entries, to quickly detect when the node is ready to go to the shadow node LRU list. The current entry count is an unsigned int but the maximum number of entries is 64, so a shadow counter can easily be stored in the unused upper bits.

3. Tree modification needs the tree lock and tree root, which are located in the address space, so store an address_space backpointer in the node. The parent pointer of the node is in a union with the 2-word rcu_head, so the backpointer comes at no extra cost as well.

4. The node needs to be linked to an LRU list, which requires a list head inside the node. This does increase the size of the node, but it does not change the number of objects that fit into a slab page.

[akpm@linux-foundation.org: export the right function]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Metin Doslu <metin@citusdata.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ozgun Erdogan <ozgun@citusdata.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin <klamm@yandex-team.ru>
Cc: Ryan Mallon <rmallon@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
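[Editor's note] To make points 1 through 4 above concrete, here is a sketch of the resulting radix_tree_node layout, adapted from the radix tree side of this patch series (include/linux/radix-tree.h). Treat it as an illustration of the packing scheme rather than the canonical definition:

/*
 * Sketch of the radix_tree_node bookkeeping described in points 1-4
 * of the commit message. Constant values follow the upstream tree
 * but are reproduced here from memory.
 */
#include <linux/list.h>
#include <linux/rcupdate.h>

#define RADIX_TREE_MAP_SHIFT	6	/* 64 slots per node */
#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE - 1)
#define RADIX_TREE_MAX_TAGS	3
#define RADIX_TREE_TAG_LONGS	\
	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
/* 64 entries need a 7-bit counter, hence the +1 */
#define RADIX_TREE_COUNT_SHIFT	(RADIX_TREE_MAP_SHIFT + 1)
#define RADIX_TREE_COUNT_MASK	((1UL << RADIX_TREE_COUNT_SHIFT) - 1)

struct radix_tree_node {
	unsigned int	path;	/* point 1: offset in parent packed into the
				 * unused upper bits above the node's height */
	unsigned int	count;	/* point 2: shadow count packed into the
				 * unused upper bits above the page count */
	union {
		struct {
			struct radix_tree_node *parent;	/* used while in a tree */
			void *private_data;	/* point 3: address_space backpointer */
		};
		struct rcu_head	rcu_head;	/* used while the node is freed */
	};
	struct list_head private_list;	/* point 4: shadow node LRU linkage */
	void __rcu	*slots[RADIX_TREE_MAP_SIZE];
	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};

A node pulled off the shadow LRU can thus reach its tree lock and tree root through private_data, and locate its slot in the parent through the offset bits of path, which is exactly what a list-driven deletion needs.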
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--  mm/filemap.c  90
1 file changed, 74 insertions(+), 16 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index a603c4d7d3c9..d6df3bacb0fb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -110,11 +110,17 @@
 static void page_cache_tree_delete(struct address_space *mapping,
 				   struct page *page, void *shadow)
 {
-	if (shadow) {
-		void **slot;
+	struct radix_tree_node *node;
+	unsigned long index;
+	unsigned int offset;
+	unsigned int tag;
+	void **slot;
 
-		slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-		radix_tree_replace_slot(slot, shadow);
+	VM_BUG_ON(!PageLocked(page));
+
+	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+	if (shadow) {
 		mapping->nrshadows++;
 		/*
 		 * Make sure the nrshadows update is committed before
@@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping,
 		 * same time and miss a shadow entry.
 		 */
 		smp_wmb();
-	} else
-		radix_tree_delete(&mapping->page_tree, page->index);
+	}
 	mapping->nrpages--;
+
+	if (!node) {
+		/* Clear direct pointer tags in root node */
+		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+		radix_tree_replace_slot(slot, shadow);
+		return;
+	}
+
+	/* Clear tree tags for the removed page */
+	index = page->index;
+	offset = index & RADIX_TREE_MAP_MASK;
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		if (test_bit(offset, node->tags[tag]))
+			radix_tree_tag_clear(&mapping->page_tree, index, tag);
+	}
+
+	/* Delete page, swap shadow entry */
+	radix_tree_replace_slot(slot, shadow);
+	workingset_node_pages_dec(node);
+	if (shadow)
+		workingset_node_shadows_inc(node);
+	else
+		if (__radix_tree_delete_node(&mapping->page_tree, node))
+			return;
+
+	/*
+	 * Track node that only contains shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already tracked.  The
+	 * list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_pages(node) &&
+	    list_empty(&node->private_list)) {
+		node->private_data = mapping;
+		list_lru_add(&workingset_shadow_nodes, &node->private_list);
+	}
 }
 
 /*
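[Editor's note] The workingset_node_*() calls in the hunk above have no definition in mm/filemap.c; they are introduced by the same patch in include/linux/swap.h. Roughly, and reproduced here from memory as a sketch, they maintain the two counters packed into node->count:

static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
{
	return node->count & RADIX_TREE_COUNT_MASK;	/* lower bits: pages */
}

static inline void workingset_node_pages_inc(struct radix_tree_node *node)
{
	node->count++;
}

static inline void workingset_node_pages_dec(struct radix_tree_node *node)
{
	node->count--;
}

static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
{
	return node->count >> RADIX_TREE_COUNT_SHIFT;	/* upper bits: shadows */
}

static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
{
	node->count += 1U << RADIX_TREE_COUNT_SHIFT;
}

static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
{
	node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
}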
@@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
 static int page_cache_tree_insert(struct address_space *mapping,
 				  struct page *page, void **shadowp)
 {
+	struct radix_tree_node *node;
 	void **slot;
 	int error;
 
-	slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-	if (slot) {
+	error = __radix_tree_create(&mapping->page_tree, page->index,
+				    &node, &slot);
+	if (error)
+		return error;
+	if (*slot) {
 		void *p;
 
 		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
-		radix_tree_replace_slot(slot, page);
-		mapping->nrshadows--;
-		mapping->nrpages++;
 		if (shadowp)
 			*shadowp = p;
-		return 0;
+		mapping->nrshadows--;
+		if (node)
+			workingset_node_shadows_dec(node);
 	}
-	error = radix_tree_insert(&mapping->page_tree, page->index, page);
-	if (!error)
-		mapping->nrpages++;
-	return error;
+	radix_tree_replace_slot(slot, page);
+	mapping->nrpages++;
+	if (node) {
+		workingset_node_pages_inc(node);
+		/*
+		 * Don't track node that contains actual pages.
+		 *
+		 * Avoid acquiring the list_lru lock if already
+		 * untracked.  The list_empty() test is safe as
+		 * node->private_list is protected by
+		 * mapping->tree_lock.
+		 */
+		if (!list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+				     &node->private_list);
+	}
+	return 0;
 }
 
 static int __add_to_page_cache_locked(struct page *page,
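[Editor's note] For context, the other end of workingset_shadow_nodes is the "simple shrinker" the commit message promises, added by the same patch in mm/workingset.c. The outline below is a simplified sketch from memory: the real count_objects also caps the number of retained nodes in proportion to memory size, the real scan path disables IRQs around the walk because the list_lru lock nests inside the IRQ-safe mapping->tree_lock, and shadow_lru_isolate (only declared here) is the per-item callback that performs the actual tree deletion:

struct list_lru workingset_shadow_nodes;

/* Per-item callback, defined in the same file; deletes the now
 * page-free node from its tree using the backpointers shown above. */
static enum lru_status shadow_lru_isolate(struct list_head *item,
					  spinlock_t *lru_lock, void *arg);

static unsigned long count_shadow_nodes(struct shrinker *shrinker,
					struct shrink_control *sc)
{
	/* One shrinkable object per tracked node on this NUMA node. */
	return list_lru_count_node(&workingset_shadow_nodes, sc->nid);
}

static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
				       struct shrink_control *sc)
{
	/* Walk the per-NUMA-node list of shadow-only nodes. */
	return list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
				  shadow_lru_isolate, NULL, &sc->nr_to_scan);
}

static struct shrinker workingset_shadow_shrinker = {
	.count_objects	= count_shadow_nodes,
	.scan_objects	= scan_shadow_nodes,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_NUMA_AWARE,	/* per-NUMA lists, per the message */
};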